----------------------------------------------------------------------------- -- | -- Module : Text.Hyphenation.Pattern -- Copyright : (C) 2012 Edward Kmett, -- License : BSD-style (see the file LICENSE) -- -- Maintainer : Edward Kmett -- Stability : provisional -- Portability : portable -- ---------------------------------------------------------------------------- module Text.Hyphenation.Pattern ( -- * Pattern file support Patterns , insertPattern , lookupPattern , scorePattern , parsePatterns ) where import qualified Data.IntMap as IM import Data.Monoid import Prelude hiding (lookup) import Data.Char (digitToInt, isDigit) -- | Hyphenation patterns data Patterns = Patterns [Int] (IM.IntMap Patterns) deriving Show instance Monoid Patterns where mempty = Patterns [] IM.empty Patterns ps m `mappend` Patterns qs n = Patterns (zipMax ps qs) (IM.unionWith mappend m n) -- | Tallies the hyphenation scores for a word considering all tails. lookupPattern :: String -> Patterns -> [Int] lookupPattern xs0 = init . tail . go ('.' : xs0 ++ ".") where go [] (Patterns ys _) = ys go xxs@(_:xs) t = zipMax (go1 xxs t) (0:go xs t) go1 [] (Patterns ys _) = ys go1 (x:xs) (Patterns ys m) = case IM.lookup (fromEnum x) m of Just t' -> zipMax ys (go1 xs t') Nothing -> ys -- | Insert a Knuth-Liang hyphenation pattern into the trie -- -- 1. @.@ denotes the start or end of the input -- -- 2. @0-9@ are used to denote hyphenation or dehyphenation depending on whether or not they are even (no hyphen) or odd (hyphen allowed). -- -- Patterns are overlaid and the maximum value at each location is used. -- this allows you to implement a finite number of precedences between hyphenation rules -- -- (e.g. @3foo.@ indicates that the suffix '-foo' should be hyphenated with precedence 3.) insertPattern :: String -> Patterns -> Patterns insertPattern s0 = go (chars s0) where pts = scorePattern s0 go [] (Patterns _ m) = Patterns pts m go (x:xs) (Patterns n m) = Patterns n (IM.insertWith (\_ -> go xs) (fromEnum x) (mk xs) m) mk [] = Patterns pts IM.empty mk (x:xs) = Patterns [] (IM.singleton (fromEnum x) (mk xs)) -- | Parse one pattern per line from an input string -- -- @hyph-utf8@ supplies these files UTF-8 encoded in the @txt@ folder with a @.pat.txt@ extension parsePatterns :: String -> Patterns parsePatterns = foldr insertPattern mempty . lines chars :: String -> String chars = filter (\x -> (x < '0' || x > '9')) -- | Convert a Pattern string to a Score scorePattern :: String -> [Int] scorePattern [] = [0] scorePattern (x:ys) | isDigit x = digitToInt x : if null ys then [] else scorePattern (tail ys) | otherwise = 0 : scorePattern ys -- | Zip two scores. zipMax :: [Int] -> [Int] -> [Int] zipMax (x:xs) (y:ys) = max x y : zipMax xs ys zipMax [] ys = ys zipMax xs [] = xs