{-# LANGUAGE TypeFamilies, OverloadedStrings #-} module NLP.Probability.Example.Trigram ( -- * Source for a trigram language modeling ) where import qualified Data.Text as T import qualified Data.Map as M import Data.Monoid import NLP.Probability.ConditionalDistribution import NLP.Probability.Observation newtype Word = Word T.Text deriving (Ord, Eq) newtype TrigramContext = Trigram (Word, Word) instance Event Word where type EventMap Word = M.Map instance Context TrigramContext where type Sub TrigramContext = Word type SubMap TrigramContext = M.Map decompose (Trigram (w1, w2)) = [w1, w2] makeTrigrams :: T.Text -> CondObserved Word TrigramContext makeTrigrams sentence = mconcat $ map (uncurry condObservation) $ take3 $ map Word words where words = ["*S1*", "*S2*"] ++ (T.split " " sentence) ++ ["*E1*", "*E2*"] take3 [_,_] = [] take3 (a:b:c:rest) = (c, Trigram (a, b)):(take3 (b:c:rest)) languageModel :: String -> CondDistribution Word TrigramContext languageModel sentences = mkDist $ estimateGeneralLinear (wittenBell 5) $ -- (simpleLinear [0.7, 0.3, 0.0]) $ mconcat $ map makeTrigrams $ T.split "." $ T.pack sentences prob lm (w1, w2, w3) = lm (Trigram (Word $ T.pack w1, Word $ T.pack w2)) $ Word $ T.pack w3