{-# LANGUAGE TypeFamilies, OverloadedStrings #-}
module NLP.Probability.Example.Trigram (
   -- * Source for a trigram language modeling

) where
import qualified Data.Text as T  
import qualified Data.Map as M 
import Data.Monoid 
import NLP.Probability.ConditionalDistribution
import NLP.Probability.Observation

newtype Word = Word T.Text 
    deriving (Ord, Eq)

newtype TrigramContext = Trigram (Word, Word)

instance Event Word where type EventMap Word = M.Map

instance Context TrigramContext where
    type Sub TrigramContext = Word 
    type SubMap TrigramContext = M.Map 
    decompose (Trigram (w1, w2)) = [w1, w2] 
       
makeTrigrams :: T.Text -> CondObserved Word TrigramContext
makeTrigrams sentence = 
    mconcat $ map (uncurry condObservation) $ take3 $ map Word words 
    where words =  ["*S1*", "*S2*"] ++  (T.split " " sentence) ++  ["*E1*", "*E2*"] 
          take3 [_,_] = []
          take3 (a:b:c:rest) = (c, Trigram (a, b)):(take3 (b:c:rest))  

languageModel :: String -> CondDistribution Word TrigramContext
languageModel sentences = 
    mkDist $ estimateGeneralLinear (wittenBell 5) $ -- (simpleLinear [0.7, 0.3, 0.0]) $ 
    mconcat $ map makeTrigrams $ T.split "." $ T.pack sentences

prob lm (w1, w2, w3) =
    lm (Trigram (Word $ T.pack w1, Word $ T.pack w2)) $ Word $ T.pack w3