module NLP.POS.UnambiguousTagger where
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack)
import Data.List (foldl')
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Serialize (encode, decode)
import Data.Text (Text)
import NLP.Tokenize.Chatter (tokenize)
import NLP.Types
import qualified NLP.POS.LiteralTagger as LT
-- | Identifier for this tagger implementation. 'mkTagger' stores it in
-- the 'posID' field of every tagger it builds.
taggerID :: ByteString
taggerID = pack moduleName
  where
    -- The fully qualified name of this module doubles as the identifier.
    moduleName = "NLP.POS.UnambiguousTagger"
-- | Rebuild a tagger from its serialized form.
--
-- The bytes are decoded into the token-to-tag table, which is then
-- handed to 'mkTagger' together with the caller-supplied backoff tagger.
-- A decoding failure is returned as 'Left' with the decoder's message.
readTagger :: Tag t => ByteString -> Maybe (POSTagger t) -> Either String (POSTagger t)
readTagger bs backoff = fmap fromTable (decode bs)
  where
    -- Wrap a decoded table in a tagger that uses the given backoff.
    fromTable table = mkTagger table backoff
-- | Build an unambiguous tagger from a token-to-tag table and an
-- optional backoff tagger.
--
-- The result starts out as the literal tagger produced by
-- 'LT.mkTagger' in 'LT.Sensitive' mode, with four fields overridden:
--
-- * 'posTrainer' folds new examples into the table with 'train' and
--   rebuilds the tagger around the updated table, keeping the same
--   backoff tagger.
-- * 'posSerialize' encodes the table only (the backoff is not included).
-- * 'posID' is 'taggerID'.
-- * 'posTokenizer' is 'tokenize'.
mkTagger :: Tag t => Map Text t -> Maybe (POSTagger t) -> POSTagger t
mkTagger table mTgr =
  baseTagger { posTrainer = retrain
             , posSerialize = encode table
             , posID = taggerID
             , posTokenizer = tokenize
             }
  where
    -- Literal tagger that supplies every field not overridden above.
    baseTagger = LT.mkTagger table LT.Sensitive mTgr

    -- Produce a fresh tagger whose table also reflects the new examples.
    retrain examples = return (mkTagger (train table examples) mTgr)
-- | Fold tagged training sentences into a token-to-tag table.
--
-- Every (tag, token) pair of every sentence is merged into the table:
--
-- * a token not yet in the table is recorded with the observed tag;
-- * a token already recorded with the same tag is left unchanged;
-- * a token already recorded with a different tag is set to 'tagUNK'.
--
-- The existing table is the starting point, so its entries take part in
-- the same comparison as the new examples.
train :: Tag t => Map Text t -> [TaggedSentence t] -> Map Text t
train table exs = foldl' trainOnPair table pairs
  where
    -- All tagged tokens from all sentences, in order.
    pairs = concatMap unTS exs

    -- Merge one tagged token into the accumulated table.
    trainOnPair acc (POS tag (Token txt)) = Map.alter (incorporate tag) txt acc

    -- Decide the table entry for a token given the newly observed tag
    -- and the entry (if any) that is already there.
    incorporate new Nothing = Just new
    incorporate new (Just old)
      | new == old = Just old
      | otherwise  = Just tagUNK