-- | This POS tagger deterministically tags tokens.  However, if it
-- ever sees multiple tags for the same token, it will forget the tag
-- it has learned.  This is useful for creating taggers that have very
-- high precision, but very low recall.
--
-- Unambiguous taggers are also useful when defined with a
-- non-deterministic backoff tagger, such as an
-- "NLP.POS.AveragedPerceptronTagger", since the high-confidence tags
-- will be applied first, followed by the more non-deterministic
-- results of the backoff tagger.
module NLP.POS.UnambiguousTagger where

import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Serialize (encode, decode)
import Data.Text (Text)

import NLP.Tokenize.Chatter (tokenize)
import NLP.Types

import qualified NLP.POS.LiteralTagger as LT

taggerID :: ByteString
taggerID = pack "NLP.POS.UnambiguousTagger"

readTagger :: Tag t => ByteString -> Maybe (POSTagger t) -> Either String (POSTagger t)
readTagger bs backoff = do
  model <- decode bs
  return $ mkTagger model backoff

-- | Create an unambiguous tagger, using the supplied 'Map' as a
-- source of tags.
mkTagger :: Tag t => Map Text t -> Maybe (POSTagger t) -> POSTagger t
mkTagger table mTgr = let
  litTagger = LT.mkTagger table LT.Sensitive mTgr

--  trainer :: Tag t => [TaggedSentence t] -> IO (POSTagger t)
  trainer exs = do
    let newTable = train table exs
    return $ mkTagger newTable mTgr

  in litTagger { posTrainer = trainer
               , posSerialize = encode table
               , posID = taggerID
               , posTokenizer = tokenize
               }

-- | Trainer method for unambiguous taggers.
train :: Tag t => Map Text t -> [TaggedSentence t] -> Map Text t
train table exs = let

--  pairs :: POS t
  pairs = concatMap unTS exs

--  trainOnPair :: Map Text t -> POS t -> Map Text t
  trainOnPair t (POS tag (Token txt)) = Map.alter (incorporate tag) txt t

--  incorporate :: t -> Maybe t -> Maybe t
  incorporate new Nothing                 = Just new
  incorporate new (Just old) | new == old = Just old
                             | otherwise  = Just tagUNK -- Forget the tag.

  in foldl trainOnPair table pairs