module NLP.POS.UnambiguousTagger where
import Data.List (foldl')

import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Serialize (encode, decode)
import Data.Text (Text)

import NLP.Tokenize.Chatter (tokenize)
import NLP.Types
import qualified NLP.POS.LiteralTagger as LT
-- | Identifier for this tagger implementation; 'mkTagger' stores it in the
-- 'posID' field of every tagger it builds.
taggerID :: ByteString
taggerID = pack "NLP.POS.UnambiguousTagger"
-- | Rebuild an unambiguous tagger from its serialized form.
--
-- The bytes are run through 'decode' to recover the word-to-tag table, which
-- is then passed to 'mkTagger' along with the optional backoff tagger.  If
-- decoding fails, the decoder's error message is returned in 'Left'.
readTagger :: Tag t => ByteString -> Maybe (POSTagger t) -> Either String (POSTagger t)
readTagger bs backoff = fmap build (decode bs)
  where
    -- Wrap a decoded table in a tagger that shares the caller's backoff.
    build model = mkTagger model backoff
-- | Build a tagger from a table mapping each word to its single tag.
--
-- The result starts from a literal tagger ('LT.mkTagger' with 'LT.Sensitive'
-- matching and the supplied backoff tagger) and overrides four fields:
--
-- * 'posTrainer' folds new examples into the table with 'train' and builds a
--   fresh tagger around the updated table, keeping the same backoff;
-- * 'posSerialize' is the encoded table;
-- * 'posID' is 'taggerID';
-- * 'posTokenizer' is the Chatter 'tokenize' function.
mkTagger :: Tag t => Map Text t -> Maybe (POSTagger t) -> POSTagger t
mkTagger table mTgr =
  base
    { posTrainer   = retrain
    , posSerialize = encode table
    , posID        = taggerID
    , posTokenizer = tokenize
    }
  where
    -- Underlying literal-lookup tagger that supplies every field not
    -- overridden above.
    base = LT.mkTagger table LT.Sensitive mTgr

    -- Training never mutates this tagger; it yields a new one built from the
    -- extended table.
    retrain exs = return (mkTagger (train table exs) mTgr)
-- | Extend a word-to-tag table with the observations in a training corpus.
--
-- Every tagged token of every sentence is visited in order:
--
-- * a word not yet in the table is recorded with the observed tag;
-- * a word already recorded with the same tag is left unchanged;
-- * a word recorded with a different tag is overwritten with 'tagUNK',
--   marking it as ambiguous.
--
-- Entries already present in the incoming table take part in the comparison
-- exactly like earlier observations from the corpus.
--
-- The fold is strict ('foldl''), so the table is updated token by token
-- instead of accumulating one deferred update per token in the corpus.
train :: Tag t => Map Text t -> [TaggedSentence t] -> Map Text t
train table exs = foldl' trainOnPair table (concatMap unTS exs)
  where
    -- Merge a single tagged token into the accumulated table.
    trainOnPair acc (POS tag (Token txt)) = Map.alter (incorporate tag) txt acc

    -- Decide the stored tag for a word given a new observation and whatever
    -- was stored before.
    incorporate new Nothing = Just new
    incorporate new (Just old)
      | new == old = Just old
      | otherwise  = Just tagUNK