module NLP.POS.LiteralTagger
( tag
, tagSentence
, mkTagger
, taggerID
, readTagger
)
where
import Data.ByteString (ByteString)
import Data.ByteString.Char8 (pack)
import qualified Data.Map.Strict as Map
import Data.Serialize (encode, decode)
import Data.Map.Strict (Map)
import Data.Text (Text)
import qualified Data.Text as T
import NLP.Tokenize.Text (tokenize)
import NLP.FullStop (segment)
import NLP.Types ( tagUNK, Sentence, TaggedSentence
, Tag, POSTagger(..))
taggerID :: ByteString
taggerID = pack "NLP.POS.LiteralTagger"
mkTagger :: Map Text Tag -> Maybe POSTagger -> POSTagger
mkTagger table mTgr = POSTagger { posTagger = tag table
, posTrainer = \_ -> return $ mkTagger table mTgr
, posBackoff = mTgr
, posTokenizer = tokenize
, posSplitter = (map T.pack) . segment . T.unpack
, posSerialize = encode table
, posID = taggerID
}
tag :: Map Text Tag -> [Sentence] -> [TaggedSentence]
tag table ss = map (tagSentence table) ss
tagSentence :: Map Text Tag -> Sentence -> TaggedSentence
tagSentence table toks = map findTag toks
where
findTag :: Text -> (Text, Tag)
findTag txt = (txt, Map.findWithDefault tagUNK txt table)
readTagger :: ByteString -> Maybe POSTagger -> Either String POSTagger
readTagger bs backoff = do
model <- decode bs
return $ mkTagger model backoff