module NLP.POS
( tag
, tagStr
, tagText
, train
, trainStr
, trainText
, eval
, serialize
, deserialize
, taggerTable
, saveTagger
, loadTagger
, defaultTagger
)
where
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.List (isSuffixOf)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Text (Text)
import qualified Data.Text as T
import Data.Serialize (encode, decode)
import Codec.Compression.GZip (decompress)
import System.FilePath ((</>))
import NLP.Corpora.Parsing (readPOS)
import NLP.Types (TaggedSentence, Tag(..), Sentence
, POSTagger(..), tagUNK, stripTags)
import qualified NLP.POS.LiteralTagger as LT
import qualified NLP.POS.UnambiguousTagger as UT
import qualified NLP.POS.AvgPerceptronTagger as Avg
import Paths_chatter
defaultTagger :: IO POSTagger
defaultTagger = do
dir <- getDataDir
loadTagger (dir </> "data" </> "models" </> "brown-train.model.gz")
taggerTable :: Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger)
taggerTable = Map.fromList
[ (LT.taggerID, LT.readTagger)
, (Avg.taggerID, Avg.readTagger)
, (UT.taggerID, UT.readTagger)
]
saveTagger :: POSTagger -> FilePath -> IO ()
saveTagger tagger file = BS.writeFile file (serialize tagger)
loadTagger :: FilePath -> IO POSTagger
loadTagger file = do
content <- getContent file
case deserialize taggerTable content of
Left err -> error err
Right tgr -> return tgr
where
getContent :: FilePath -> IO ByteString
getContent f | ".gz" `isSuffixOf` file = fmap (LBS.toStrict . decompress) $ LBS.readFile f
| otherwise = BS.readFile f
serialize :: POSTagger -> ByteString
serialize tagger =
let backoff = case posBackoff tagger of
Nothing -> Nothing
Just btgr -> Just $ serialize btgr
in encode ( posID tagger
, posSerialize tagger
, backoff
)
deserialize :: Map ByteString (ByteString -> Maybe POSTagger -> Either String POSTagger)
-> ByteString
-> Either String POSTagger
deserialize table bs = do
(theID, theTgr, mBackoff) <- decode bs
backoff <- case mBackoff of
Nothing -> Right Nothing
Just str -> Just `fmap` (deserialize table str)
case Map.lookup theID table of
Nothing -> Left ("Could not find ID in POSTagger function map: " ++ show theID)
Just fn -> fn theTgr backoff
tag :: POSTagger -> Text -> [TaggedSentence]
tag p txt = let sentences = (posSplitter p) txt
tokens = map (posTokenizer p) sentences
in tagTokens p tokens
tagTokens :: POSTagger -> [Sentence] -> [TaggedSentence]
tagTokens p tokens = let priority = (posTagger p) tokens
in case posBackoff p of
Nothing -> priority
Just tgr -> combine priority (tagTokens tgr tokens)
combine :: [TaggedSentence] -> [TaggedSentence] -> [TaggedSentence]
combine xs ys = zipWith combineSentences xs ys
combineSentences :: TaggedSentence -> TaggedSentence -> TaggedSentence
combineSentences xs ys = zipWith pickTag xs ys
pickTag :: (Text, Tag) -> (Text, Tag) -> (Text, Tag)
pickTag a@(txt1, t1) b@(txt2, t2) | txt1 /= txt2 = error ("Text does not match: "++ show a ++ " " ++ show b)
| t1 /= tagUNK = (txt1, t1)
| otherwise = (txt1, t2)
tagStr :: POSTagger -> String -> String
tagStr tgr = T.unpack . tagText tgr . T.pack
tagText :: POSTagger -> Text -> Text
tagText tgr str = T.intercalate " " $ map toTaggedTok taggedSents
where
taggedSents = concat $ tag tgr str
toTaggedTok :: (Text, Tag) -> Text
toTaggedTok (tok, Tag c) = tok `T.append` (T.cons '/' c)
trainStr :: POSTagger -> String -> IO POSTagger
trainStr tgr = trainText tgr . T.pack
trainText :: POSTagger -> Text -> IO POSTagger
trainText p exs = train p (map readPOS $ (posTokenizer p) exs)
train :: POSTagger -> [TaggedSentence] -> IO POSTagger
train p exs = do
let
trainBackoff = case posBackoff p of
Nothing -> return $ Nothing
Just b -> do tgr <- train b exs
return $ Just tgr
trainer = posTrainer p
newTgr <- trainer exs
newBackoff <- trainBackoff
return (newTgr { posBackoff = newBackoff })
eval :: POSTagger -> [TaggedSentence] -> Double
eval tgr oracle = let
sentences = map stripTags oracle
results = (posTagger tgr) sentences
totalTokens = fromIntegral $ sum $ map length oracle
isMatch :: (Text, Tag) -> (Text, Tag) -> Double
isMatch (_, rTag) (_, oTag) | rTag == oTag = 1
| otherwise = 0
in (sum $ zipWith isMatch (concat results) (concat oracle)) / totalTokens