module NLP.Chunk
where
import Codec.Compression.GZip (compress, decompress)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.List (isSuffixOf)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Serialize (decode, encode)
import Data.Text (Text)
import qualified Data.Text as T
import System.FilePath ((</>))
import NLP.POS (tag)
import NLP.Types
import NLP.Chunk.AvgPerceptronChunker (Chunker(..))
import qualified NLP.Chunk.AvgPerceptronChunker as Avg
import qualified NLP.Corpora.Conll as C
import Paths_chatter
-- | The chunker used when the caller has no specific preference.
--
-- Currently an alias for 'conllChunker'.
defaultChunker :: IO (Chunker C.Chunk C.Tag)
defaultChunker = conllChunker
-- | Load the chunker model shipped with the package.
--
-- The model file is looked up relative to the directory returned by
-- 'getDataDir', at @data/models/conll2000.chunk.model.gz@, and read with
-- 'loadChunker'.
conllChunker :: IO (Chunker C.Chunk C.Tag)
conllChunker = loadChunker . modelPath =<< getDataDir
  where
    -- Location of the model file below the given data directory.
    modelPath :: FilePath -> FilePath
    modelPath base = base </> "data" </> "models" </> "conll2000.chunk.model.gz"
-- | Train a chunker on a list of chunked sentences.
--
-- This simply delegates to the 'chTrainer' field of the supplied chunker
-- and returns whatever chunker that action produces.
train :: (ChunkTag c, Tag t) => Chunker c t -> [ChunkedSentence c t] -> IO (Chunker c t)
train chunker = chTrainer chunker
-- | Chunk a list of POS-tagged sentences.
--
-- This simply delegates to the 'chChunker' field of the supplied chunker.
chunk :: (ChunkTag c, Tag t) => Chunker c t -> [TaggedSentence t] -> [ChunkedSentence c t]
chunk chunker = chChunker chunker
-- | Tag and chunk raw text, returning a textual rendering of the result.
--
-- The input is tagged with 'tag', the tagged sentences are passed through
-- 'chunk', each chunked sentence is rendered with 'showChunkedSent', and
-- the rendered sentences are joined with a single space.
chunkText :: (ChunkTag c, Tag t) => POSTagger t -> Chunker c t -> Text -> Text
chunkText tgr chk =
  T.intercalate " " . map showChunkedSent . chunk chk . tag tgr
-- | 'String' variant of 'chunkText'.
--
-- Packs the input into 'Text', runs 'chunkText', and unpacks the result.
chunkStr :: (ChunkTag c, Tag t) => POSTagger t -> Chunker c t -> String -> String
chunkStr tgr chk = T.unpack . chunkText tgr chk . T.pack
-- | Registry of chunker readers, keyed by chunker ID.
--
-- 'deserialize' looks up the ID stored in a serialized model here to find
-- the function that rebuilds the chunker from its payload.  The only
-- entry maps 'Avg.chunkerID' to 'Avg.readChunker'.
chunkerTable
  :: (ChunkTag c, Tag t)
  => Map ByteString (ByteString -> Either String (Chunker c t))
chunkerTable = Map.singleton Avg.chunkerID Avg.readChunker
-- | Write a chunker to disk in the format understood by 'loadChunker'.
--
-- The chunker is turned into bytes with 'serialize'.  When the target
-- path ends in @.gz@ the bytes are gzip-compressed before being written,
-- mirroring 'loadChunker', which gunzips every path with that suffix.
-- Any other path receives the serialized bytes unchanged.
saveChunker :: (ChunkTag c, Tag t) => Chunker c t -> FilePath -> IO ()
saveChunker chunker file
  -- Without this branch a model saved as "foo.gz" would be written
  -- uncompressed and could not be read back by 'loadChunker'.
  | ".gz" `isSuffixOf` file = LBS.writeFile file (compress (LBS.fromStrict payload))
  | otherwise               = BS.writeFile file payload
  where
    payload :: ByteString
    payload = serialize chunker
-- | Read a saved chunker model from disk.
--
-- Paths ending in @.gz@ are gunzipped before decoding; any other path is
-- used as-is.  The resulting bytes are handed to 'deserialize' together
-- with 'chunkerTable'.
--
-- Calls 'error' with the decoder's message when the contents cannot be
-- turned into a chunker.
loadChunker :: (ChunkTag c, Tag t) => FilePath -> IO (Chunker c t)
loadChunker file = do
  content <- getContent file
  case deserialize chunkerTable content of
    -- 'error' is kept so the exception type seen by existing callers
    -- does not change.
    Left err      -> error err
    Right chunker -> return chunker
  where
    -- Fetch the raw (already decompressed) bytes of the model file.
    getContent :: FilePath -> IO ByteString
    getContent f
      -- The guard inspects the helper's own argument rather than the
      -- captured outer name.  The file is read strictly so its handle is
      -- closed before decompression starts, instead of depending on the
      -- lazy reader reaching end-of-file.
      | ".gz" `isSuffixOf` f =
          fmap (LBS.toStrict . decompress . LBS.fromStrict) (BS.readFile f)
      | otherwise = BS.readFile f
-- | Turn a chunker into bytes.
--
-- The output is the 'encode'd pair of the chunker's 'chId' and its
-- 'chSerialize' payload, in that order; 'deserialize' expects the same
-- pair.
serialize :: (ChunkTag c, Tag t) => Chunker c t -> ByteString
serialize chunker = encode (ident, payload)
  where
    ident   = chId chunker
    payload = chSerialize chunker
-- | Rebuild a chunker from serialized bytes.
--
-- The bytes are 'decode'd into an (ID, payload) pair.  The ID selects a
-- reader from the supplied table, and that reader is applied to the
-- payload.
--
-- Returns 'Left' when decoding fails, when no reader is registered under
-- the decoded ID, or when the selected reader itself returns 'Left'.
deserialize :: (ChunkTag c, Tag t) =>
               Map ByteString
                   (ByteString -> Either String (Chunker c t))
            -> ByteString
            -> Either String (Chunker c t)
deserialize table bs =
  decode bs >>= \(theID, payload) ->
    maybe (Left (unknownID theID)) ($ payload) (Map.lookup theID table)
  where
    -- Message reported when the decoded ID has no entry in the table.
    unknownID :: ByteString -> String
    unknownID ident = "Could not find ID in Chunker function map: " ++ show ident