{-# LANGUAGE OverloadedStrings #-}
{-|
Module      : NLP.Chunk
Description : Phrase Chunking facilities.
Copyright   : Rogan Creswick, 2014
Maintainer  : creswick@gmail.com
Stability   : experimental

NLP.Chunk aims to make phrasal chunking trivially easy -- it is the
corollary to NLP.POS.

The simplest way to try out chunking with Chatter is to open a repl
after installing chatter and try this:

>> import NLP.POS
>> import NLP.Chunk
>> tgr <- defaultTagger
>> chk <- defaultChunker
>> chunkText tgr chk "Monads are monoids in the category of endofunctors."
> "[NP Monads/NNS are/VBP monoids/NNS] [PP in/IN] [NP the/DT category/NN] [PP of/IN] [NP endofunctors/NNS] ./."

Note that it isn't perfect--phrase chunking is tricky, and the
'defaultTagger' and 'defaultChunker' aren't trained on the largest
training set (they use Conll 2000).  You can easily train more
taggers and chunkers using the APIs exposed here if you have the
training data to do so.

-}
module NLP.Chunk
where

import Codec.Compression.GZip (decompress)
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import Data.List (isSuffixOf)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Serialize (decode, encode)
import Data.Text (Text)
import qualified Data.Text as T
import System.FilePath ((</>))

import NLP.POS (tag)
import NLP.Types
import NLP.Chunk.AvgPerceptronChunker (Chunker(..))
import qualified NLP.Chunk.AvgPerceptronChunker as Avg
import qualified NLP.Corpora.Conll as C

import Paths_chatter

-- | A basic Phrasal chunker.
--
-- This is currently just an alias for 'conllChunker'.
defaultChunker :: IO (Chunker C.Chunk C.Tag)
defaultChunker = conllChunker

-- | Convenient function to load the Conll2000 Chunker.
--
-- The model is read with 'loadChunker' from
-- @data/models/conll2000.chunk.model.gz@ underneath the directory
-- returned by 'getDataDir' (presumably the Cabal-generated data
-- directory from "Paths_chatter" -- confirm against the package
-- description).
conllChunker :: IO (Chunker C.Chunk C.Tag)
conllChunker = do
  dir <- getDataDir
  -- Build the path with '</>' so the separators are correct on every
  -- platform.
  loadChunker (dir </> "data" </> "models" </> "conll2000.chunk.model.gz")

-- | Train a chunker on a set of additional examples.
--
-- The work is delegated to the 'chTrainer' field of the supplied
-- chunker.
train :: (ChunkTag c, Tag t) => Chunker c t -> [ChunkedSentence c t] -> IO (Chunker c t)
train chunker examples = chTrainer chunker examples

-- | Chunk a list of 'TaggedSentence's that have been produced by a
-- Chatter tagger, producing a rich representation of the Chunks and
-- the Tags detected.
--
-- The work is delegated to the 'chChunker' field of the supplied
-- chunker.
--
-- If you just want to see chunked output from standard text, you
-- probably want 'chunkText' or 'chunkStr'.
chunk :: (ChunkTag c, Tag t) => Chunker c t -> [TaggedSentence t] -> [ChunkedSentence c t]
chunk chunker sentences = chChunker chunker sentences

-- | Convenience function to Tokenize, POS-tag, then Chunk the
-- provided text, and format the result in an easy-to-read format.
-- The rendered sentences are joined with single spaces.
--
-- > > tgr <- defaultTagger
-- > > chk <- defaultChunker
-- > > chunkText tgr chk "The brown dog jumped over the lazy cat."
-- > "[NP The/DT brown/NN dog/NN] [VP jumped/VBD] [NP over/IN the/DT lazy/JJ cat/NN] ./."
--
chunkText :: (ChunkTag c, Tag t) => POSTagger t -> Chunker c t -> Text -> Text
chunkText tgr chk =
  T.intercalate " " . map showChunkedSent . chunk chk . tag tgr

-- | A wrapper around 'chunkText' for plain 'String's: the input is
-- packed into 'Text' and the result is unpacked again.
chunkStr :: (ChunkTag c, Tag t) => POSTagger t -> Chunker c t -> String -> String
chunkStr tgr chk = T.unpack . chunkText tgr chk . T.pack

-- | The default table of chunker IDs to readChunker functions.  Each
-- chunker packaged with Chatter should have an entry here.  By
-- convention, the IDs used are the fully qualified module name of the
-- chunker package.
chunkerTable :: (ChunkTag c, Tag t)
             => Map ByteString (ByteString -> Either String (Chunker c t))
chunkerTable = Map.fromList knownChunkers
  where
    -- One (ID, reader) pair per chunker implementation.
    knownChunkers =
      [ (Avg.chunkerID, Avg.readChunker)
      ]

-- | Store a 'Chunker' to disk.
--
-- The bytes produced by 'serialize' are written to the given file
-- as-is.
saveChunker :: (ChunkTag c, Tag t) => Chunker c t -> FilePath -> IO ()
saveChunker chunker path = BS.writeFile path $ serialize chunker

-- | Load a 'Chunker' from disk, optionally gunzipping if
-- needed.
-- (based on file extension)
--
-- The file contents are decoded with 'deserialize' against
-- 'chunkerTable'; a decoding failure is raised via 'error' with the
-- decoder's message.
loadChunker :: (ChunkTag c, Tag t) => FilePath -> IO (Chunker c t)
loadChunker file = do
  raw <- readModel
  either error return (deserialize chunkerTable raw)
  where
    -- Files whose name ends in ".gz" are decompressed; anything else
    -- is read verbatim.
    readModel :: IO ByteString
    readModel
      | ".gz" `isSuffixOf` file =
          fmap (LBS.toStrict . decompress) (LBS.readFile file)
      | otherwise = BS.readFile file

-- | Encode a chunker as the pair of its ID ('chId') and its
-- serialized payload ('chSerialize').
serialize :: (ChunkTag c, Tag t) => Chunker c t -> ByteString
serialize chunker = encode (identifier, payload)
  where
    identifier = chId chunker
    payload    = chSerialize chunker

-- | Decode an (ID, payload) pair and hand the payload to the reader
-- registered for that ID in the supplied table.
--
-- Yields 'Left' when the bytes cannot be decoded, when the ID has no
-- entry in the table, or when the selected reader itself fails.
deserialize :: (ChunkTag c, Tag t)
            => Map ByteString (ByteString -> Either String (Chunker c t))
            -> ByteString
            -> Either String (Chunker c t)
deserialize table bs = do
  (key, payload) <- decode bs
  reader <- maybe (Left (unknownId key)) Right (Map.lookup key table)
  reader payload
  where
    unknownId key = "Could not find ID in Chunker function map: " ++ show key