module NLP.Nerf
( Nerf (..)
, train
, ner
, tryOx
, module NLP.Nerf.Types
) where
import Control.Applicative ((<$>), (<*>))
import Data.Binary (Binary, put, get)
import Data.Foldable (foldMap)
import Data.List (intercalate)
import qualified Data.Text as T
import qualified Data.Text.Lazy.IO as L
import Text.Named.Enamex (parseEnamex)
import qualified Data.Named.Tree as N
import qualified Data.Named.IOB as IOB
import Numeric.SGD (SgdArgs)
import qualified Data.CRF.Chain1 as CRF
import NLP.Nerf.Types
import NLP.Nerf.Tokenize (tokenize, moveNEs)
import NLP.Nerf.Schema (SchemaConf, Schema, fromConf, schematize)
data Nerf = Nerf
{ schemaConf :: SchemaConf
, crf :: CRF.CRF Ob Lb }
instance Binary Nerf where
put Nerf{..} = put schemaConf >> put crf
get = Nerf <$> get <*> get
flatten :: Schema a -> N.NeForest NE Word -> CRF.SentL Ob Lb
flatten schema forest =
[ CRF.annotate x y
| (x, y) <- zip xs ys ]
where
iob = IOB.encodeForest forest
xs = schematize schema (map IOB.word iob)
ys = map IOB.label iob
reTokenize :: N.NeForest NE Word -> N.NeForest NE Word
reTokenize ft =
moveNEs ft ((doTok . leaves) ft)
where
doTok = map T.pack . tokenize . intercalate " " . map T.unpack
leaves = concatMap $ foldMap (either (const []) (:[]))
readDeep :: FilePath -> IO [N.NeForest NE Word]
readDeep path = map reTokenize . parseEnamex <$> L.readFile path
readFlat :: Schema a -> FilePath -> IO [CRF.SentL Ob Lb]
readFlat schema path = map (flatten schema) <$> readDeep path
drawSent :: CRF.SentL Ob Lb -> IO ()
drawSent sent = do
let unDist (x, y) = (x, CRF.unDist y)
mapM_ (print . unDist) sent
putStrLn ""
tryOx :: SchemaConf -> FilePath -> IO ()
tryOx cfg path = do
input <- readFlat (fromConf cfg) path
mapM_ drawSent input
train
:: SgdArgs
-> SchemaConf
-> FilePath
-> Maybe FilePath
-> IO Nerf
train sgdArgs cfg trainPath evalPathM = do
let schema = fromConf cfg
readTrain = readFlat schema trainPath
readEvalM = evalPathM >>= \evalPath ->
Just ([], readFlat schema evalPath)
_crf <- CRF.train sgdArgs readTrain readEvalM CRF.presentFeats
return $ Nerf cfg _crf
ner :: Nerf -> String -> N.NeForest NE Word
ner nerf sent =
let ws = map T.pack . tokenize $ sent
schema = fromConf (schemaConf nerf)
xs = CRF.tag (crf nerf) (schematize schema ws)
in IOB.decodeForest [IOB.IOB w x | (w, x) <- zip ws xs]