module NLP.Sequor
( ModelData
, P.Trace
, Template.Feature
, Config
, Token
, Label
, Sentence
, train
, predict
, parseTemplate
, defaultFlags
)
where
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.IntMap as IntMap
import qualified Data.IntSet as IntSet
import Data.List (foldl',tails)
import Data.Maybe (fromMaybe)
import Helper.ListZipper
import qualified NLP.Perceptron.Sequence as P
import NLP.Perceptron.Sequence (Options(..))
import NLP.Sequor.CoNLL
import Helper.Utils (splitWith,uniq)
import Helper.Atom
import Control.Monad.RWS
import NLP.Sequor.Features (inputFeatures,features,maybeFeatures,outputFeatures,indexFeatures)
import qualified NLP.Sequor.FeatureTemplate as Template
import qualified Data.Array as A
import qualified Data.Vector.Unboxed as V
import qualified Data.Binary as Binary
import qualified Helper.Text as Text
import Helper.Text(Txt)
import qualified Data.Text.Lazy as Text
import Data.Char
import Data.Maybe (catMaybes)
import NLP.Sequor.Config
import Text.Printf
import Debug.Trace
data ModelData = ModelData { model :: P.Model
, config :: Config
}
instance Binary.Binary ModelData where
get = return ModelData `ap` Binary.get `ap` Binary.get
put (ModelData a b) = Binary.put a >> Binary.put b
predict :: ModelData -> [[Token]] -> [[Label]]
predict m testdat =
let bounds = oFeatBounds . P.options . model $ m
in fst . flip runAtoms (maybe (error "NLP.Sequor.predict:Nothing") id . atomTable . config $ m)
$ do flip mapM (map (toZippers . map (take (fieldNumber m))) testdat) $ \x ->
do x' <- mapM (maybeFeatures bounds (config m)) $ x
predict' (P.decode (model m)) $ x'
train :: Flags
-> Template.Feature
-> [(Sentence, [Label])]
-> [(Sentence, [Label])]
-> (ModelData, P.Trace)
train fs template traindat heldout =
let len = length . (\(x:_) -> x) . fst . (\(x:_) -> x) $ traindat
conf = Config { featureTemplate = template
, atomTable = Nothing
, flags = fs
, fieldNum = len }
((m,_predicted, info),_atoms) =
runAtoms (run conf
(zippify traindat)
(zippify heldout))
$ empty
in (m, info)
parseTemplate :: Text.Text -> Template.Feature
parseTemplate = Template.parse
defaultFlags :: Flags
defaultFlags = Flags { flagRate = 0.01
, flagBeam = 10
, flagIter = 10
, flagMinFeatCount = 100
, flagHeldout = Nothing
, flagHash = False
, flagHashSample = 1000
, flagHashMaxSize = Nothing
, flagStopWinSize = 5
, flagStopThreshold = 0.05
}
fieldNumber :: ModelData -> Int
fieldNumber = fieldNum . config
type F = Int
type Tag = Int
zippify :: [([Token], [Txt])] -> [([ListZipper Token], [Txt])]
zippify = map (\ (x, y) -> (toZippers x, y))
tagDictionary :: IntSet.IntSet
-> Int
-> [([V.Vector Int], [F])]
-> IntMap.IntMap [Tag]
tagDictionary indexFeatureSet wmin trainset =
let tags = concat . map snd $ trainset
ws = catMaybes
. map (V.find (`IntSet.member` indexFeatureSet))
. concat
. map fst
$ trainset
count_ws = IntMap.fromListWith (+) [ (w,1) | w <- ws ]
dict = IntMap.map Set.toList
. IntMap.fromListWith Set.union
$ [ (w,Set.singleton t) | (w,t) <- zip ws tags
, count_ws IntMap.! w >= wmin]
in dict == dict `seq` dict
pruneLabels :: Int -> [(x,[Txt])] -> [(x,[Txt])]
pruneLabels lim xys =
let freq = Map.fromListWith (+)
. map (\y -> (y,1))
. concat
. map snd
$ xys
undet = "UNDETERMINED"
in [ (x,[ if freq Map.! yi < lim then undet else yi | yi <- y ])
| (x,y) <- xys ]
run :: (Functor m, MonadAtoms m) =>
Config
-> [([ListZipper Token], [Txt])]
-> [([ListZipper Token], [Txt])]
-> m (ModelData, [[Txt]], P.Trace)
run conf trainset_in testset_in = do
let
ys = uniq . concat . map snd $ trainset_in :: [Txt]
ys' <- mapM toAtom ys
outm <- mkOutputFeatureAtoms . map snd $ trainset_in
let size = outputFeatureCount outm +
maybe (estimateFeatureCount conf . map fst $ trainset_in)
id
(flagHashMaxSize . flags $ conf)
bounds = if flagHash . flags $ conf
then Just (0,size)
else Nothing
trainset <- mapM (mkfs $ features bounds conf) trainset_in
testset <- mapM (mkfs $ maybeFeatures bounds conf) testset_in
tab <- table
let indexFeatureSet = indexFeatures tab
conf' = conf {atomTable = Just tab }
opts = Options { oYMap = outm
, oIndexSet = indexFeatureSet
, oYDict = tagDictionary indexFeatureSet
(flagMinFeatCount . flags $ conf') trainset
, oYs = ys'
, oBeam = flagBeam . flags $ conf
, oRate = flagRate . flags $ conf
, oEpochs = flagIter . flags $ conf
, oFeatBounds = bounds
, oStopWinSize = flagStopWinSize . flags $ conf
, oStopThreshold = flagStopThreshold . flags $ conf
}
(m, info) = P.train opts testset trainset
ps <- mapM (predict' (P.decode m . fst)) testset
return (ModelData { model = m , config = conf' } , ps, info)
predict' :: (MonadAtoms m) =>
(t -> [Int]) -> t -> m [Txt]
predict' dec x = do
let xr = dec x
xr'<- mapM fromAtom xr
return xr'
mkOutputFeatureAtoms :: (MonadAtoms m) => [[Txt]] -> m P.YMap
mkOutputFeatureAtoms yss = do
let unigrams = map return . uniq . concat $ yss
bigrams = uniq $ concat [ filter ((==2) . length)
. map (take 2)
. tails
$ ys | ys <- yss ]
unigramis <- mapM (mapM toAtom) unigrams
bigramis <- mapM (mapM toAtom) bigrams
let ys = map head unigramis
(lo,hi) = (minimum ys,maximum ys)
unigramfs <- mapM (mapM toAtom) . map outputFeatures $ unigrams
bigramfs <- mapM (mapM toAtom) . map outputFeatures $ bigrams
zerofs <- mapM toAtom . outputFeatures $ []
let ymap1 = A.accumArray (V.++) V.empty (lo,hi)
. zip (map head unigramis)
. map V.fromList
$ unigramfs
ymap2 = A.accumArray (V.++) V.empty ((lo,lo),(hi,hi))
. zip (map (\ [y1,y2] -> (y1,y2)) bigramis)
. map V.fromList
$ bigramfs
return $ (V.fromList zerofs, ymap1, ymap2)
outputFeatureCount :: P.YMap -> Int
outputFeatureCount (zero,uni,bi) =
maximum (V.toList zero
++ (concatMap V.toList . A.elems $ uni)
++ (concatMap V.toList . A.elems $ bi ))
mkfs :: (MonadAtoms m) =>
(ListZipper Token -> m (V.Vector F))
-> ([ListZipper Token], [Txt])
-> m ([V.Vector F], [Tag])
mkfs f (x,y) = do
fs <- mapM f x
fs == fs `seq` return ()
y' <- mapM toAtom y
y' == y' `seq` return ()
return $ (fs,y')
estimateFeatureCount :: Config -> [[ListZipper Token]] -> Int
estimateFeatureCount conf xs =
let len = length xs
size = min len . flagHashSample . flags $ conf
factor = length xs `div` size
tokno = (factor *)
. length
. uniq
. concatMap (concatMap (inputFeatures conf))
. take size
$ xs
in tokno