{-# LANGUAGE OverloadedStrings #-}
module NLP.Sequor
    ( ModelData
    , P.Trace
    , Template.Feature
    , Config
    , Token
    , Label
    , Sentence
    , train
    , predict
    , parseTemplate
    , defaultFlags
    ) where

import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.IntMap as IntMap
import qualified Data.IntSet as IntSet
import Data.List (foldl', tails)
import Data.Maybe (fromMaybe, catMaybes)
import Helper.ListZipper
import qualified NLP.Perceptron.Sequence as P
import NLP.Perceptron.Sequence (Options(..))
import NLP.Sequor.CoNLL
import Helper.Utils (splitWith, uniq)
import Helper.Atom
import Control.Monad.RWS
import NLP.Sequor.Features ( inputFeatures, features, maybeFeatures
                           , outputFeatures, indexFeatures )
import qualified NLP.Sequor.FeatureTemplate as Template
import qualified Data.Array as A
import qualified Data.Vector.Unboxed as V
import qualified Data.Binary as Binary
import qualified Helper.Text as Text
import Helper.Text (Txt)
import qualified Data.Text.Lazy as Text
import Data.Char
import NLP.Sequor.Config
import Text.Printf
import Debug.Trace

data ModelData = ModelData
    { model  :: P.Model  -- ^ Sequence perceptron model
    , config :: Config   -- ^ Model configuration options
    }

instance Binary.Binary ModelData where
    get = return ModelData `ap` Binary.get `ap` Binary.get
    put (ModelData a b) = Binary.put a >> Binary.put b

-- | @predict model xs@ returns the best label sequence for
-- each sentence in @xs@. A sentence is a sequence of 'Token's.
predict :: ModelData -> [[Token]] -> [[Label]]
predict m testdat =
    let bounds = oFeatBounds . P.options . model $ m
        tab    = fromMaybe (error "NLP.Sequor.predict: Nothing")
                 . atomTable . config $ m
    in fst . flip runAtoms tab $
         -- truncate each token to the number of fields seen at training time
         forM (map (toZippers . map (take (fieldNumber m))) testdat) $ \x -> do
           x' <- mapM (maybeFeatures bounds (config m)) x
           predict' (P.decode (model m)) x'

-- | @train flags template training development@ trains a model on the
-- training sentences using the given flags and feature template, and
-- returns the model together with, for each iteration, the error rate on
-- the training and development sentences.
train :: Flags
      -> Template.Feature
      -> [(Sentence, [Label])]
      -> [(Sentence, [Label])]
      -> (ModelData, P.Trace)
train fs template traindat heldout =
    let -- number of fields in the first token of the first sentence
        len  = length . head . fst . head $ traindat
        conf = Config { featureTemplate = template
                      , atomTable       = Nothing
                      , flags           = fs
                      , fieldNum        = len
                      }
        ((m, _predicted, info), _atoms) =
            runAtoms (run conf (zippify traindat) (zippify heldout)) empty
    in (m, info)

-- | @parseTemplate s@ parses the feature template in @s@ and returns the
-- result.
parseTemplate :: Text.Text -> Template.Feature
parseTemplate = Template.parse

-- | Default training settings.
defaultFlags :: Flags
defaultFlags = Flags
    { flagRate          = 0.01
    , flagRateDecay     = 0.0
    , flagBeam          = 10
    , flagIter          = 10
    , flagMinFeatCount  = 100
    , flagHeldout       = Nothing
    , flagHash          = False
    , flagHashSample    = 1000
    , flagHashMaxSize   = Nothing
    , flagStopWinSize   = 5
    , flagStopThreshold = 0.05
    }
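
-- A minimal end-to-end usage sketch (illustrative, not part of the library:
-- @tagUnseen@, @templateSrc@, @labeled@ and @unseen@ are hypothetical names).
-- It reuses the labeled data as the development set, which is the simplest
-- way to satisfy 'train'; a real caller would hold out a separate set:
--
-- > tagUnseen :: Text.Text -> [(Sentence, [Label])] -> [[Token]] -> [[Label]]
-- > tagUnseen templateSrc labeled unseen =
-- >     let template    = parseTemplate templateSrc
-- >         (m, _trace) = train defaultFlags template labeled labeled
-- >     in  predict m unseen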
-- Implementation

fieldNumber :: ModelData -> Int
fieldNumber = fieldNum . config

type F   = Int
type Tag = Int

zippify :: [([Token], [Txt])] -> [([ListZipper Token], [Txt])]
zippify = map (\(x, y) -> (toZippers x, y))

-- | Map each sufficiently frequent word (count at least @wmin@) to the
-- list of tags it was seen with in the training data.
tagDictionary :: IntSet.IntSet -> Int -> [([V.Vector Int], [F])]
              -> IntMap.IntMap [Tag]
tagDictionary indexFeatureSet wmin trainset =
    let tags = concat . map snd $ trainset
        ws   = catMaybes
               . map (V.find (`IntSet.member` indexFeatureSet))
               . concat
               . map fst
               $ trainset
        count_ws = IntMap.fromListWith (+) [ (w, 1) | w <- ws ]
        dict = IntMap.map Set.toList
               . IntMap.fromListWith Set.union
               $ [ (w, Set.singleton t)
                 | (w, t) <- zip ws tags
                 , count_ws IntMap.! w >= wmin ]
    in dict == dict `seq` dict  -- (==) forces full evaluation of the map

pruneLabels :: Int -> [(x, [Txt])] -> [(x, [Txt])]
pruneLabels lim xys =
    let freq  = Map.fromListWith (+) . map (\y -> (y, 1)) . concat . map snd $ xys
        undet = "UNDETERMINED"
    in [ (x, [ if freq Map.! yi < lim then undet else yi | yi <- y ])
       | (x, y) <- xys ]

run :: (Functor m, MonadAtoms m)
    => Config
    -> [([ListZipper Token], [Txt])]
    -> [([ListZipper Token], [Txt])]
    -> m (ModelData, [[Txt]], P.Trace)
run conf trainset_in testset_in = do
  let --trainset_in = pruneLabels (minLabelFreq conf) trainset_in_full
      ys = uniq . concat . map snd $ trainset_in :: [Txt]
  ys'  <- mapM toAtom ys
  outm <- mkOutputFeatureAtoms . map snd $ trainset_in
  let size = outputFeatureCount outm
             + maybe (estimateFeatureCount conf . map fst $ trainset_in) id
                     (flagHashMaxSize . flags $ conf)
      bounds = if flagHash . flags $ conf then Just (0, size) else Nothing
  trainset <- mapM (mkfs $ features bounds conf) trainset_in
  testset  <- mapM (mkfs $ maybeFeatures bounds conf) testset_in
  tab <- table
  let indexFeatureSet = indexFeatures tab
      conf' = conf { atomTable = Just tab }
      opts = Options { oYMap          = outm
                     , oIndexSet      = indexFeatureSet
                     , oYDict         = tagDictionary indexFeatureSet
                                            (flagMinFeatCount . flags $ conf')
                                            trainset
                     , oYs            = ys'
                     , oBeam          = flagBeam . flags $ conf
                     , oRate          = flagRate . flags $ conf
                     , oRateDecay     = flagRateDecay . flags $ conf
                     , oEpochs        = flagIter . flags $ conf
                     , oFeatBounds    = bounds
                     , oStopWinSize   = flagStopWinSize . flags $ conf
                     , oStopThreshold = flagStopThreshold . flags $ conf
                     }
      (m, info) = P.train opts testset trainset
  ps <- mapM (predict' (P.decode m . fst)) testset
  return (ModelData { model = m, config = conf' }, ps, info)

predict' :: (MonadAtoms m) => (t -> [Int]) -> t -> m [Txt]
predict' dec x = mapM fromAtom (dec x)

mkOutputFeatureAtoms :: (MonadAtoms m) => [[Txt]] -> m P.YMap
mkOutputFeatureAtoms yss = do
  let unigrams = map return . uniq . concat $ yss
      bigrams  = uniq $ concat [ filter ((== 2) . length) . map (take 2) . tails $ ys
                               | ys <- yss ]
  unigramis <- mapM (mapM toAtom) unigrams
  bigramis  <- mapM (mapM toAtom) bigrams
  let ys = map head unigramis
      (lo, hi) = (minimum ys, maximum ys)
  unigramfs <- mapM (mapM toAtom) . map outputFeatures $ unigrams
  bigramfs  <- mapM (mapM toAtom) . map outputFeatures $ bigrams
  zerofs    <- mapM toAtom . outputFeatures $ []
  let ymap1 = A.accumArray (V.++) V.empty (lo, hi)
              . zip (map head unigramis)
              . map V.fromList
              $ unigramfs
      ymap2 = A.accumArray (V.++) V.empty ((lo, lo), (hi, hi))
              . zip (map (\[y1, y2] -> (y1, y2)) bigramis)
              . map V.fromList
              $ bigramfs
  return (V.fromList zerofs, ymap1, ymap2)

-- | The largest atom id assigned to an output feature; used when sizing
-- the hashed feature space.
outputFeatureCount :: P.YMap -> Int
outputFeatureCount (zero, uni, bi) =
    maximum (V.toList zero
             ++ concatMap V.toList (A.elems uni)
             ++ concatMap V.toList (A.elems bi))

mkfs :: (MonadAtoms m)
     => (ListZipper Token -> m (V.Vector F))
     -> ([ListZipper Token], [Txt])
     -> m ([V.Vector F], [Tag])
mkfs f (x, y) = do
  fs <- mapM f x
  fs == fs `seq` return ()  -- (==) forces the feature vectors
  y' <- mapM toAtom y
  y' == y' `seq` return ()  -- (==) forces the tag atoms
  return (fs, y')

-- | Estimate the number of distinct input features by counting them on a
-- sample of the sentences and extrapolating to the full corpus.
estimateFeatureCount :: Config -> [[ListZipper Token]] -> Int
estimateFeatureCount conf xs =
    let len    = length xs
        size   = min len . flagHashSample . flags $ conf
        factor = len `div` size
        tokno  = (factor *)
                 . length
                 . uniq
                 . concatMap (concatMap (inputFeatures conf))
                 . take size
                 $ xs
    in tokno
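
-- The extrapolation in 'estimateFeatureCount' can be read in isolation:
-- count the distinct features on a sample and scale the count by the ratio
-- of corpus size to sample size. A self-contained sketch of the same idea
-- over plain @Int@ features (@estimateCount@ is a hypothetical name, not
-- part of this module):
--
-- > import Data.List (nub)
-- >
-- > estimateCount :: Int -> [[Int]] -> Int
-- > estimateCount sampleSize xs =
-- >     let size   = min (length xs) sampleSize
-- >         factor = length xs `div` size
-- >     in  factor * length (nub (concat (take size xs)))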