{-# LANGUAGE OverloadedStrings #-} module Labeler ( ModelData(..) , Config(..) , train , predict ) where import qualified Data.Map as Map import qualified Data.Set as Set import qualified Data.IntMap as IntMap import qualified Data.IntSet as IntSet import Data.List (foldl',tails) import Data.Maybe (fromMaybe) import Helper.ListZipper import qualified Perceptron.Sequence as P import Perceptron.Sequence (Options(..)) import CorpusReader (Token) import Helper.Utils (splitWith,uniq) import Text.Printf import Helper.Atom import Control.Monad.RWS import Features (inputFeatures,features,maybeFeatures,outputFeatures ,indexFeatures) import qualified Data.Array as A import qualified Data.Vector.Unboxed as V import qualified Data.Binary as Binary import qualified Helper.Text as Text import Helper.Text(Txt) import Data.Char import Data.Maybe (catMaybes) import Config data ModelData = ModelData { model :: P.Model , config :: Config } instance Binary.Binary ModelData where get = return ModelData `ap` Binary.get `ap` Binary.get put (ModelData a b) = Binary.put a >> Binary.put b -- Main exported functions predict :: ModelData -> [[ListZipper Token]] -> [[Txt]] predict m testdat = let bounds = oFeatBounds . P.options . model $ m in fst . flip runAtoms (atomTable . config $ m) $ do flip mapM testdat $ \x -> do x' <- mapM (maybeFeatures bounds (config m)) $ x predict' (P.decode (model m)) $ x' train :: Config -> [([ListZipper Token],[Txt])] -> [([ListZipper Token],[Txt])] -> ModelData train conf traindat heldout = let ((m,_predicted),_atoms) = runAtoms (run conf traindat heldout) $ empty in m -- Implementation type F = Int type Tag = Int tagDictionary :: IntSet.IntSet -> Int -> [([V.Vector Int], [F])] -> IntMap.IntMap [Tag] tagDictionary indexFeatureSet wmin trainset = let tags = concat . map snd $ trainset ws = catMaybes . map (V.find (`IntSet.member` indexFeatureSet)) . concat . map fst $ trainset count_ws = IntMap.fromListWith (+) [ (w,1) | w <- ws ] dict = IntMap.map Set.toList . IntMap.fromListWith Set.union $ [ (w,Set.singleton t) | (w,t) <- zip ws tags , count_ws IntMap.! w >= wmin] in dict == dict `seq` dict pruneLabels :: Int -> [(x,[Txt])] -> [(x,[Txt])] pruneLabels lim xys = let freq = Map.fromListWith (+) . map (\y -> (y,1)) . concat . map snd $ xys undet = "UNDETERMINED" in [ (x,[ if freq Map.! yi < lim then undet else yi | yi <- y ]) | (x,y) <- xys ] run :: (Functor m, MonadAtoms m) => Config -> [([ListZipper Token], [Txt])] -> [([ListZipper Token], [Txt])] -> m (ModelData, [[Txt]]) run conf trainset_in testset_in = do let --trainset_in = pruneLabels (minLabelFreq conf) trainset_in_full ys = uniq . concat . map snd $ trainset_in :: [Txt] ys' <- mapM toAtom ys outm <- mkOutputFeatureAtoms . map snd $ trainset_in let size = outputFeatureCount outm + maybe (estimateFeatureCount conf . map fst $ trainset_in) id (flagHashMaxSize . flags $ conf) bounds = if flagHash . flags $ conf then Just (0,size) else Nothing trainset <- mapM (mkfs $ features bounds conf) trainset_in testset <- mapM (mkfs $ maybeFeatures bounds conf) testset_in tab <- table let indexFeatureSet = indexFeatures tab conf' = conf {atomTable = tab } opts = Options { oYMap = outm , oIndexSet = indexFeatureSet , oYDict = tagDictionary indexFeatureSet (flagMinFeatCount . flags $ conf') trainset , oYs = ys' , oBeam = flagBeam . flags $ conf , oRate = flagRate . flags $ conf , oEpochs = flagIter . flags $ conf , oFeatBounds = bounds } m = P.train opts testset formatEval trainset ps <- mapM (predict' (P.decode m . fst)) testset return $ (ModelData { model = m , config = conf' } ,ps) predict' :: (MonadAtoms m) => (t -> [Int]) -> t -> m [Txt] predict' dec x = do let xr = dec x xr'<- mapM fromAtom xr return xr' mkOutputFeatureAtoms :: (MonadAtoms m) => [[Txt]] -> m P.YMap mkOutputFeatureAtoms yss = do let unigrams = map return . uniq . concat $ yss bigrams = uniq $ concat [ filter ((==2) . length) . map (take 2) . tails $ ys | ys <- yss ] unigramis <- mapM (mapM toAtom) unigrams bigramis <- mapM (mapM toAtom) bigrams let ys = map head unigramis (lo,hi) = (minimum ys,maximum ys) unigramfs <- mapM (mapM toAtom) . map outputFeatures $ unigrams bigramfs <- mapM (mapM toAtom) . map outputFeatures $ bigrams zerofs <- mapM toAtom . outputFeatures $ [] let ymap1 = A.accumArray (V.++) V.empty (lo,hi) . zip (map head unigramis) . map V.fromList $ unigramfs ymap2 = A.accumArray (V.++) V.empty ((lo,lo),(hi,hi)) . zip (map (\ [y1,y2] -> (y1,y2)) bigramis) . map V.fromList $ bigramfs return $ (V.fromList zerofs, ymap1, ymap2) outputFeatureCount :: P.YMap -> Int outputFeatureCount (zero,uni,bi) = maximum (V.toList zero ++ (concatMap V.toList . A.elems $ uni) ++ (concatMap V.toList . A.elems $ bi )) mkfs :: (MonadAtoms m) => (ListZipper Token -> m (V.Vector F)) -> ([ListZipper Token], [Txt]) -> m ([V.Vector F], [Tag]) mkfs f (x,y) = do fs <- mapM f x fs == fs `seq` return () y' <- mapM toAtom y y' == y' `seq` return () return $ (fs,y') estimateFeatureCount :: Config -> [[ListZipper Token]] -> Int estimateFeatureCount conf xs = let len = length xs size = min len . flagHashSample . flags $ conf factor = length xs `div` size tokno = (factor *) . length . uniq . concatMap (concatMap (inputFeatures conf)) . take size $ xs in tokno formatEval :: P.Eval formatEval 0 _ _ = printf "%10s %10s %10s" ("Iter"::String) ("Train"::String) ("Heldout"::String) formatEval i ss heldout = printf "%10d %10.4f %10.4f" i (eval ss) (eval heldout) eval :: Eq a => [([a],[a])] -> Double eval ys = let corr = foldl' (+) 0 . concat $ [ [ 1 | (y,y') <- ys , (yi,yi') <- zip y y' , yi == yi' ] ] in corr / fromIntegral (length . concatMap fst $ ys)