{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TupleSections #-}
module NLP.Concraft.DAG.DisambSeg
(
Tag (..)
, Disamb (..)
, putDisamb
, getDisamb
, P.Tier (..)
, P.Atom (..)
, CRF.ProbType (..)
, probsSent
, probs
, TrainConf (..)
, train
, prune
) where
import Prelude hiding (words)
import Control.Applicative ((<$>), (<*>), pure)
import Data.Binary (Binary, put, get, Put, Get)
import Data.Text.Binary ()
import System.Console.CmdArgs
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Vector as V
import qualified Data.List as List
import qualified Data.DAG as DAG
import Data.DAG (DAG)
import qualified Control.Monad.Ox as Ox
import qualified Numeric.SGD.Momentum as SGD
import qualified Data.CRF.Chain2.Tiers.DAG as CRF
import qualified Data.Tagset.Positional as T
import qualified NLP.Concraft.Disamb.Positional as P
import NLP.Concraft.DAG.Schema hiding (schematize)
import qualified NLP.Concraft.DAG.Morphosyntax as X
import NLP.Concraft.DAG.Disamb (schematize)
data Tag = Tag
{ posiTag :: T.Tag
, hasEos :: Bool
} deriving (Show, Eq, Ord)
data Disamb t = Disamb
{ tiers :: [P.Tier]
, schemaConf :: SchemaConf
, crf :: CRF.CRF Ob P.Atom
, simplify :: t -> Tag
}
putDisamb :: Disamb t -> Put
putDisamb Disamb{..} =
put tiers >> put schemaConf >> put crf
getDisamb :: (t -> Tag) -> Get (Disamb t)
getDisamb smp =
Disamb <$> get <*> get <*> get <*> pure smp
inject
:: (Ord t, X.Word w)
=> Disamb t
-> DAG () (X.WMap [P.Atom])
-> X.Sent w t
-> X.Sent w t
inject dmb newSent srcSent =
let doit (target, src) =
let oldTags = X.tags src
newTags = injectWMap dmb target oldTags
in src {X.tags = newTags}
in fmap doit (DAG.zipE newSent srcSent)
injectWMap
:: (Ord t)
=> Disamb t
-> X.WMap [P.Atom]
-> X.WMap t
-> X.WMap t
injectWMap dmb newSpl src = X.mkWMap
[ ( tag
, maybe 0 id $
M.lookup (split (tiers dmb) (simplify dmb tag)) (X.unWMap newSpl) )
| (tag, _) <- M.toList (X.unWMap src) ]
probs :: (X.Word w, Ord t) => CRF.ProbType -> Disamb t -> X.Sent w t -> DAG () (X.WMap t)
probs probTyp dmb = fmap X.tags . probsSent probTyp dmb
probsSent :: (X.Word w, Ord t) => CRF.ProbType -> Disamb t -> X.Sent w t -> X.Sent w t
probsSent probTyp dmb sent
= (\new -> inject dmb new sent)
. fmap getTags
. probsCRF probTyp dmb
$ sent
where
getTags = X.mkWMap . M.toList . choice
choice w = M.unionWith (+)
(CRF.unProb . snd $ w)
(M.fromList . map (,0) . interps $ w)
interps = S.toList . CRF.lbs . fst
probsCRF :: (X.Word w, Ord t) => CRF.ProbType -> Disamb t -> X.Sent w t -> CRF.SentL Ob P.Atom
probsCRF probTyp dmb
= CRF.probs probTyp (crf dmb)
. schematize schema
. X.mapSent (split (tiers dmb) . simplify dmb)
where
schema = fromConf (schemaConf dmb)
prune :: Double -> Disamb t -> Disamb t
prune x dmb =
let crf' = CRF.prune x (crf dmb)
in dmb { crf = crf' }
data TrainConf t = TrainConf
{ tiersT :: [P.Tier]
, schemaConfT :: SchemaConf
, sgdArgsT :: SGD.SgdArgs
, onDiskT :: Bool
, simplifyLabel :: t -> Tag
}
train
:: (X.Word w, Ord t)
=> TrainConf t
-> IO [X.Sent w t]
-> IO [X.Sent w t]
-> IO (Disamb t)
train TrainConf{..} trainData evalData = do
crf <- CRF.train (length tiersT) CRF.selectHidden sgdArgsT onDiskT
(schemed simplifyLabel schema (split tiersT) <$> trainData)
(schemed simplifyLabel schema (split tiersT) <$> evalData)
putStr "\nNumber of features: " >> print (CRF.size crf)
return $ Disamb tiersT schemaConfT crf simplifyLabel
where
schema = fromConf schemaConfT
schemed
:: (Ord a)
=> (t -> Tag)
-> Schema w [a] b
-> (Tag -> [a])
-> [X.Sent w t]
-> [CRF.SentL Ob a]
schemed simpl schema split =
map onSent
where
onSent sent =
let xs = fmap (X.mapSeg split) (X.mapSent simpl sent)
mkProb = CRF.mkProb . M.toList . X.unWMap . X.tags
in DAG.zipE (schematize schema xs) (fmap mkProb xs)
split :: [P.Tier] -> Tag -> [P.Atom]
split tiers tag = P.split tiers (posiTag tag) (Just $ hasEos tag)