{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE TupleSections #-}
module NLP.Concraft.DAG.DisambSeg
(
Tag (..)
, Disamb (..)
, putDisamb
, getDisamb
, P.Tier (..)
, P.Atom (..)
, disamb
, CRF.ProbType (..)
, probsSent
, probs
, TrainConf (..)
, train
, prune
) where
import Prelude hiding (words)
import Control.Applicative ((<$>), (<*>), pure)
import Control.Monad (guard)
import Data.Binary (put, get, Put, Get)
import Data.Maybe (maybeToList)
import Data.Text.Binary ()
import qualified Data.Set as S
import qualified Data.Map.Strict as M
import qualified Data.DAG as DAG
import Data.DAG (DAG)
import qualified Numeric.SGD.Momentum as SGD
import qualified Data.CRF.Chain2.Tiers.DAG as CRF
import qualified Data.Tagset.Positional as T
import qualified NLP.Concraft.Disamb.Positional as P
import NLP.Concraft.DAG.Schema hiding (schematize)
import qualified NLP.Concraft.DAG.Morphosyntax as X
import NLP.Concraft.DAG.Disamb (schematize)
data Tag = Tag
{ posiTag :: T.Tag
, hasEos :: Bool
} deriving (Show, Eq, Ord)
data Disamb t = Disamb
{ tiers :: [P.Tier]
, schemaConf :: SchemaConf
, crf :: CRF.CRF Ob P.Atom
, simplify :: t -> Tag
}
putDisamb :: Disamb t -> Put
putDisamb Disamb{..} =
put tiers >> put schemaConf >> put crf
getDisamb :: (t -> Tag) -> Get (Disamb t)
getDisamb smp =
Disamb <$> get <*> get <*> get <*> pure smp
inject
:: (Ord t, X.Word w)
=> Disamb t
-> DAG () (X.WMap [P.Atom])
-> X.Sent w t
-> X.Sent w t
inject dmb newSent srcSent =
let doit (target, src) =
let oldTags = X.tags src
newTags = injectWMap dmb target oldTags
in src {X.tags = newTags}
in fmap doit (DAG.zipE newSent srcSent)
injectWMap
:: (Ord t)
=> Disamb t
-> X.WMap [P.Atom]
-> X.WMap t
-> X.WMap t
injectWMap dmb newSpl src = X.mkWMap
[ ( tag
, maybe 0 id $
M.lookup (split (tiers dmb) (simplify dmb tag)) (X.unWMap newSpl) )
| (tag, _) <- M.toList (X.unWMap src) ]
probs :: (X.Word w, Ord t) => CRF.ProbType -> Disamb t -> X.Sent w t -> DAG () (X.WMap t)
probs probTyp dmb = fmap X.tags . probsSent probTyp dmb
probsSent :: (X.Word w, Ord t) => CRF.ProbType -> Disamb t -> X.Sent w t -> X.Sent w t
probsSent probTyp dmb sent
= (\new -> inject dmb new sent)
. fmap getTags
. probsCRF probTyp dmb
$ sent
where
getTags = X.fromMap . choice
choice w = M.unionWith (+)
(CRF.unProb . snd $ w)
(M.fromSet (const 0) . interps $ w)
interps = CRF.lbs . fst
probsCRF ::
(X.Word w, Ord t)
=> CRF.ProbType
-> Disamb t
-> X.Sent w t
-> CRF.SentL Ob P.Atom
probsCRF probTyp dmb
= CRF.probs probTyp (crf dmb)
. schematize schema
. X.mapSent (split (tiers dmb) . simplify dmb)
where
schema = fromConf (schemaConf dmb)
disamb :: (X.Word w, Ord t) => Disamb t -> X.Sent w t -> DAG () (M.Map t Bool)
disamb dmb srcSent
= injectDmb
. disambCRF dmb
$ srcSent
where
injectDmb newSent =
let doit (target, src) = M.fromList $ do
tag <- X.interps src
let tag' = split (tiers dmb) (simplify dmb tag)
isDmb = Just tag' == target
return (tag, isDmb)
in fmap doit (DAG.zipE newSent srcSent)
disambCRF ::
(X.Word w, Ord t)
=> Disamb t
-> X.Sent w t
-> DAG () (Maybe [P.Atom])
disambCRF dmb
= CRF.tag (crf dmb)
. schematize schema
. X.mapSent (split (tiers dmb) . simplify dmb)
where
schema = fromConf (schemaConf dmb)
prune :: Double -> Disamb t -> Disamb t
prune x dmb =
let crf' = CRF.prune x (crf dmb)
in dmb { crf = crf' }
data TrainConf t = TrainConf
{ tiersT :: [P.Tier]
, schemaConfT :: SchemaConf
, sgdArgsT :: SGD.SgdArgs
, onDiskT :: Bool
, simplifyLabel :: t -> Tag
}
train
:: (X.Word w, Ord t)
=> TrainConf t
-> IO [X.Sent w t]
-> IO [X.Sent w t]
-> IO (Disamb t)
train TrainConf{..} trainData evalData = do
crf <- CRF.train (length tiersT) CRF.selectHidden sgdArgsT onDiskT
(schemed simplifyLabel schema (split tiersT) <$> trainData)
(schemed simplifyLabel schema (split tiersT) <$> evalData)
putStr "\nNumber of features: " >> print (CRF.size crf)
return $ Disamb tiersT schemaConfT crf simplifyLabel
where
schema = fromConf schemaConfT
schemed
:: (Ord a)
=> (t -> Tag)
-> Schema w [a] b
-> (Tag -> [a])
-> [X.Sent w t]
-> [CRF.SentL Ob a]
schemed simpl schema splitIt =
map onSent
where
onSent sent =
let xs = fmap (X.mapSeg splitIt) (X.mapSent simpl sent)
mkProb = CRF.mkProb . M.toList . X.unWMap . X.tags
in DAG.zipE (schematize schema xs) (fmap mkProb xs)
split :: [P.Tier] -> Tag -> [P.Atom]
split tiers tag = P.split tiers (posiTag tag) (Just $ hasEos tag)