module NLP.Concraft.Polish
(
C.Concraft
, C.saveModel
, C.loadModel
, tag
, marginals
, macaPar
, TrainConf (..)
, train
, C.prune
) where
import Control.Applicative ((<$>))
import qualified Data.Text.Lazy as L
import qualified Data.Set as S
import qualified Data.Tagset.Positional as P
import qualified Numeric.SGD as SGD
import qualified NLP.Concraft.Morphosyntax as X
import qualified NLP.Concraft.Schema as S
import NLP.Concraft.Schema (SchemaConf(..), entry, entryWith)
import qualified NLP.Concraft.Guess as G
import qualified NLP.Concraft.Disamb as D
import qualified NLP.Concraft as C
import NLP.Concraft.Polish.Morphosyntax hiding (tag)
import NLP.Concraft.Polish.Maca
guessSchemaDefault :: SchemaConf
guessSchemaDefault = S.nullConf
{ lowPrefixesC = entryWith [1, 2] [0]
, lowSuffixesC = entryWith [1, 2] [0]
, knownC = entry [0]
, begPackedC = entry [0] }
disambSchemaDefault :: SchemaConf
disambSchemaDefault = S.nullConf
{ lowOrthC = entry [2, 1, 0, 1]
, lowPrefixesC = oov $ entryWith [1, 2, 3] [0]
, lowSuffixesC = oov $ entryWith [1, 2, 3] [0]
, begPackedC = oov $ entry [0] }
where
oov (Just body) = Just $ body { S.oovOnly = True }
oov Nothing = Nothing
tiersDefault :: [D.Tier]
tiersDefault =
[tier1, tier2]
where
tier1 = D.Tier True $ S.fromList ["cas", "per"]
tier2 = D.Tier False $ S.fromList
[ "nmb", "gnd", "deg", "asp" , "ngt", "acm"
, "acn", "ppr", "agg", "vlc", "dot" ]
tag :: C.Concraft -> Sent Tag -> Sent Tag
tag concraft sent =
[ select' gs t seg
| (seg, gs, t) <- zip3 sent gss ts ]
where
tagset = C.tagset concraft
packed = packSent tagset sent
tagged = C.tag concraft packed
gss = map (map showTag . S.toList . fst) tagged
ts = map (showTag . snd) tagged
showTag = P.showTag tagset
marginals :: C.Concraft -> Sent Tag -> Sent Tag
marginals concraft sent
= map (uncurry selectWMap)
$ zip wmaps sent
where
tagset = C.tagset concraft
packed = packSent tagset sent
wmaps = map
(X.mapWMap showTag)
(C.marginals concraft packed)
showTag = P.showTag tagset
data TrainConf = TrainConf {
tagset :: P.Tagset
, sgdArgs :: SGD.SgdArgs
, reana :: Bool
, onDisk :: Bool
, guessNum :: Int
, r0 :: G.R0T }
train
:: TrainConf
-> IO [SentO Tag]
-> IO [SentO Tag]
-> IO C.Concraft
train TrainConf{..} train0 eval0 = do
pool <- newMacaPool 1
let ana = anaSent tagset pool
train1 = map (packSentO tagset) <$> train0
eval1 = map (packSentO tagset) <$> eval0
if reana
then doReana ana train1 eval1
else noReana train1 eval1
where
doReana ana = C.reAnaTrain tagset ana guessNum guessConf disambConf
noReana tr ev = C.train tagset guessNum guessConf disambConf
(map X.segs <$> tr) (map X.segs <$> ev)
guessConf = G.TrainConf guessSchemaDefault sgdArgs onDisk r0
disambConf = D.TrainConf tiersDefault disambSchemaDefault sgdArgs onDisk
anaSent :: P.Tagset -> MacaPool -> L.Text -> IO (X.Sent Word P.Tag)
anaSent tagset pool
= fmap (packSent tagset . concat)
. macaPar pool . L.toStrict