module NLP.Concraft.Polish
(
C.Concraft
, C.saveModel
, C.loadModel
, tag
, tag'
, tagSent
, TrainConf (..)
, train
) where
import qualified Control.Monad.LazyIO as LazyIO
import Control.Applicative ((<$>))
import qualified Data.List.Split as Split
import qualified Data.Char as Char
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Set as S
import qualified Data.Tagset.Positional as P
import qualified Numeric.SGD as SGD
import qualified NLP.Concraft.Morphosyntax as X
import qualified NLP.Concraft.Schema as S
import NLP.Concraft.Schema (SchemaConf(..), entry, entryWith)
import qualified NLP.Concraft.Guess as G
import qualified NLP.Concraft.Disamb as D
import qualified NLP.Concraft as C
import NLP.Concraft.Polish.Morphosyntax hiding (tag)
import NLP.Concraft.Polish.Maca
guessSchemaDefault :: SchemaConf
guessSchemaDefault = S.nullConf
{ lowPrefixesC = entryWith [1, 2] [0]
, lowSuffixesC = entryWith [1, 2] [0]
, knownC = entry [0]
, begPackedC = entry [0] }
disambSchemaDefault :: SchemaConf
disambSchemaDefault = S.nullConf
{ lowOrthC = entry [2, 1, 0, 1]
, lowPrefixesC = oov $ entryWith [1, 2, 3] [0]
, lowSuffixesC = oov $ entryWith [1, 2, 3] [0]
, begPackedC = oov $ entry [0] }
where
oov (Just body) = Just $ body { S.oovOnly = True }
oov Nothing = Nothing
tiersDefault :: [D.Tier]
tiersDefault =
[tier1, tier2]
where
tier1 = D.Tier True $ S.fromList ["cas", "per"]
tier2 = D.Tier False $ S.fromList
[ "nmb", "gnd", "deg", "asp" , "ngt", "acm"
, "acn", "ppr", "agg", "vlc", "dot" ]
tag :: MacaPool -> C.Concraft -> T.Text -> IO [Sent Tag]
tag pool concraft inp = map (tagSent concraft) <$> macaPar pool inp
tag' :: MacaPool -> C.Concraft -> L.Text -> IO [[Sent Tag]]
tag' pool concraft
= LazyIO.mapM (tag pool concraft . L.toStrict)
. map L.unlines
. Split.splitWhen
(L.all Char.isSpace)
. L.lines
tagSent :: C.Concraft -> Sent Tag -> Sent Tag
tagSent concraft sent =
let tagset = C.tagset concraft
packed = packSent tagset sent
tags = map (P.showTag tagset) (C.tag concraft packed)
in map (uncurry select) (zip tags sent)
data TrainConf = TrainConf {
tagset :: P.Tagset
, sgdArgs :: SGD.SgdArgs
, reana :: Bool
, onDisk :: Bool
, guessNum :: Int
, prune :: Maybe Double
, r0 :: G.R0T }
train
:: TrainConf
-> IO [SentO Tag]
-> IO [SentO Tag]
-> IO C.Concraft
train TrainConf{..} train0 eval0 = do
pool <- newMacaPool 1
let ana = fmap (packSent tagset . concat) . macaPar pool . L.toStrict
train1 = map (packSentO tagset) <$> train0
eval1 = map (packSentO tagset) <$> eval0
if reana
then doReana ana train1 eval1
else noReana train1 eval1
where
guessConf = G.TrainConf guessSchemaDefault sgdArgs onDisk r0
disambConf = D.TrainConf tiersDefault disambSchemaDefault
sgdArgs onDisk prune
doReana ana = C.reAnaTrain tagset ana guessNum guessConf disambConf
noReana tr ev = C.train tagset guessNum guessConf disambConf
(map X.segs <$> tr) (map X.segs <$> ev)