module NLP.Concraft.Guess
( Guesser (..)
, guess
, include
, guessSent
, guessDoc
, trainOn
) where
import Prelude hiding (words)
import Control.Applicative ((<$>))
import Data.Binary (Binary)
import Data.Foldable (Foldable, foldMap)
import Data.Text.Binary ()
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
import qualified Data.Vector as V
import qualified Control.Monad.Ox as Ox
import qualified Data.CRF.Chain1.Constrained as CRF
import qualified Numeric.SGD as SGD
import NLP.Concraft.Schema
import qualified NLP.Concraft.Morphosyntax as Mx
import qualified NLP.Concraft.Format as F
schematize :: Ord t => Schema t a -> Mx.Sent t -> CRF.Sent Ob t
schematize schema sent =
[ CRF.Word (obs i) (lbs i)
| i <- [0 .. n 1] ]
where
v = V.fromList sent
n = V.length v
obs = S.fromList . Ox.execOx . schema v
lbs i
| Mx.oov w = S.empty
| otherwise = Mx.interpsSet w
where w = v V.! i
newtype Guesser t = Guesser { crf :: CRF.CRF Ob t }
deriving (Binary)
guess :: Ord t => Int -> Schema t a -> Guesser t -> Mx.Sent t -> [[t]]
guess k schema gsr sent = CRF.tagK k (crf gsr) (schematize schema sent)
includeWMaps :: Ord t => Mx.Sent t -> [[t]] -> [Mx.WMap t]
includeWMaps words guessed =
[ if Mx.oov word
then addInterps (Mx.tagWMap word) xs
else Mx.tagWMap word
| (xs, word) <- zip guessed words ]
where
addInterps wm xs = Mx.mkWMap
$ M.toList (Mx.unWMap wm)
++ zip xs [0, 0 ..]
include :: Ord t => Mx.Sent t -> [[t]] -> Mx.Sent t
include words guessed =
[ word { Mx.tagWMap = wMap }
| (word, wMap) <- zip words wMaps ]
where
wMaps = includeWMaps words guessed
guessSent :: F.Sent s w -> Int -> Schema F.Tag a -> Guesser F.Tag -> s -> s
guessSent F.Sent{..} k schema gsr sent = flip mergeSent sent
[ select wMap word
| (wMap, word) <- zip wMaps (parseSent sent) ]
where
F.Word{..} = wordHandler
words = map extract (parseSent sent)
guessed = guess k schema gsr words
wMaps = includeWMaps words guessed
guessDoc
:: Functor f
=> F.Doc f s w
-> Int
-> Schema F.Tag a
-> Guesser F.Tag
-> L.Text
-> L.Text
guessDoc F.Doc{..} k schema gsr
= showDoc
. fmap (guessSent sentHandler k schema gsr)
. parseDoc
trainOn
:: Foldable f
=> F.Doc f s w
-> Schema F.Tag a
-> SGD.SgdArgs
-> FilePath
-> Maybe FilePath
-> IO (Guesser F.Tag)
trainOn format schema sgdArgs trainPath evalPath'Maybe = do
_crf <- CRF.train sgdArgs
(schemed format schema trainPath)
(schemed format schema <$> evalPath'Maybe)
(const CRF.presentFeats)
return $ Guesser _crf
schemed
:: Foldable f => F.Doc f s w -> Schema F.Tag a
-> FilePath -> IO [CRF.SentL Ob F.Tag]
schemed F.Doc{..} schema path =
foldMap onSent . parseDoc <$> L.readFile path
where
F.Sent{..} = sentHandler
F.Word{..} = wordHandler
onSent sent =
let xs = map extract (parseSent sent)
mkProb = CRF.mkProb . M.toList . Mx.unWMap . Mx.tagWMap
in [zip (schematize schema xs) (map mkProb xs)]