{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE DeriveDataTypeable #-} module NLP.Concraft.Guess ( -- * Types Guesser (..) -- * Guessing , guess , include , guessSent -- * Training , TrainConf (..) , R0T (..) , train ) where import Prelude hiding (words) import Control.Applicative ((<$>), (<*>)) import Data.Binary (Binary, put, get) import Data.Text.Binary () import System.Console.CmdArgs import qualified Data.Set as S import qualified Data.Map as M import qualified Data.Vector as V import qualified Control.Monad.Ox as Ox import qualified Data.CRF.Chain1.Constrained as CRF import qualified Numeric.SGD as SGD import NLP.Concraft.Schema hiding (schematize) import qualified NLP.Concraft.Morphosyntax as X -- | A guessing model. data Guesser t = Guesser { schemaConf :: SchemaConf , crf :: CRF.CRF Ob t } instance (Ord t, Binary t) => Binary (Guesser t) where put Guesser{..} = put schemaConf >> put crf get = Guesser <$> get <*> get -- | Schematize the input sentence with according to 'schema' rules. schematize :: (X.Word w, Ord t) => Schema w t a -> X.Sent w t -> CRF.Sent Ob t schematize schema sent = [ CRF.Word (obs i) (lbs i) | i <- [0 .. n - 1] ] where v = V.fromList sent n = V.length v obs = S.fromList . Ox.execOx . schema v lbs i | X.oov w = S.empty | otherwise = X.interpsSet w where w = v V.! i -- | Determine the 'k' most probable labels for each word in the sentence. -- TODO: Perhaps it would be better to use sets instead of lists -- as output? guess :: (X.Word w, Ord t) => Int -> Guesser t -> X.Sent w t -> [[t]] guess k gsr sent = let schema = fromConf (schemaConf gsr) in CRF.tagK k (crf gsr) (schematize schema sent) -- | Insert guessing results into the sentence. Only interpretations -- of OOV words will be extended. include :: (X.Word w, Ord t) => [[t]] -> X.Sent w t -> X.Sent w t include xss sent = [ word { X.tags = tags } | (word, tags) <- zip sent sentTags ] where sentTags = [ if X.oov word then addInterps (X.tags word) xs else X.tags word | (xs, word) <- zip xss sent ] addInterps wm xs = X.mkWMap $ M.toList (X.unWMap wm) ++ zip xs [0, 0 ..] -- | Combine `guess` with `include`. guessSent :: (X.Word w, Ord t) => Int -> Guesser t -> X.Sent w t -> X.Sent w t guessSent guessNum guesser sent = include (guess guessNum guesser sent) sent -- | Method of constructing the default set of labels (R0). data R0T = AnyInterps -- ^ See `CRF.anyInterps` | AnyChosen -- ^ See `CRF.anyChosen` | OovChosen -- ^ See `CRF.oovChosen` deriving (Show, Eq, Ord, Enum, Typeable, Data) -- | Training configuration. data TrainConf = TrainConf { schemaConfT :: SchemaConf -- | SGD parameters. , sgdArgsT :: SGD.SgdArgs -- | Store SGD dataset on disk , onDiskT :: Bool -- | R0 construction method , r0T :: R0T } -- | Train guesser. train :: (X.Word w, Ord t) => TrainConf -- ^ Training configuration -> IO [X.Sent w t] -- ^ Training data -> IO [X.Sent w t] -- ^ Evaluation data -> IO (Guesser t) train TrainConf{..} trainData evalData = do let schema = fromConf schemaConfT mkR0 = case r0T of AnyInterps -> CRF.anyInterps AnyChosen -> CRF.anyChosen OovChosen -> CRF.oovChosen crf <- CRF.train sgdArgsT onDiskT mkR0 (const CRF.presentFeats) (schemed schema <$> trainData) (schemed schema <$> evalData) return $ Guesser schemaConfT crf -- | Schematized dataset. schemed :: (X.Word w, Ord t) => Schema w t a -> [X.Sent w t] -> [CRF.SentL Ob t] schemed schema = map onSent where onSent xs = let mkProb = CRF.mkProb . M.toList . X.unWMap . X.tags in map (uncurry CRF.mkWordL) $ zip (schematize schema xs) (map mkProb xs)