{-# LANGUAGE RecordWildCards #-} module NLP.Concraft ( GuessConf (..) , GuessData (..) , DisambConf (..) , DisambWith (..) , DisambTag , DisambTrain , disamb , disambDoc , trainOn ) where -- import Data.Binary (Binary, put, get) -- import qualified Data.Text as T import System.IO (hClose) import Data.Foldable (Foldable) import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.IO as L import qualified Numeric.SGD as SGD import qualified System.IO.Temp as Temp import NLP.Concraft.Schema import qualified NLP.Concraft.Morphosyntax as Mx import qualified NLP.Concraft.Format as F import qualified NLP.Concraft.Guess as G import qualified NLP.Concraft.Disamb as D -- | Guessing configuration. data GuessConf r = GuessConf { guessNum :: Int , guessSchema :: Schema r () } -- | Guessing configuration and model data. data GuessData r = GuessData { guessConf :: GuessConf r , guesser :: G.Guesser r } -- | Disambiguation configuration. data DisambConf r t = DisambConf { split :: D.Split r t , disambSchema :: Schema t () } -- | Disambiguation configuration with... data DisambWith r t a = DisambWith { disambConf :: DisambConf r t , disambWith :: a } -- | Tagging with disambiguation configuration. type DisambTag r t = DisambWith r t (D.TagCRF Ob t) -- | Training disambiguation model configuration. type DisambTrain r t c = DisambWith r t (D.TrainCRF Ob t c) -- | Perform disambiguation preceded by context-sensitive guessing. disamb :: (Ord r, Ord t) => GuessData r -- ^ Guessing configuration -> DisambTag r t -- ^ Disambiguation configuration -> Mx.Sent r -- ^ Input -> [r] -- ^ Output disamb GuessData{..} DisambWith{..} sent = D.disamb disambSchema split tagCRF . G.include sent . G.guess guessNum guessSchema guesser $ sent where GuessConf{..} = guessConf DisambConf{..} = disambConf tagCRF = disambWith -- | Tag the sentence. disambSent :: Ord t => F.Sent s w -> GuessData F.Tag -> DisambTag F.Tag t -> s -> s disambSent sentH GuessData{..} DisambWith{..} = D.disambSent sentH disambSchema split tagCRF . G.guessSent sentH guessNum guessSchema guesser where GuessConf{..} = guessConf DisambConf{..} = disambConf tagCRF = disambWith -- | Tag document. disambDoc :: (Functor f, Ord t) => F.Doc f s w -- ^ Document format handler -> GuessData F.Tag -- ^ Guessing configuration -> DisambTag F.Tag t -- ^ Disambiguation configuration -> L.Text -- ^ Input -> L.Text -- ^ Output disambDoc F.Doc{..} guessData disambTag = let onSent = disambSent sentHandler guessData disambTag in showDoc . fmap onSent . parseDoc -- | Train guessing and disambiguation models. trainOn :: (Functor f, Foldable f, Ord t) => F.Doc f s w -- ^ Document format handler -> GuessConf F.Tag -- ^ Guessing configuration -> SGD.SgdArgs -- ^ SGD params for guesser -> DisambTrain F.Tag t c -- ^ Disambiguation configuration -> FilePath -- ^ Training file -> Maybe FilePath -- ^ Maybe eval file -> IO (G.Guesser F.Tag, c) -- ^ Resultant models trainOn format guessConf@GuessConf{..} sgdArgs DisambWith{..} trainPath evalPath'Maybe = do putStrLn "\n===== Train guessing model ====\n" guesser <- G.trainOn format guessSchema sgdArgs trainPath evalPath'Maybe let guessData = GuessData guessConf guesser let withGuesser = guessFile format guessData withGuesser "train" (Just trainPath) $ \(Just trainPathG) -> withGuesser "eval" evalPath'Maybe $ \evalPathG'Maybe -> do putStrLn "\n===== Train disambiguation model ====\n" let DisambConf{..} = disambConf let trainCRF = disambWith disambCRF <- D.trainOn format disambSchema split trainCRF trainPathG evalPathG'Maybe return (guesser, disambCRF) guessFile :: Functor f => F.Doc f s w -- ^ Document format handler -> GuessData F.Tag -- ^ Guesser -> String -- ^ Template for temporary file name -> Maybe FilePath -- ^ File to guess -> (Maybe FilePath -> IO a) -- ^ Handler -> IO a guessFile _ _ _ Nothing handler = handler Nothing guessFile format GuessData{..} tmpl (Just path) handler = Temp.withTempFile "." tmpl $ \tmpPath tmpHandle -> do inp <- L.readFile path let GuessConf{..} = guessConf let out = G.guessDoc format guessNum guessSchema guesser inp hClose tmpHandle L.writeFile tmpPath out handler (Just tmpPath)