NLP/Concraft/Disamb.hs

{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE MultiParamTypeClasses #-}

module NLP.Concraft.Disamb
( Split
, TrainCRF
, TagCRF
, disamb
, disambSent
, disambDoc
, trainOn
) where

import Control.Applicative ((<$>))
import Data.Maybe (fromJust)
import Data.List (find)
import Data.Foldable (Foldable, foldMap)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Vector as V
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L

import qualified Control.Monad.Ox as Ox
import qualified Data.CRF.Chain2.Generic.External as CRF

import NLP.Concraft.Schema
import qualified NLP.Concraft.Morphosyntax as Mx
import qualified NLP.Concraft.Format as F

-- | Schematize the input sentence with according to 'schema' rules.
schematize :: Schema t a -> Mx.Sent t -> CRF.Sent Ob t
schematize schema sent =
    [ CRF.mkWord (obs i) (lbs i)
    | i <- [0 .. n - 1] ]
  where
    v = V.fromList sent
    n = V.length v
    obs = S.fromList . Ox.execOx . schema v
    lbs i = Mx.interpsSet w
        where w = v V.! i

-- | Split is just a function from an original tag form
-- to a complex tag form.
type Split r t = r -> t

-- | Unsplit the complex tag (assuming, that it is one
-- of the interpretations of the word).
unSplit :: Eq t => Split r t -> Mx.Word r -> t -> r
unSplit split' word x = fromJust $ find ((==x) . split') (Mx.interps word)

-- | CRF training function.
type TrainCRF o t c
    =  IO [CRF.SentL o t]           -- ^ Training data 'IO' action
    -> Maybe (IO [CRF.SentL o t])   -- ^ Maybe evalation data
    -> IO c                         -- ^ Resulting model

-- | CRF tagging function.
type TagCRF o t = CRF.Sent o t -> [t]

-- | Perform context-sensitive disambiguation.
disamb
    :: (Ord r, Ord t)
    => Schema t a
    -> Split r t
    -> TagCRF Ob t
    -> Mx.Sent r
    -> [r]
disamb schema split tag sent
    = map (uncurry embed)
    . zip sent
    . tag
    . schematize schema
    . Mx.mapSent split
    $ sent
  where
    embed = unSplit split

-- | Tag the sentence.
disambSent
    :: Ord t
    => F.Sent s w
    -> Schema t a
    -> Split F.Tag t
    -> TagCRF Ob t
    -> s -> s
disambSent F.Sent{..} schema split tag sent =
  flip mergeSent sent
    [ select wMap orig
    | (wMap, orig) <- zip
        (doDmb sent)
        (parseSent sent) ]
  where
    F.Word{..} = wordHandler
    doDmb orig =
        let xs = map extract (parseSent orig)
        in  map (uncurry mkChoice) (zip xs (disamb schema split tag xs))
    mkChoice word x = Mx.mkWMap
        [ if x == y
            then (x, 1)
            else (y, 0)
        | y <- Mx.interps word ]

-- | Disambiguate document.
disambDoc
    :: (Functor f, Ord t)
    => F.Doc f s w      -- ^ Document format handler
    -> Schema t a       -- ^ Observation schema
    -> Split F.Tag t    -- ^ Tiered tagging
    -> TagCRF Ob t      -- ^ CRF tagging function
    -> L.Text           -- ^ Input
    -> L.Text           -- ^ Output
disambDoc F.Doc{..} schema split tag =
    let onSent = disambSent sentHandler schema split tag
    in  showDoc . fmap onSent . parseDoc

-- | Train disamb model.
trainOn
    :: (Foldable f, Ord t)
    => F.Doc f s w      -- ^ Document format handler
    -> Schema t a       -- ^ Observation schema
    -> Split F.Tag t    -- ^ Tiered tagging
    -> TrainCRF Ob t c  -- ^ CRF training function
    -> FilePath         -- ^ Training file
    -> Maybe FilePath   -- ^ Maybe eval file
    -> IO c             -- ^ Resultant model data
trainOn format schema split train trainPath evalPath'Maybe = do
    crf <- train
        (schemed format schema split trainPath)
        (schemed format schema split <$> evalPath'Maybe)
    return crf

-- | Schematized data from the plain file.
schemed
    :: (Foldable f, Ord t)
    => F.Doc f s w -> Schema t a -> Split F.Tag t
    -> FilePath -> IO [CRF.SentL Ob t]
schemed F.Doc{..} schema split path =
    foldMap onSent . parseDoc <$> L.readFile path
  where
    F.Sent{..} = sentHandler
    F.Word{..} = wordHandler
    onSent sent =
        [zip (schematize schema xs) (map mkDist xs)]
      where
        xs  = map (Mx.mapWord split . extract) (parseSent sent)
        mkDist = CRF.mkDist . M.toList . Mx.unWMap . Mx.tagWMap