module NLP.Concraft.Disamb
( Split
, TrainCRF
, TagCRF
, disamb
, disambSent
, disambDoc
, trainOn
) where
import Control.Applicative ((<$>))
import Data.Maybe (fromJust)
import Data.List (find)
import Data.Foldable (Foldable, foldMap)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Vector as V
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
import qualified Control.Monad.Ox as Ox
import qualified Data.CRF.Chain2.Generic.External as CRF
import NLP.Concraft.Schema
import qualified NLP.Concraft.Morphosyntax as Mx
import qualified NLP.Concraft.Format as F
schematize :: Schema t a -> Mx.Sent t -> CRF.Sent Ob t
schematize schema sent =
[ CRF.mkWord (obs i) (lbs i)
| i <- [0 .. n 1] ]
where
v = V.fromList sent
n = V.length v
obs = S.fromList . Ox.execOx . schema v
lbs i = Mx.interpsSet w
where w = v V.! i
type Split r t = r -> t
unSplit :: Eq t => Split r t -> Mx.Word r -> t -> r
unSplit split' word x = fromJust $ find ((==x) . split') (Mx.interps word)
type TrainCRF o t c
= IO [CRF.SentL o t]
-> Maybe (IO [CRF.SentL o t])
-> IO c
type TagCRF o t = CRF.Sent o t -> [t]
disamb
:: (Ord r, Ord t)
=> Schema t a
-> Split r t
-> TagCRF Ob t
-> Mx.Sent r
-> [r]
disamb schema split tag sent
= map (uncurry embed)
. zip sent
. tag
. schematize schema
. Mx.mapSent split
$ sent
where
embed = unSplit split
disambSent
:: Ord t
=> F.Sent s w
-> Schema t a
-> Split F.Tag t
-> TagCRF Ob t
-> s -> s
disambSent F.Sent{..} schema split tag sent =
flip mergeSent sent
[ select wMap orig
| (wMap, orig) <- zip
(doDmb sent)
(parseSent sent) ]
where
F.Word{..} = wordHandler
doDmb orig =
let xs = map extract (parseSent orig)
in map (uncurry mkChoice) (zip xs (disamb schema split tag xs))
mkChoice word x = Mx.mkWMap
[ if x == y
then (x, 1)
else (y, 0)
| y <- Mx.interps word ]
disambDoc
:: (Functor f, Ord t)
=> F.Doc f s w
-> Schema t a
-> Split F.Tag t
-> TagCRF Ob t
-> L.Text
-> L.Text
disambDoc F.Doc{..} schema split tag =
let onSent = disambSent sentHandler schema split tag
in showDoc . fmap onSent . parseDoc
trainOn
:: (Foldable f, Ord t)
=> F.Doc f s w
-> Schema t a
-> Split F.Tag t
-> TrainCRF Ob t c
-> FilePath
-> Maybe FilePath
-> IO c
trainOn format schema split train trainPath evalPath'Maybe = do
crf <- train
(schemed format schema split trainPath)
(schemed format schema split <$> evalPath'Maybe)
return crf
schemed
:: (Foldable f, Ord t)
=> F.Doc f s w -> Schema t a -> Split F.Tag t
-> FilePath -> IO [CRF.SentL Ob t]
schemed F.Doc{..} schema split path =
foldMap onSent . parseDoc <$> L.readFile path
where
F.Sent{..} = sentHandler
F.Word{..} = wordHandler
onSent sent =
[zip (schematize schema xs) (map mkDist xs)]
where
xs = map (Mx.mapWord split . extract) (parseSent sent)
mkDist = CRF.mkDist . M.toList . Mx.unWMap . Mx.tagWMap