module NLP.Concraft.Disamb
( Tier (..)
, Tag (..)
, select
, splitWord
, splitSent
, Ox
, Schema
, Ob
, schema
, schematize
, TierConf
, tear
, deTear
, deTears
, Disamb
, disamb
, tagFile
, learn
) where
import Control.Applicative ((<$>), (<*>))
import Data.Maybe (fromJust)
import Data.List (find)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Vector as V
import Data.Binary (Binary, get, put)
import Data.Text.Binary ()
import qualified Control.Monad.Ox as Ox
import qualified Data.CRF.Chain2.Pair as CRF
import qualified Numeric.SGD as SGD
import qualified Data.Tagset.Positional as TP
import NLP.Concraft.Morphosyntax
import qualified NLP.Concraft.Plain as P
data Tier = Tier {
withPos :: Bool
, withAtts :: S.Set TP.Attr }
instance Binary Tier where
put Tier{..} = put withPos >> put withAtts
get = Tier <$> get <*> get
data Tag = Tag
{ pos :: Maybe TP.POS
, atts :: M.Map TP.Attr T.Text }
deriving (Show, Eq, Ord)
instance Binary Tag where
put Tag{..} = put pos >> put atts
get = Tag <$> get <*> get
select :: Tier -> TP.Tag -> Tag
select Tier{..} tag = Tag
{ pos = if withPos then Just (TP.pos tag) else Nothing
, atts = M.filterWithKey (\k _ -> k `S.member` withAtts) (TP.atts tag) }
type Ox t a = Ox.Ox (Word t) T.Text a
type Schema t a = V.Vector (Word t) -> Int -> Ox t a
type Ob = ([Int], T.Text)
schema :: Schema t ()
schema sent = \k -> do
mapM_ (Ox.save . lowOrth) [k 1, k, k + 1]
where
at = Ox.atWith sent
lowOrth i = T.toLower <$> orth `at` i
schematize :: Sent t -> CRF.Sent Ob t
schematize sent =
[ CRF.mkWord (obs i) (lbs i)
| i <- [0 .. n 1] ]
where
v = V.fromList sent
n = V.length v
obs = S.fromList . Ox.execOx . schema v
lbs = tags . (v V.!)
type TierConf = (Tier, Tier)
tear :: TierConf -> TP.Tag -> (Tag, Tag)
tear (t1, t2) = (,) <$> select t1 <*> select t2
splitWord :: TierConf -> Word TP.Tag -> Word (Tag, Tag)
splitWord cfg = mapWord (tear cfg)
splitSent :: TierConf -> Sent TP.Tag -> Sent (Tag, Tag)
splitSent ts = map (splitWord ts)
data Disamb = Disamb
{ crf :: CRF.CRF Ob Tag Tag
, tagset :: TP.Tagset
, tierConf :: TierConf }
instance Binary Disamb where
put Disamb{..} = put crf >> put tagset >> put tierConf
get = Disamb <$> get <*> get <*> get
disamb :: Disamb -> Sent TP.Tag -> [TP.Tag]
disamb Disamb{..} sent
= deTears tierConf sent
. CRF.tag crf
. schematize
. splitSent tierConf
$ sent
deTears :: TierConf -> Sent TP.Tag -> [(Tag, Tag)] -> [TP.Tag]
deTears cfg sent tiered =
[ deTear cfg word pair
| (word, pair) <- zip sent tiered ]
deTear :: TierConf -> Word TP.Tag -> (Tag, Tag) -> TP.Tag
deTear cfg word tiered =
fromJust $ find
((==tiered) . tear cfg)
(S.toList $ tags word)
tagFile
:: T.Text
-> Disamb
-> FilePath
-> IO L.Text
tagFile ign dmb path =
P.showPlain ign . map onSent <$> P.readPlain ign path
where
onSent sent =
[ choose tok y
| (tok, y) <- zip sent ys ]
where
rs = map (fst . P.fromTok) sent
xs = map (mapWord parseTag) rs
ys = map showTag (disamb dmb xs)
choose tok y = P.choose tok (S.singleton y)
parseTag = TP.parseTag (tagset dmb)
showTag = TP.showTag (tagset dmb)
learn
:: SGD.SgdArgs
-> FilePath
-> T.Text
-> TierConf
-> FilePath
-> Maybe FilePath
-> IO Disamb
learn sgdArgs tagsetPath ign tierConf trainPath evalPath'Maybe = do
_tagset <- TP.parseTagset tagsetPath <$> readFile tagsetPath
_crf <- CRF.train sgdArgs
(schemed _tagset ign tierConf trainPath)
(schemed _tagset ign tierConf <$> evalPath'Maybe)
return $ Disamb _crf _tagset tierConf
schemed
:: TP.Tagset -> T.Text -> TierConf
-> FilePath -> IO [CRF.SentL Ob (Tag, Tag)]
schemed tagset _ign cfg =
fmap (map onSent) . P.readPlain _ign
where
onSent sent =
zip (schematize xs') (map mkDist ys')
where
(xs, ys) = unzip (map P.fromTok sent)
xs' = map (mapWord smash) xs
ys' = map (mapChoice smash) ys
smash = tear cfg . parseTag
parseTag = TP.parseTag tagset
mkDist = CRF.mkDist . M.toList . M.map unPositive