module NLP.Nerf.Schema
(
Ox
, Schema
, void
, sequenceS_
, schematize
, SchemaCfg (..)
, defaultCfg
, fromCfg
, Block
, fromBlock
, orthS
, lemmaS
, shapeS
, shapePairS
, suffixS
, searchS
) where
import Control.Applicative ((<$>), (<*>))
import Control.Monad (forM_, join)
import Data.Maybe (maybeToList)
import Data.Binary (Binary, put, get, decodeFile)
import qualified Data.Char as C
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Vector as V
import qualified Data.Text as T
import qualified Data.CRF.Chain1 as CRF
import qualified Control.Monad.Ox as Ox
import qualified Control.Monad.Ox.Text as Ox
import NLP.Nerf.Types
import qualified NLP.Nerf.Dict as Dict
type Ox a = Ox.Ox Word T.Text a
type Schema a = V.Vector Word -> Int -> Ox a
void :: a -> Schema a
void x _ _ = return x
sequenceS_ :: [Schema a] -> Schema ()
sequenceS_ xs sent =
let ys = map ($sent) xs
in \k -> sequence_ (map ($k) ys)
data BaseOb = BaseOb
{ orth :: Int -> Maybe T.Text
, lowOrth :: Int -> Maybe T.Text }
mkBaseOb :: V.Vector Word -> BaseOb
mkBaseOb sent = BaseOb
{ orth = _orth
, lowOrth = _lowOrth }
where
at = Ox.atWith sent
_orth = (id `at`)
_lowOrth i = T.toLower <$> _orth i
type Block a = V.Vector Word -> [Int] -> Ox a
fromBlock :: Block a -> [Int] -> Schema a
fromBlock blk xs sent =
let blkSent = blk sent
in \k -> blkSent [x + k | x <- xs]
orthS :: Block ()
orthS sent = \ks -> do
mapM_ (Ox.save . lowOrth) ks
mapM_ (Ox.save . upOnlyOrth) ks
where
BaseOb{..} = mkBaseOb sent
upOnlyOrth i = orth i >>= \x -> case T.any C.isUpper x of
True -> Just x
False -> Nothing
lemmaS :: Block ()
lemmaS sent = \ks -> do
mapM_ lowLemma ks
where
BaseOb{..} = mkBaseOb sent
lowPrefix i j = Ox.prefix j =<< lowOrth i
lowSuffix i j = Ox.suffix j =<< lowOrth i
lowLemma i = Ox.group $ do
mapM_ (Ox.save . lowPrefix i) [0, 1, 2, 3]
mapM_ (Ox.save . lowSuffix i) [0, 1, 2, 3]
shapeS :: Block ()
shapeS sent = \ks -> do
mapM_ (Ox.save . shape) ks
mapM_ (Ox.save . shapeP) ks
where
BaseOb{..} = mkBaseOb sent
shape i = Ox.shape <$> orth i
shapeP i = Ox.pack <$> shape i
shapePairS :: Block ()
shapePairS sent = \ks ->
forM_ ks $ \i -> do
Ox.save $ link <$> shape i <*> shape (i 1)
Ox.save $ link <$> shapeP i <*> shapeP (i 1)
where
BaseOb{..} = mkBaseOb sent
shape i = Ox.shape <$> orth i
shapeP i = Ox.pack <$> shape i
link x y = T.concat [x, "-", y]
suffixS :: Block ()
suffixS sent = \ks ->
forM_ ks $ \i ->
mapM_ (Ox.save . lowSuffix i) [2, 3, 4]
where
BaseOb{..} = mkBaseOb sent
lowSuffix i j = Ox.suffix j =<< lowOrth i
searchS :: Dict.NeDict -> Block ()
searchS dict sent = \ks -> do
mapM_ (Ox.saves . searchDict) ks
where
BaseOb{..} = mkBaseOb sent
searchDict i = join . maybeToList $
S.toList <$> (orth i >>= flip M.lookup dict)
data SchemaCfg = SchemaCfg
{ orthC :: [Int]
, lemmaC :: [Int]
, shapeC :: [Int]
, shapePairC :: [Int]
, suffixC :: [Int]
, dictC :: Maybe (Dict.NeDict, [Int])
}
instance Binary SchemaCfg where
put SchemaCfg{..} = do
put orthC
put lemmaC
put shapeC
put shapePairC
put suffixC
put dictC
get = SchemaCfg
<$> get
<*> get
<*> get
<*> get
<*> get
<*> get
defaultCfg
:: FilePath
-> IO SchemaCfg
defaultCfg nePath = do
neDict <- decodeFile nePath
return $ SchemaCfg
{ orthC = [1, 0]
, lemmaC = [1, 0]
, shapeC = [1, 0]
, shapePairC = [0]
, suffixC = [0]
, dictC = Just (neDict, [1, 0]) }
mkBasicS :: Block () -> [Int] -> Schema ()
mkBasicS _ [] = void ()
mkBasicS blk xs = fromBlock blk xs
mkDictS :: Maybe (Dict.NeDict, [Int]) -> Schema ()
mkDictS (Just (d, xs)) = fromBlock (searchS d) xs
mkDictS Nothing = void ()
fromCfg :: SchemaCfg -> Schema ()
fromCfg SchemaCfg{..} = sequenceS_
[ mkBasicS orthS orthC
, mkBasicS lemmaS lemmaC
, mkBasicS shapeS shapeC
, mkBasicS shapePairS shapePairC
, mkBasicS suffixS suffixC
, mkDictS dictC ]
schematize :: Schema a -> [Word] -> CRF.Sent Ob
schematize schema xs =
map (S.fromList . Ox.execOx . schema v) [0 .. n 1]
where
v = V.fromList xs
n = V.length v