module NLP.HistPL.Fusion
(
Rule (..)
, apply
, between
, UID
, POS
, Word
, Base
, IsBase
, Lex (..)
, LexKey (..)
, LexElem (..)
, LexSet
, mkLexSet
, unLexSet
, Dict
, BaseDict
, FormDict
, mkDict
, unDict
, revDict
, lookup
, entries
, Bila (..)
, mkBila
, withForm
, Hist
, mkHist
, HLex
, Poli
, PLex
, PLexSet
, mkPoli
, Corresp
, buildCorresp
, Core
, Filter
, Choice
, byForms
, posFilter
, sumChoice
, Fused
, FLex
, Code (..)
, extend
, fuse
) where
import Prelude hiding (lookup)
import Control.Applicative ((<$>), (<*>))
import Control.Arrow (first)
import Data.Binary (Binary, get, put)
import Data.Text.Binary ()
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.PoliMorf as P
import qualified Data.DAWG.Static as D
import qualified NLP.HistPL as H
data Rule = Rule {
cut :: !Int
, suffix :: !T.Text }
deriving (Show, Eq, Ord)
instance Binary Rule where
put Rule{..} = put cut >> put suffix
get = Rule <$> get <*> get
apply :: Rule -> T.Text -> T.Text
apply r x = T.take (T.length x cut r) x `T.append` suffix r
between :: T.Text -> T.Text -> Rule
between source dest =
let k = lcp source dest
in Rule (T.length source k) (T.drop k dest)
where
lcp a b = case T.commonPrefixes a b of
Just (c, _, _) -> T.length c
Nothing -> 0
type UID = Int
type POS = T.Text
type Base = T.Text
type Word = T.Text
type IsBase = Bool
data Lex i a b = Lex
{ lexKey :: LexKey i
, lexElem :: LexElem a b }
deriving (Show, Eq, Ord)
listLex :: Lex i a b -> [(T.Text, i, a, T.Text, b)]
listLex Lex{..} =
[ (key, uid, info, word, y)
| (word, y) <- M.assocs forms ]
where
LexKey{..} = lexKey
LexElem{..} = lexElem
data LexKey i = LexKey
{ key :: T.Text
, uid :: i }
deriving (Show, Eq, Ord)
data LexElem a b = LexElem
{ info :: a
, forms :: M.Map Word b }
deriving (Show, Eq, Ord)
type LexSet i a b = M.Map (LexKey i) (LexElem a b)
mkLexSet :: Ord i => [Lex i a b] -> LexSet i a b
mkLexSet = M.fromList . map ((,) <$> lexKey <*> lexElem)
unLexSet :: LexSet i a b -> [Lex i a b]
unLexSet = map (uncurry Lex) . M.toList
type RuleEntry i a b = M.Map i (a, M.Map Rule b)
type Dict i a b = D.DAWG Char () (M.Map i (a, M.Map Rule b))
type BaseDict i a b = Dict i a b
type FormDict i a b = Dict i a b
decode :: Ord i => T.Text -> RuleEntry i a b -> LexSet i a b
decode key ruleEntry = mkLexSet
[ Lex
(LexKey key i)
(LexElem x $ M.fromList
[ (apply rule key, y)
| (rule, y) <- M.assocs ruleMap ])
| (i, (x, ruleMap)) <- M.assocs ruleEntry ]
lookup :: Ord i => T.Text -> Dict i a b -> LexSet i a b
lookup key dict = decode key $ case D.lookup (T.unpack key) dict of
Just m -> m
Nothing -> M.empty
entries :: Ord i => Dict i a b -> [Lex i a b]
entries =
let f = unLexSet . uncurry decode . first T.pack
in concatMap f . D.assocs
mkDict :: (Ord i, Ord a, Ord b) => [(T.Text, i, a, T.Text, b)] -> Dict i a b
mkDict xs = D.fromListWith union $
[ ( T.unpack x
, M.singleton i
(a, M.singleton (between x y) b) )
| (x, i, a, y, b) <- xs ]
where
union = M.unionWith $ both const M.union
both f g (x, y) (x', y') = (f x x', g y y')
unDict :: (Ord i, Ord a, Ord b) => Dict i a b -> [(T.Text, i, a, T.Text, b)]
unDict = concatMap listLex . entries
revDict :: (Ord i, Ord a, Ord b) => Dict i a b -> Dict i a b
revDict =
let swap (base, i, x, form, y) = (form, i, x, base, y)
in mkDict . map swap . unDict
data Bila i a b = Bila
{ baseDict :: BaseDict i a b
, formDict :: FormDict i a b }
deriving (Show, Eq, Ord)
instance (Ord i, Binary i, Binary a, Binary b) => Binary (Bila i a b) where
put Bila{..} = put baseDict >> put formDict
get = Bila <$> get <*> get
mkBila :: (Ord i, Ord a, Ord b) => [(Base, i, a, Word, b)] -> Bila i a b
mkBila xs = Bila
{ baseDict = baseDict'
, formDict = formDict' }
where
baseDict' = mkDict xs
formDict' = revDict baseDict'
withForm :: Ord i => Bila i a b -> Word -> LexSet i a b
withForm Bila{..} word = M.unions
[ lookup base baseDict
| (_, lexElem) <- M.assocs (lookup word formDict)
, base <- M.keys (forms lexElem) ]
type Poli = Bila POS () ()
type PLex = Lex POS () ()
type PLexSet = LexSet POS () ()
mkPoli :: [P.Entry] -> Poli
mkPoli = mkBila . map ((,,(),,()) <$> P.base <*> P.pos <*> P.form)
type Hist = BaseDict UID (S.Set POS) IsBase
type HLex = Lex UID (S.Set POS) IsBase
mkHist :: [H.BinEntry] -> Hist
mkHist xs = mkDict
[ ( H.keyForm key
, H.keyUid key
, S.fromList (H.pos entry)
, form
, isBase )
| binEntry <- xs
, let key = H.binKey binEntry
, let entry = H.lexEntry binEntry
, (form, isBase) <-
map (,True) (lemmas entry) ++
map (,False) (forms entry)
, oneWord form ]
where
lemmas = H.text . H.lemma
forms = concatMap H.text . H.forms
oneWord = (==1) . length . T.words
type Corresp = Poli -> HLex -> PLexSet
type Core = Poli -> HLex -> [PLexSet]
type Filter = HLex -> PLex -> Bool
type Choice = [PLexSet] -> PLexSet
byForms :: Core
byForms bila Lex{..} =
[ withForm bila word
| word <- M.keys (forms lexElem) ]
posFilter :: Filter
posFilter h p = uid (lexKey p) `S.member` info (lexElem h)
sumChoice :: Choice
sumChoice = M.unions
buildCorresp :: Core -> Filter -> Choice -> Corresp
buildCorresp core filt choice bila hLex =
let filterSet = mkLexSet . filter (filt hLex) . unLexSet
in choice . map filterSet . core bila $ hLex
type Fused = BaseDict UID () Code
type FLex = Lex UID () Code
data Code
= Orig
| Copy
deriving (Show, Eq, Ord)
instance Binary Code where
put Orig = put '1'
put Copy = put '2'
get = get >>= \x -> return $ case x of
'1' -> Orig
'2' -> Copy
c -> error $ "get: invalid Code value '" ++ [c] ++ "'"
extend :: HLex -> PLexSet -> FLex
extend hLex lexSet = subForms . M.fromList $
concatMap (fromElem Copy) (M.elems lexSet) ++
fromElem Orig (lexElem hLex)
where
subForms x = hLex { lexElem = LexElem () x }
fromElem code = map (,code) . (M.keys . forms)
fuse :: Corresp -> Hist -> Poli -> Fused
fuse corr hist bila = mkDict
[ (key, uid, (), word, code)
| hLex <- entries hist
, let Lex{..} = extend hLex (corr bila hLex)
, let LexKey{..} = lexKey
, (word, code) <- M.assocs (forms lexElem) ]