module NLP.HistPL.DAWG
(
Rule (..)
, apply
, between
, Lex (..)
, Key (..)
, Val (..)
, LexSet
, mkLexSet
, unLexSet
, Node
, decode
, DAWG
, DAWG'Init
, DM.empty
, insert
, DS.freeze
, lookup
, submap
, DS.Weight
, DS.weigh
, DS.size
, index
, byIndex
, fromList
, toList
, entries
, revDAWG
) where
import Prelude hiding (lookup)
import Control.Applicative ((<$>), (<*>))
import Control.Arrow (first)
import Data.Binary (Binary, get, put)
import Data.Text.Binary ()
import Data.List (foldl')
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.DAWG.Static as DS
import qualified Data.DAWG.Dynamic as DM
data Rule = Rule {
cut :: !Int
, suffix :: !T.Text
} deriving (Show, Eq, Ord)
instance Binary Rule where
put Rule{..} = put cut >> put suffix
get = Rule <$> get <*> get
apply :: Rule -> T.Text -> T.Text
apply r x = T.take (T.length x cut r) x `T.append` suffix r
between :: T.Text -> T.Text -> Rule
between source dest =
let k = lcp source dest
in Rule (T.length source k) (T.drop k dest)
where
lcp a b = case T.commonPrefixes a b of
Just (c, _, _) -> T.length c
Nothing -> 0
data Key i = Key {
path :: T.Text
, uid :: i }
deriving (Show, Eq, Ord)
data Val a w b = Val {
info :: a
, forms :: M.Map w b }
deriving (Show, Eq, Ord)
instance (Ord w, Binary a, Binary w, Binary b) => Binary (Val a w b) where
put Val{..} = put info >> put forms
get = Val <$> get <*> get
data Lex i a b = Lex {
lexKey :: Key i
, lexVal :: Val a T.Text b }
deriving (Show, Eq, Ord)
type LexSet i a b = M.Map (Key i) (Val a T.Text b)
mkLexSet :: Ord i => [Lex i a b] -> LexSet i a b
mkLexSet = M.fromList . map ((,) <$> lexKey <*> lexVal)
unLexSet :: LexSet i a b -> [Lex i a b]
unLexSet = map (uncurry Lex) . M.toList
mapW :: Ord w' => (w -> w') -> Val a w b -> Val a w' b
mapW f v =
let g = M.fromList . map (first f) . M.toList
in v { forms = g (forms v) }
type Node i a b = M.Map i (Val a Rule b)
decode :: Ord i => T.Text -> Node i a b -> LexSet i a b
decode x n = M.fromList
[ (Key x i, mapW (flip apply x) val)
| (i, val) <- M.toList n ]
toListE :: Lex i a b -> [(T.Text, i, a, T.Text, b)]
toListE (Lex Key{..} Val{..}) =
[ (path, uid, info, form, y)
| (form, y) <- M.assocs forms ]
type DAWG i a b = DS.DAWG Char DS.Weight (Node i a b)
type DAWG'Init i a b = DM.DAWG Char (Node i a b)
insert
:: (Ord i, Ord a, Ord b)
=> (T.Text, i, a, T.Text, b)
-> DAWG'Init i a b
-> DAWG'Init i a b
insert (x, i, a, y, b) = DM.insertWith union
(T.unpack x)
(M.singleton i (Val a (M.singleton (between x y) b)))
where
union = M.unionWith $ both const M.union
both f g (Val x0 y0) (Val x1 y1) = Val (f x0 x1) (g y0 y1)
lookup :: Ord i => T.Text -> DAWG i a b -> LexSet i a b
lookup x dict = decode x $ case DS.lookup (T.unpack x) dict of
Just m -> m
Nothing -> M.empty
submap :: Ord i => T.Text -> DAWG i a b -> DAWG i a b
submap x dict = DS.submap (T.unpack x) dict
index :: T.Text -> DAWG i a b -> Maybe Int
index x = DS.index (T.unpack x)
byIndex :: Int -> DAWG i a b -> Maybe T.Text
byIndex ix = fmap T.pack . DS.byIndex ix
entries :: Ord i => DAWG i a b -> [Lex i a b]
entries = concatMap f . DS.assocs where
f (key, val) = unLexSet $ decode (T.pack key) val
fromList :: (Ord i, Ord a, Ord b) => [(T.Text, i, a, T.Text, b)] -> DAWG i a b
fromList = DS.weigh . DS.freeze . foldl' (flip insert) DM.empty
toList :: (Ord i, Ord a, Ord b) => DAWG i a b -> [(T.Text, i, a, T.Text, b)]
toList = concatMap toListE . entries
revDAWG :: (Ord i, Ord a, Ord b) => DAWG i a b -> DAWG i a b
revDAWG =
let swap (base, i, x, form, y) = (form, i, x, base, y)
in fromList . map swap . toList