module NLP.Nerf.Dict.Base
(
NeType
, Form
, isMultiWord
, Entry (..)
, Label
, DAWG
, Dict
, fromPairs
, fromEntries
, siftDict
, saveDict
, loadDict
, merge
, diff
) where
import Control.Applicative ((<$>), (<*>))
import Data.Binary (encodeFile, decodeFile)
import Data.Text.Binary ()
import qualified Data.Text as T
import qualified Data.Set as S
import qualified Data.DAWG.Static as D
import qualified Data.DAWG.Trans.Vector as D
type NeType = T.Text
type Form = T.Text
isMultiWord :: Form -> Bool
isMultiWord = (>1) . length . T.words
data Entry = Entry
{ neOrth :: !Form
, neType :: !NeType
} deriving (Show, Read, Eq, Ord)
type Label = T.Text
type DAWG = D.DAWG D.Trans Char ()
type Dict = DAWG (S.Set Label)
fromPairs :: [(Form, Label)] -> Dict
fromPairs xs = D.fromListWith S.union
[ ( T.unpack x
, S.singleton y)
| (x, y) <- xs ]
fromEntries :: [Entry] -> Dict
fromEntries = fromPairs . map ((,) <$> neOrth <*> neType)
siftDict :: (Form -> S.Set Label -> Bool) -> Dict -> Dict
siftDict f dict = D.fromList [(k, v) | (k, v) <- D.assocs dict, f (T.pack k) v]
saveDict :: FilePath -> Dict -> IO ()
saveDict = encodeFile
loadDict :: FilePath -> IO Dict
loadDict = decodeFile
merge :: [Dict] -> Dict
merge = unionsWith S.union
unionsWith :: Ord a => (a -> a -> a) -> [DAWG a] -> DAWG a
unionsWith f = foldl (unionWith f) D.empty
unionWith :: Ord a => (a -> a -> a) -> DAWG a -> DAWG a -> DAWG a
unionWith f d d' = D.fromListWith f (D.assocs d ++ D.assocs d')
diff :: [Dict] -> [Dict]
diff ds =
[ mapS (addPrefix i) `mapD` dict
| (i, dict) <- zip [0..] ds ]
mapD :: Ord b => (a -> b) -> DAWG a -> DAWG b
mapD f d = D.fromList [(x, f y) | (x, y) <- D.assocs d]
mapS :: Ord b => (a -> b) -> S.Set a -> S.Set b
mapS f s = S.fromList [f x | x <- S.toList s]
addPrefix :: Int -> T.Text -> T.Text
addPrefix = T.append . T.pack . show