{-# LANGUAGE RecordWildCards #-} -- | A directed acyclic word graph. module NLP.Adict.DAWG.Internal ( DAWG (..) , DAWGM , fromTrie , fromDAWG , size , nodeBy , Node (..) , entry , charOn , valueBy , edges , edgeOn ) where import Control.Applicative ((<$>)) import Data.Maybe (listToMaybe) import Data.Binary (Binary, get, put) import qualified Data.Vector as V import qualified NLP.Adict.Node as N import qualified NLP.Adict.Trie.Internal as Trie -- | A DAWGM is a 'DAWG' with 'Maybe' values in nodes. type DAWGM a b = DAWG a (Maybe b) -- | A directed acyclic word graph with character type @a@ and dictionary -- entry type @b@. Each node is represented by a unique integer number -- which is also an index of the node in the vector of DAWG nodes. data DAWG a b = DAWG { root :: Int -- ^ Root (index) of the DAWG , nodes :: V.Vector (Node a b) -- ^ Vector of DAWG nodes } -- | Find and eliminate all common subtries in the input trie -- and return the trie represented as a DAWG. fromTrie :: (Ord a, Ord b) => Trie.Trie a b -> DAWG a b fromTrie = deserialize . Trie.serialize -- | Transform the DAWG to implicit DAWG in a form of a trie. fromDAWG :: Ord a => DAWG a b -> Trie.Trie a b fromDAWG = Trie.deserialize . serialize -- | Size of the DAWG. size :: DAWG a b -> Int size = V.length . nodes {-# INLINE size #-} -- | Node by index. nodeBy :: DAWG a b -> Int -> Node a b nodeBy dag k = nodes dag V.! k {-# INLINE nodeBy #-} -- | A node in the DAWG. data Node a b = Node { -- | Value in the node. valueIn :: b, -- | Edges to subnodes (represented by DAWG node indices) -- annotated with characters. subNodes :: V.Vector (a, Int) } -- | Value in the DAWG node represented by the index. valueBy :: DAWG a b -> Int -> b valueBy dag k = valueIn (nodes dag V.! k) {-# INLINE valueBy #-} -- | Edges starting from the DAWG node represented by the index. edges :: DAWG a b -> Int -> [(a, Int)] edges dag k = V.toList . subNodes $ nodeBy dag k {-# INLINE edges #-} -- | Index of the node following the edge annotated with the -- given character. edgeOn :: Eq a => DAWG a b -> Int -> a -> Maybe Int edgeOn DAWG{..} k x = let r = nodes V.! k in snd <$> V.find ((x==).fst) (subNodes r) -- | Return the dictionary entry determined by following the -- path of node indices. entry :: DAWG a (Maybe b) -> [Int] -> Maybe ([a], b) entry dag xs = do x <- mapM (charOn dag) (zip (root dag:xs) xs) r <- maybeLast xs >>= valueBy dag return (x, r) where maybeLast [] = Nothing maybeLast ys = Just $ last ys -- | Determine the character on the edges between two nodes. charOn :: DAWG a b -> (Int, Int) -> Maybe a charOn dag (root, x) = listToMaybe [c | (c, y) <- edges dag root, x == y] -- | Serialize the DAWG into a list of nodes. serialize :: Ord a => DAWG a b -> [N.Node a b] serialize = map unNode . V.toList . nodes -- | Deserialize the DAWG from a list of nodes. Assumptiom: root node -- is last in the serialization list. deserialize :: Ord a => [N.Node a b] -> DAWG a b deserialize xs = let arr = V.fromList $ map mkNode xs in DAWG (V.length arr - 1) arr unNode :: Ord a => Node a b -> N.Node a b unNode Node{..} = N.mkNode valueIn (V.toList subNodes) {-# INLINE unNode #-} mkNode :: Ord a => N.Node a b -> Node a b mkNode n = Node (N.nodeValue n) (V.fromList $ N.nodeEdges n) {-# INLINE mkNode #-} instance (Ord a, Binary a, Binary b) => Binary (DAWG a b) where put = put . serialize get = deserialize <$> get