{-# LANGUAGE DeriveDataTypeable, BangPatterns #-} -- | The underlying structure for a Trie. -- -- This is an internal module, but is exposed for now because I'm not -- sure how to best generically target features such as structural -- diffs. And we might eventually benefit from zippers, etc. After -- experimenting and learning, I ask you to push the best generic -- code into the vcache-trie package (i.e. send a pull request). module Data.VCache.Trie.Type ( Trie(..) , Node(..) , Children , Child , unsafeTrieAddr -- , ZPath, ZSeek, ZFind(..), keyPath -- utility , sharedPrefixLen ) where import Control.Applicative import Data.Word import qualified Data.Array.IArray as A import Data.ByteString (ByteString) import qualified Data.ByteString.Internal as BSI import Data.Typeable import Data.Maybe import Foreign.ForeignPtr import Foreign.Ptr import Foreign.Storable import Database.VCache -- Thoughts: -- -- I think we shouldn't encode too much information into any one -- node, so I'm aiming for: -- -- encode at most one extended prefix per node -- encode at most one value per node -- encode at most 256 references to child nodes -- -- Children are serialized as a simple [(Word8, VRef Node)] list. -- The high branching factor favors sparse, flat tries, which is -- useful because VCache taxes deep lookups more than looking up -- larger values. -- -- | A node should either accept a value or branch into at least two -- children. data Node a = Node { trie_branch :: {-# UNPACK #-} !(Children a) -- arity 256; one byte from prefix , trie_prefix :: {-# UNPACK #-} !ByteString -- compact extended prefix , trie_accept :: !(Maybe a) -- value associated with prefix } deriving (Eq, Typeable) -- Invariant for nodes: either we accept or we have at least two children type Children a = A.Array Word8 (Child a) type Child a = Maybe (VRef (Node a)) -- | A trie data structure with bytestring keys, above VCache. -- -- A trie supports keys of arbitrary size, though very large keys may -- cause performance degradation. Values are directly serialized into -- nodes, so very large values should use indirection. -- data Trie a = Trie { trie_root :: !(Child a) , trie_space :: !VSpace } deriving (Eq, Typeable) instance (VCacheable a) => VCacheable (Node a) where get = Node <$> getChildren <*> get <*> get put (Node c p v) = putChildren c >> put p >> put v instance (VCacheable a) => VCacheable (Trie a) where get = Trie <$> get <*> getVSpace put (Trie r vc) = put r >> put vc instance Show (Trie a) where showsPrec _ t = showString "Trie#" . shows (unsafeTrieAddr t) -- | Obtain unique address for Trie value. As with VRef addresses, this -- should be stable while the Trie is reachable, but may change if the -- value is GC'd and later reconstructed at a new address. Exposed for -- memoization and similar purposes. unsafeTrieAddr :: Trie a -> Word64 unsafeTrieAddr = maybe 0 unsafeVRefAddr . trie_root {-# INLINE unsafeTrieAddr #-} mkChildren :: [(Word8, VRef (Node a))] -> Children a mkChildren = A.accumArray ins Nothing (minBound, maxBound) where ins _ c = Just c getChildren :: (VCacheable a) => VGet (Children a) getChildren = mkChildren <$> get -- get a list of pairs listChildren :: Children a -> [(Word8, VRef (Node a))] listChildren = mapMaybe toChild . A.assocs where toChild (_, Nothing) = Nothing toChild (ix, Just c) = Just (ix, c) putChildren :: (VCacheable a) => Children a -> VPut () putChildren = put . listChildren {- -- | ZPath a corresponds to a deep search to find a key. -- It returns a zipper-like data structure. type ZPath a = (ZSeek a, ZFind a) type ZSeek a = [(Word8, Node a)] -- a simple stack of nodes data ZFind a = ZTerm !ByteString -- ^ bytes not matched at end of trie | ZSplit !ByteString {-# UNPACK #-} !SharedPrefixLen !(Node a) -- ^ match between nodes | ZFind !(Node a) -- ^ exact match to existing node type SharedPrefixLen = Int -- | compute the path to a node as a data structure. keyPath :: ByteString -> Trie a -> ZPath a keyPath = kr where kr key = kc [] key . trie_root kc zs key Nothing = (zs, ZTerm key) kc zs key (Just c) = let tn = deref' c in let pre = trie_prefix tn in let s = sharedPrefixLen key pre in let p = B.length pre in let k = B.length key in if (s < p) then (zs, ZSplit key s tn) else assert (s == p) $ if (s == k) then (zs, ZFind tn) else assert (s < k) $ let key' = B.drop (s+1) key in let ixK = B.index key s in kc ((ixK,tn):zs) key' (trie_branch tn A.! ixK) -} -- | Return byte count for prefix common among two strings. sharedPrefixLen :: ByteString -> ByteString -> Int sharedPrefixLen (BSI.PS s1 off1 len1) (BSI.PS s2 off2 len2) = BSI.inlinePerformIO $ withForeignPtr s1 $ \ p1 -> withForeignPtr s2 $ \ p2 -> indexOfDiff (p1 `plusPtr` off1) (p2 `plusPtr` off2) (min len1 len2) indexOfDiff :: Ptr Word8 -> Ptr Word8 -> Int -> IO Int indexOfDiff !p1 !p2 !len = loop 0 where loop !idx = if (idx == len) then return len else peekByte (p1 `plusPtr` idx) >>= \ c1 -> peekByte (p2 `plusPtr` idx) >>= \ c2 -> if (c1 /= c2) then return idx else loop (idx + 1) -- an aide for type inference peekByte :: Ptr Word8 -> IO Word8 peekByte = peek