{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE InstanceSigs #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedLists #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE UndecidableInstances #-} module Data.RadixTree ( RadixTree (..) , RadixNode (..) , CompressedRadixTree -- * Construction , fromFoldable_ , fromFoldable , compressBy -- * Parsing with radix trees , RadixParsing (..) , parse_ , lookup_ , search ) where import Control.Applicative import Control.DeepSeq import Data.Data (Data, Typeable) import Data.Foldable (asum, foldr', toList) import Data.Map.Strict (Map) import qualified Data.Map.Strict as M import Data.Sequence (Seq) import qualified Data.Sequence as Seq import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Array as TI (Array) import qualified Data.Text.Internal as TI (Text (..), text) import Data.Vector (Vector) import qualified Data.Vector as V import Lens.Micro import Text.Parser.Char (CharParsing (anyChar, text)) import Text.Parser.Combinators (Parsing (try)) -------------------------------------------------------------------------------- -- Stuff to help construct RadixTrees -- -- I'm not clever enough to write a function to go directly from a 'Foldable' to -- a fully-optimised RadixTree. Instead, I generate a prefix-tree using a 'Map' -- directly ('Trie'), and then gradually compress that ('CompressedTrie') before -- packing the final result into an efficient structure using 'Text' nodes. -- -- TODO: -- - generate RadixTree directly, instead of going through 'Trie'/'CompressedTrie' -- - use compact regions? data PrefixNode a tree = Accept !Text a !tree | Skip !tree deriving (Show, Eq) newtype Trie a = Trie (PrefixNode a (Map Char (Trie a))) deriving (Show, Eq) newtype CompressedTrie a = CompressedTrie (PrefixNode a (Map (Seq Char) (CompressedTrie a))) deriving (Show, Eq) {-# INLINE node #-} node :: Lens (PrefixNode p a) (PrefixNode p b) a b node = lens (\x -> case x of { Accept _ _ t -> t; Skip t -> t }) (\x a -> case x of { Accept l p _ -> Accept l p a; Skip _ -> Skip a }) leaf :: Text -> Text -> a -> Trie a leaf ft t v = go (T.unpack t) where go (x:xs) = Trie (Skip (M.singleton x (go xs))) go [] = Trie (Accept ft v M.empty) insert :: Text -> Text -> a -> Trie a -> Trie a insert ft text' a (Trie n) = case T.uncons text' of Just (c, cs) -> Trie ((node %~ M.insertWith (\_ orig -> Data.RadixTree.insert ft cs a orig) c (leaf ft cs a)) n) Nothing -> Trie (n^.node.to (Accept ft a)) makeCompressable :: Trie a -> CompressedTrie a makeCompressable (Trie n) = CompressedTrie ( over node (M.map makeCompressable . M.mapKeysMonotonic Seq.singleton) n) compress :: Trie a -> CompressedTrie a compress = go . makeCompressable where go :: CompressedTrie a -> CompressedTrie a go (CompressedTrie n) = case n of Accept l p m -> CompressedTrie (Accept l p (M.map go m)) Skip m -> CompressedTrie (Skip (M.foldMapWithKey compress1 m)) compress1 :: Seq Char -> CompressedTrie a -> Map (Seq Char) (CompressedTrie a) compress1 k c@(CompressedTrie n) = case M.size sm of 0 -> M.singleton k c 1 | Skip _ <- n -> compress1 (k <> k') sm' where (k', sm') = M.findMax sm _ -> M.singleton k (go (n & node .~ sm & CompressedTrie)) where sm = n^.node -------------------------------------------------------------------------------- -- | A node in a radixtree. To advance from here a parser must parse the 'Text' -- (i.e., the prefix) value at this node. data RadixNode a = RadixNode {-# UNPACK #-} !Text !(RadixTree a) deriving (Eq, Show, Typeable, Data) -- | A radixtree. Construct with 'fromFoldable_, and use with 'parse'. data RadixTree a = -- | Can terminate a parser successfully, returning the 'Text' value given. RadixAccept {-# UNPACK #-} !Text -- ^ text to return at this point {-# UNPACK #-} !(Vector (RadixNode a)) -- ^ possible subtrees beyond this point a -- ^ value to return at this point | RadixSkip {-# UNPACK #-} !(Vector (RadixNode a)) -- ^ possible subtrees beyond this point deriving (Eq, Show, Typeable, Data) instance NFData a => NFData (RadixNode a) where {-# INLINE rnf #-} rnf (RadixNode l t) = rnf l `seq` rnf t instance NFData a => NFData (RadixTree a) where {-# INLINE rnf #-} rnf (RadixAccept t v p) = t `seq` rnf p `seq` rnf v rnf (RadixSkip v) = rnf v -- | Compress a totally-unoptimised 'Trie' into a nice and easily-parsable -- 'RadixTree' fromTrie :: Trie a -> RadixTree a fromTrie = go . compress where !z = V.empty radixNode :: Seq Char -> CompressedTrie a -> RadixNode a radixNode l t = RadixNode (T.pack (toList l)) (go t) mapToVector :: Map k a -> Vector (k, a) mapToVector m = case M.size m of 0 -> z sz -> V.fromListN sz (M.toList m) go :: CompressedTrie a -> RadixTree a go (CompressedTrie n) = case n of Accept l p m -> RadixAccept l (V.map (uncurry radixNode) (mapToVector m)) p Skip m -> RadixSkip . V.map (uncurry radixNode) . mapToVector $! m data TextSlice = TextSlice { tsOffset16 :: {-# UNPACK #-} !Int -- ^ offset (in units of Word16) , tsLength16 :: {-# UNPACK #-} !Int -- ^ length (in units of Word16) } -- | Probably dangerous magic -- -- When the second argument is found to be within the first, we re-use the -- 'Text' array of the first. This should allow the second argument to be -- garbage collected. This is to improve locality and memory use. magicallySaveSpaceSometimes :: Text -> Text -> Maybe TextSlice magicallySaveSpaceSometimes full s@(TI.Text _ _ slen) = case T.breakOn s full of (TI.Text{}, r@(TI.Text _ remoffs _)) | T.null r -> Nothing | otherwise -> Just TextSlice{tsOffset16 = remoffs, tsLength16 = slen} -- | A normal 'RadixTree' stores a new 'Text' at every node. In contrast, a -- 'CompressedRadixTree' takes a single corpus 'Text' which is indexed into by -- nodes. This can save a lot of memory (e.g., using the radix trees from the -- parsing benchmarks in this package, the 'CompressedRadixTree' version is -- 254032 bytes, whereas the ordinary 'RadixTree' is a rotund 709904 bytes) at -- no runtime cost. data CompressedRadixTree a = CompressedRadixTree {-# UNPACK #-} !TI.Array !(CompressedRadixTree1 a) data CompressedRadixTree1 a = CompressedRadixAccept {-# UNPACK #-} !TextSlice {-# UNPACK #-} !(Vector (CompressedRadixNode a)) a | CompressedRadixSkip {-# UNPACK #-} !(Vector (CompressedRadixNode a)) data CompressedRadixNode a = CompressedRadixNode {-# UNPACK #-} !TextSlice !(CompressedRadixTree1 a) instance NFData a => NFData (CompressedRadixNode a) where {-# INLINE rnf #-} rnf (CompressedRadixNode ts t) = ts `seq` rnf t instance NFData a => NFData (CompressedRadixTree a) where {-# INLINE rnf #-} rnf (CompressedRadixTree arr v) = arr `seq` rnf v instance NFData a => NFData (CompressedRadixTree1 a) where {-# INLINE rnf #-} rnf (CompressedRadixAccept ts v a) = ts `seq` rnf a `seq` rnf v rnf (CompressedRadixSkip v) = rnf v -- | Compress a 'RadixTree' given a corpus. All values in the tree must be -- findable within the corpus, though the corpus does not have to necessarily be -- the direct source of the tree compressBy :: Text -> RadixTree a -> Maybe (CompressedRadixTree a) compressBy full@(TI.Text arr _ _) rt = CompressedRadixTree arr <$> recompressT rt where magic = magicallySaveSpaceSometimes full recompressN :: RadixNode a -> Maybe (CompressedRadixNode a) recompressN (RadixNode t tree) = CompressedRadixNode <$> magic t <*> recompressT tree recompressT :: RadixTree a -> Maybe (CompressedRadixTree1 a) recompressT (RadixSkip v) = CompressedRadixSkip <$> V.mapM recompressN v recompressT (RadixAccept t v a) = CompressedRadixAccept <$> magic t <*> V.mapM recompressN v <*> pure a -- | *Slow*. Same as 'fromFoldable', but you do not need to supply pairs of text -- and values; they will default to '()'. fromFoldable_ :: Foldable f => f Text -> RadixTree () fromFoldable_ = fromTrie . foldr' (\t -> insert t t ()) (Trie (Skip M.empty)) -- | *Slow* fromFoldable :: Foldable f => f (Text, a) -> RadixTree a fromFoldable = fromTrie . foldr' (\(t, a) -> insert t t a) (Trie (Skip M.empty)) -------------------------------------------------------------------------------- -- Parsers from 'RadixTree's class RadixParsing radixtree where keys :: radixtree a -> [(Text, a)] parse :: CharParsing m => (Text -> a -> r) -> radixtree a -> m r lookup :: radixtree a -> Text -> Maybe (Text, a) {-# INLINE search #-} -- | Find all occurences of the terms in a 'RadixTree' from this point on. This -- will consume the entire remaining input. Can lazily produce results (but this -- depends on your parser). search :: (Monad m, CharParsing m, RadixParsing radixtree) => radixtree a -> m [Text] search r = go where go = (parse const r >>= \x -> (x:) <$> go) <|> (anyChar >> go) <|> return [] {-# INLINE parse_ #-} parse_ :: (RadixParsing r, CharParsing m) => r a -> m Text parse_ = Data.RadixTree.parse const {-# INLINE lookup_ #-} lookup_ :: RadixParsing r => r a -> Text -> Maybe Text lookup_ r t = fst <$> Data.RadixTree.lookup r t instance RadixParsing RadixTree where keys = go [] where go nil (RadixAccept l xs a) = (l,a) : V.foldr (\(RadixNode _ x) xs' -> go xs' x) nil xs go nil (RadixSkip xs) = V.foldr (\(RadixNode _ x) xs' -> go xs' x) nil xs {-# INLINE parse #-} -- | Parse from a 'RadixTree' parse :: CharParsing m => (Text -> a -> r) -> RadixTree a -> m r parse constr = go where go r = case r of RadixAccept l nodes a | T.null l -> empty | otherwise -> asum (V.map parseRadixNode nodes) <|> pure (constr l a) RadixSkip nodes -> asum (V.map parseRadixNode nodes) {-# INLINE parseRadixNode #-} parseRadixNode (RadixNode prefix tree) | T.null prefix = go tree | otherwise = try (text prefix *> go tree) lookup :: RadixTree a -> Text -> Maybe (Text, a) lookup rt0 t0 | T.null t0 = case rt0 of RadixAccept v _ a -> Just (v, a) RadixSkip _ -> Nothing | otherwise = case rt0 of RadixAccept _ ns _ -> lookupRadixNodes t0 ns RadixSkip ns -> lookupRadixNodes t0 ns where lookupRadixNodes t v = go 0 where !vlen = V.length v go !i | i < vlen = case V.unsafeIndex v i of RadixNode pfix rt -> case T.commonPrefixes pfix t of Just (_, remPfx, remSfx) | T.null remPfx -> Data.RadixTree.lookup rt remSfx | otherwise -> Nothing Nothing -> go (i + 1) | otherwise = Nothing instance RadixParsing CompressedRadixTree where keys (CompressedRadixTree arr crt) = go [] crt where fromSlice (TextSlice offs len) = TI.text arr offs len go nil (CompressedRadixAccept l xs a) = (fromSlice l, a) : V.foldr (\(CompressedRadixNode _ x) xs' -> go xs' x) nil xs go nil (CompressedRadixSkip xs) = V.foldr (\(CompressedRadixNode _ x) xs' -> go xs' x) nil xs {-# INLINE parse #-} -- | Parse from a 'RadixTree' parse :: CharParsing m => (Text -> a -> r) -> CompressedRadixTree a -> m r parse constr (CompressedRadixTree arr crt) = go crt where fromSlice (TextSlice offs len) = TI.text arr offs len go r = case r of CompressedRadixAccept ts nodes a -> case fromSlice ts of l | T.null l -> empty | otherwise -> asum (V.map parseRadixNode nodes) <|> pure (constr l a) CompressedRadixSkip nodes -> asum (V.map parseRadixNode nodes) {-# INLINE parseRadixNode #-} parseRadixNode (CompressedRadixNode ts tree) = case fromSlice ts of prefix | T.null prefix -> go tree | otherwise -> try (text prefix *> go tree) lookup :: CompressedRadixTree a -> Text -> Maybe (Text, a) lookup (CompressedRadixTree arr0 rt0) = lookup1 rt0 where fromSlice (TextSlice offs16 len16) = TI.text arr0 offs16 len16 lookup1 rt !t | T.null t = case rt of CompressedRadixAccept v _ a -> Just (fromSlice v, a) CompressedRadixSkip _ -> Nothing | otherwise = case rt of CompressedRadixAccept _ ns _ -> lookupCompressedRadixNodes t ns CompressedRadixSkip ns -> lookupCompressedRadixNodes t ns lookupCompressedRadixNodes !t v = go 0 where !vlen = V.length v go !i | i < vlen = case V.unsafeIndex v i of CompressedRadixNode pfix rt -> case T.commonPrefixes (fromSlice pfix) t of Just (_, remPfx, remSfx) | T.null remPfx -> lookup1 rt remSfx | otherwise -> Nothing Nothing -> go (i + 1) | otherwise = Nothing