{- Fastest when compiled as follows: ghc -O2 -optc-O3 -funbox-strict-fields -} ----------------------------------------------------------------------------- -- | -- Module : Data.SuffixTree -- Copyright : (c) Bryan O'Sullivan 2007 -- License : BSD-style -- Maintainer : bos@serpentine.com -- Stability : experimental -- Portability : portable -- -- A lazy, efficient suffix tree implementation. -- -- Since many function names (but not the type name) clash with -- "Prelude" names, this module is usually imported @qualified@, e.g. -- -- > import Data.SuffixTree (STree) -- > import qualified Data.SuffixTree as T -- -- The implementation is based on the first of those described in the -- following paper: -- -- * Robert Giegerich and Stefan Kurtz, \"/A comparison of -- imperative and purely functional suffix tree constructions/\", -- Science of Computer Programming 25(2-3):187-218, 1995, -- -- -- This implementation constructs the suffix tree lazily, so subtrees -- are not created until they are traversed. Two construction -- functions are provided, 'constructWith' for sequences composed of -- small alphabets, and 'construct' for larger alphabets. module Data.SuffixTree ( -- * Types Alphabet , Prefix , STree(..) -- * Construction , constructWith , construct -- * Querying , elem , find -- * Traversal , fold , fold' -- * Other useful functions , prefix , suffixes ) where import Prelude hiding (elem) import qualified Data.Map as M import Data.List (foldl') import Control.Arrow (second) import qualified Data.ByteString as SB import qualified Data.ByteString.Lazy as LB import Data.Maybe (listToMaybe, mapMaybe) -- | The length of a prefix list. This type is formulated to do cheap -- work eagerly (to avoid constructing a pile of deferred thunks), -- while deferring potentially expensive work. data Length a = Exactly {-# UNPACK #-} !Int | Sum {-# UNPACK #-} !Int [a] deriving (Show) -- | The list of symbols that 'constructWith' can possibly see in its -- input. type Alphabet a = [a] -- | The prefix string associated with an 'Edge'. newtype Prefix a = Prefix ([a], Length a) deriving (Show) instance (Eq a) => Eq (Prefix a) where a == b = prefix a == prefix b type EdgeFunction a = [[a]] -> (Length a, [[a]]) -- | A suffix tree. The implementation is exposed to ease the -- development of custom traversal functions. Note that @('Prefix' a, -- 'STree' a)@ pairs are not stored in any order. data STree a = Node [(Prefix a, STree a)] | Leaf deriving (Show) -- | Obtain the list stored in a 'Prefix'. prefix :: Prefix a -> [a] prefix (Prefix (ys, Exactly n)) = take n ys prefix (Prefix (ys, Sum n xs)) = tk n ys where tk 0 ys = zipWith (const id) xs ys tk n (y:ys) = y : tk (n-1) ys -- | /O(n)/. Fold the edges in a tree, from bottom to top. Suitable -- for lazy use. fold :: (Prefix a -> b -> b) -> b -> STree a -> b fold _ z Leaf = z fold f z (Node es) = foldr (\(e, t) v -> f e (fold f v t)) z es -- | /O(n)/. Fold the edges in a tree, from bottom to top. Suitable -- for strict use. fold' :: (a -> Prefix b -> a) -> a -> STree b -> a fold' _ z Leaf = z fold' f z (Node es) = foldl' (\v (e, t) -> f (fold' f v t) e) z es -- | Increment the length of a prefix. inc :: Length a -> Length a inc (Exactly n) = Exactly (n+1) inc (Sum n xs) = Sum (n+1) xs lazyTreeWith :: (Eq a) => EdgeFunction a -> Alphabet a -> [a] -> STree a lazyTreeWith edge alphabet = suf . suffixes where suf [[]] = Leaf suf ss = Node [(Prefix (a:sa, inc cpl), suf ssr) | a <- alphabet, n@(sa:_) <- [ss `clusterBy` a], (cpl,ssr) <- [edge n]] clusterBy ss a = [cs | c:cs <- ss, c == a] -- | Return all non-empty suffixes of the argument, longest first. -- Behaves as follows: -- -- >suffixes xs == init (tails xs) suffixes :: [a] -> [[a]] suffixes xs@(_:xs') = xs : suffixes xs' suffixes _ = [] lazyTree :: (Ord a) => EdgeFunction a -> [a] -> STree a lazyTree edge = suf . suffixes where suf [[]] = Leaf suf ss = Node [(Prefix (a:sa, inc cpl), suf ssr) | (a, n@(sa:_)) <- suffixMap ss, (cpl,ssr) <- [edge n]] suffixMap :: Ord a => [[a]] -> [(a, [[a]])] suffixMap = map (second reverse) . M.toList . foldl' step M.empty where step m (x:xs) = M.alter (f xs) x m step m _ = m f x Nothing = Just [x] f x (Just xs) = Just (x:xs) cst :: Eq a => EdgeFunction a cst [s] = (Sum 0 s, [[]]) cst awss@((a:w):ss) | null [c | c:_ <- ss, a /= c] = let cpl' = inc cpl in seq cpl' (cpl', rss) | otherwise = (Exactly 0, awss) where (cpl, rss) = cst (w:[u | _:u <- ss]) pst :: Eq a => EdgeFunction a pst = g . dropNested where g [s] = (Sum 0 s, [[]]) g ss = (Exactly 0, ss) dropNested ss@[_] = ss dropNested awss@((a:w):ss) | null [c | c:_ <- ss, a /= c] = [a:s | s <- rss] | otherwise = awss where rss = dropNested (w:[u | _:u <- ss]) {-# SPECIALISE constructWith :: [Char] -> [Char] -> STree Char #-} {-# SPECIALISE constructWith :: [[Char]] -> [[Char]] -> STree [Char] #-} {-# SPECIALISE constructWith :: [SB.ByteString] -> [SB.ByteString] -> STree SB.ByteString #-} {-# SPECIALISE constructWith :: [LB.ByteString] -> [LB.ByteString] -> STree LB.ByteString #-} {-# SPECIALISE constructWith :: (Eq a) => [[a]] -> [[a]] -> STree [a] #-} -- | /O(k n log n)/. Construct a suffix tree using the given -- alphabet. The performance of this function is linear in the size -- /k/ of the alphabet. That makes this function suitable for small -- alphabets, such as DNA nucleotides. For an alphabet containing -- more than a few symbols, 'construct' is usually several orders of -- magnitude faster. constructWith :: (Eq a) => Alphabet a -> [a] -> STree a constructWith = lazyTreeWith cst {-# SPECIALISE construct :: [Char] -> STree Char #-} {-# SPECIALISE construct :: [[Char]] -> STree [Char] #-} {-# SPECIALISE construct :: [SB.ByteString] -> STree SB.ByteString #-} {-# SPECIALISE construct :: [LB.ByteString] -> STree LB.ByteString #-} {-# SPECIALISE construct :: (Ord a) => [[a]] -> STree [a] #-} -- | /O(n log n)/. Construct a suffix tree. construct :: (Ord a) => [a] -> STree a construct = lazyTree cst suffix :: (Eq a) => [a] -> [a] -> Maybe [a] suffix (l:ls) (x:xs) | l == x = suffix ls xs | otherwise = Nothing suffix _ xs = Just xs {-# SPECIALISE elem :: [Char] -> STree Char -> Bool #-} {-# SPECIALISE elem :: [[Char]] -> STree [Char] -> Bool #-} {-# SPECIALISE elem :: [SB.ByteString] -> STree SB.ByteString -> Bool #-} {-# SPECIALISE elem :: [LB.ByteString] -> STree LB.ByteString -> Bool #-} {-# SPECIALISE elem :: (Eq a) => [[a]] -> STree [a] -> Bool #-} -- | /O(n)/. Indicate the suffix tree contains the given subsequence. -- Performance is linear in the length of the subsequence. elem :: (Eq a) => [a] -> STree a -> Bool elem [] _ = True elem _ Leaf = False elem xs (Node es) = any pfx es where pfx (e, t) = maybe False (`elem` t) (suffix (prefix e) xs) {-# SPECIALISE find :: [Char] -> STree Char -> Maybe (Prefix Char, STree Char) #-} {-# SPECIALISE find :: [[Char]] -> STree [Char] -> Maybe (Prefix [Char], STree [Char]) #-} {-# SPECIALISE find :: [SB.ByteString] -> STree SB.ByteString -> Maybe (Prefix SB.ByteString, STree SB.ByteString) #-} {-# SPECIALISE find :: [LB.ByteString] -> STree LB.ByteString -> Maybe (Prefix LB.ByteString, STree LB.ByteString) #-} {-# SPECIALISE find :: (Eq a) => [[a]] -> STree [a] -> Maybe (Prefix [a], STree [a]) #-} -- | /O(n)/. Return the portion of the suffix tree at which the given -- subsequence is located. If the subsequence is not found, return -- 'Nothing'. find :: (Eq a) => [a] -> STree a -> Maybe (Prefix a, STree a) find _ Leaf = Nothing find xs (Node es) = listToMaybe (mapMaybe pfx es) where pfx p@(e, t) = suffix (prefix e) xs >>= \suf -> case suf of [] -> return p s -> find s t