| Safe Haskell | Safe-Inferred |
|---|---|
| Language | Haskell2010 |
Data.Suffix.ByteString
Description
Implementation
See Data.Suffix for implementation details.
Synopsis
- buildSuffixArray :: Intn i => ByteString -> SuffixArray i
- newtype SuffixArray i = SuffixArray (PrimArray i)
- search :: Intn i => ByteString -> SuffixArray i -> ByteString -> (Int, Int)
- buildLCPArray :: Intn i => ByteString -> SuffixArray i -> LCPArray i
- newtype LCPArray i = LCPArray (PrimArray i)
- buildLRLCPArray :: Intn i => LCPArray i -> LRLCPArrays i
- data LRLCPArrays i = LRLCPArrays !(PrimArray i) !(PrimArray i)
- searchLRLCP :: Intn i => ByteString -> SuffixArray i -> LRLCPArrays i -> ByteString -> (Int, Int)
- foldSuffixTree :: (Intn i, Monad m) => (Int -> m a) -> (Int -> m b) -> (Int -> b -> a -> m b) -> (Int -> b -> m a) -> SuffixArray i -> LCPArray i -> m a
- class (Prim i, Integral i) => Intn i
Suffix array
Arguments
| :: Intn i | |
| => ByteString | Input |
| -> SuffixArray i | Output type |
\(O(n)\). Build a suffix array from a ByteString.
On 64-bit systems, a SuffixArray Int32 requires half the memory as a
SuffixArray Int.
newtype SuffixArray i Source #
Suffix array.
Constructors
| SuffixArray (PrimArray i) |
Instances
| (Show i, Prim i) => Show (SuffixArray i) Source # | |
Defined in Data.Suffix Methods showsPrec :: Int -> SuffixArray i -> ShowS # show :: SuffixArray i -> String # showList :: [SuffixArray i] -> ShowS # | |
| NFData (SuffixArray i) Source # | |
Defined in Data.Suffix Methods rnf :: SuffixArray i -> () # | |
| (Eq i, Prim i) => Eq (SuffixArray i) Source # | |
Defined in Data.Suffix Methods (==) :: SuffixArray i -> SuffixArray i -> Bool # (/=) :: SuffixArray i -> SuffixArray i -> Bool # | |
| (Ord i, Prim i) => Ord (SuffixArray i) Source # | |
Defined in Data.Suffix Methods compare :: SuffixArray i -> SuffixArray i -> Ordering # (<) :: SuffixArray i -> SuffixArray i -> Bool # (<=) :: SuffixArray i -> SuffixArray i -> Bool # (>) :: SuffixArray i -> SuffixArray i -> Bool # (>=) :: SuffixArray i -> SuffixArray i -> Bool # max :: SuffixArray i -> SuffixArray i -> SuffixArray i # min :: SuffixArray i -> SuffixArray i -> SuffixArray i # | |
Arguments
| :: Intn i | |
| => ByteString |
|
| -> SuffixArray i | Suffix array for the above |
| -> ByteString | Pattern |
| -> (Int, Int) | An |
\(O(m \log n)\). Search for a pattern in a ByteString using its suffix
array.
Note: For typical inputs, the worst case is unlikely and the running time is
close to \(O(m + \log n)\). To get guaranteed \(O(m + \log n)\) running time
consider using searchLRLCP instead.
Longest common prefix array
buildLCPArray :: Intn i => ByteString -> SuffixArray i -> LCPArray i Source #
\(O(n)\). Build a longest common prefix array from a ByteString and its
suffix array.
The LCP array has the same length as the sequence, \(n\). The \(0\)-th element of the LCP array is \(0\). The \(i\)-th element of the LCP array for \(0 < i < n\) is the longest common prefix of the \(i\)-th and \((i-1)\)-th suffix in the suffix array.
Longest common prefix array.
Instances
| (Show i, Prim i) => Show (LCPArray i) Source # | |
| NFData (LCPArray i) Source # | |
Defined in Data.Suffix | |
| (Eq i, Prim i) => Eq (LCPArray i) Source # | |
| (Ord i, Prim i) => Ord (LCPArray i) Source # | |
LLCP and RLCP arrays
buildLRLCPArray :: Intn i => LCPArray i -> LRLCPArrays i Source #
\(O(n)\). Build LLCP and RLCP arrays from an LCP array, for use in
searchLRLCP.
data LRLCPArrays i Source #
LLCP and RLCP arrays.
Constructors
| LRLCPArrays !(PrimArray i) !(PrimArray i) |
Instances
| (Show i, Prim i) => Show (LRLCPArrays i) Source # | |
Defined in Data.Suffix Methods showsPrec :: Int -> LRLCPArrays i -> ShowS # show :: LRLCPArrays i -> String # showList :: [LRLCPArrays i] -> ShowS # | |
| NFData (LRLCPArrays i) Source # | |
Defined in Data.Suffix Methods rnf :: LRLCPArrays i -> () # | |
| (Eq i, Prim i) => Eq (LRLCPArrays i) Source # | |
Defined in Data.Suffix Methods (==) :: LRLCPArrays i -> LRLCPArrays i -> Bool # (/=) :: LRLCPArrays i -> LRLCPArrays i -> Bool # | |
| (Ord i, Prim i) => Ord (LRLCPArrays i) Source # | |
Defined in Data.Suffix Methods compare :: LRLCPArrays i -> LRLCPArrays i -> Ordering # (<) :: LRLCPArrays i -> LRLCPArrays i -> Bool # (<=) :: LRLCPArrays i -> LRLCPArrays i -> Bool # (>) :: LRLCPArrays i -> LRLCPArrays i -> Bool # (>=) :: LRLCPArrays i -> LRLCPArrays i -> Bool # max :: LRLCPArrays i -> LRLCPArrays i -> LRLCPArrays i # min :: LRLCPArrays i -> LRLCPArrays i -> LRLCPArrays i # | |
Arguments
| :: Intn i | |
| => ByteString |
|
| -> SuffixArray i | Suffix array for the above |
| -> LRLCPArrays i | LLCP and RLCP arrays for the above |
| -> ByteString | Pattern |
| -> (Int, Int) | An |
\(O(m + \log n)\). Search for a pattern in a sequence using its suffix array and LLCP and RLCP arrays.
Suffix tree
Arguments
| :: (Intn i, Monad m) | |
| => (Int -> m a) | Leaf. The |
| -> (Int -> m b) | Internal node, initialize. The |
| -> (Int -> b -> a -> m b) | Internal node, combine with the value from a
child. The |
| -> (Int -> b -> m a) | Internal node, finalize. The |
| -> SuffixArray i | A suffix array derived from a sequence |
| -> LCPArray i | The LCP array for the same sequence and suffix array |
| -> m a |
\(O(n)\). Bottom-up strict monadic fold of the suffix tree formed by the given suffix array and LCP array.
The tree folded over can be considered to be equivalent to the following definition.
data SuffixTree
= Leaf
Int -- ^ The suffix index
| Branch
Int -- ^ The string depth of this node
[SuffixTree] -- ^ The children of this node
This tree has exactly \(n+1\) leaves and at most \(n\) internal nodes. The suffix with index \(n\), i.e. the empty suffix, is a leaf in this tree, though it is not present in the suffix array.
The calls to the leaf function and the internal node finalizer taken together form a post-order traversal of the tree. For an internal node, the values from its children are combined left-to-right.
The \(O(n)\) bound assumes that monadic bind and all given functions are \(O(1)\).
Intn
class (Prim i, Integral i) => Intn i Source #
Allows for a choice between Int and Int32.
Minimal complete definition
toInt, frInt
Examples
Build a suffix array and LCP array
import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as BC import Data.Suffix.ByteString banana :: ByteString banana = BC.pack "BANANA"
>>>let sa = buildSuffixArray banana :: SuffixArray Int>>>saSuffixArray [5,3,1,0,4,2]>>>let lcpa = buildLCPArray banana sa>>>lcpaLCPArray [0,1,3,0,0,2]
Tabulate a suffix array
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BC
import Data.Primitive.PrimArray (indexPrimArray)
import Text.Printf (printf)
import Data.Suffix.ByteString
tabulatePrint :: ByteString -> IO ()
tabulatePrint bs = putStrLn $ unlines $ header : map row [0 .. n-1]
where
header = "SA LCP Suffix"
n = BC.length bs
sa@(SuffixArray sa_) = buildSuffixArray bs :: SuffixArray Int
LCPArray lcpa_ = buildLCPArray bs sa
row i = printf "%2d %2d %s" sufIdx lcp suffix
where
suffixId = indexPrimArray sa_ i
lcp = indexPrimArray lcpa_ i
suffix = BC.unpack (BC.drop suffixId bs)
>>>tabulatePrint (BC.pack "mississippi")SA LCP Suffix 10 0 i 7 1 ippi 4 1 issippi 1 4 ississippi 0 0 mississippi 9 0 pi 8 1 ppi 6 0 sippi 3 2 sissippi 5 1 ssippi 2 3 ssissippi
Search
import Data.ByteString (ByteString) import qualified Data.ByteString.Char8 as BC import Data.Foldable (for_) import Data.Primitive.PrimArray (indexPrimArray) import Data.Suffix.ByteString searchAndPrint :: ByteString ->SuffixArrayInt -> ByteString -> IO () searchAndPrint bs sa@(SuffixArraysa_) pat = casesearchbs sa pat of (_, 0) -> putStrLn "not found" (off, len) -> for_ [off .. off+len-1] $ \i -> do let suffixId = indexPrimArray sa_ i putStrLn $ BC.unpack bs putStrLn $ replicate suffixId ' ' ++ replicate (BC.length pat) '^'
>>>let str = BC.pack "shikanokonokonokokoshitantan">>>let sa = buildSuffixArray str>>>search str sa (BC.pack "nokonoko")(14,2)>>>searchAndPrint str sa (BC.pack "nokonoko")shikanokonokonokokoshitantan ^^^^^^^^ shikanokonokonokokoshitantan ^^^^^^^^>>>search str sa (BC.pack "shika senbei")(24,0)>>>searchAndPrint str sa (BC.pack "shika senbei")not found
Visualize a suffix tree
import Control.Monad.Trans.Class
import Control.Monad.Trans.Writer.Lazy
import Control.Monad.Trans.State.Strict
import Data.ByteString (ByteString)
import qualified Data.ByteString.Char8 as BC
import Data.Maybe (fromJust)
import Data.Suffix.ByteString
-- Output a tree in Graphviz DOT format
suffixTreeDot :: ByteString -> String
suffixTreeDot bs =
execWriter $ flip evalStateT (0 :: Int) $ do
writeln "digraph suffixtree {"
_ <- foldSuffixTree l1 b1 b2 b3 sa lcpa
writeln "}"
where
n = BC.length bs
sa = buildSuffixArray bs :: SuffixArray Int
lcpa = buildLCPArray bs sa
fresh = state $ \i -> (i, i+1)
writeln = lift . tell . (++ "\n")
l1 suffixId = do
nodeId <- fresh
writeln $ concat [show nodeId, " [label=\"", show suffixId, "\", shape=square];"]
let depth = n - suffixId
pure (depth, suffixId, nodeId)
b1 _ = do
nodeId <- fresh
writeln $ concat [show nodeId, " [label=\"\", shape=circle];"]
pure (Nothing, nodeId)
b2 depth (_, nodeId) (chDepth, chSuffixId, chNodeId) = do
let label = BC.unpack $ BC.drop depth $ BC.take chDepth $ BC.drop chSuffixId bs
label' = if null label then "ϵ" else ' ':label
writeln $ concat [show nodeId, " -> ", show chNodeId, " [label=\"", label', "\"];"]
pure (Just chSuffixId, nodeId)
b3 depth (mSuffixId, nodeId) =
pure (depth, fromJust mSuffixId, nodeId)
>>>let bs = BC.pack "mississippi">>>writeFile "suffixtree.dot" (suffixTreeDot bs)>>>:! dot -Tsvg -o suffixtree.svg suffixtree.dot
Generates the image: