{-# OPTIONS_GHC -fbang-patterns #-} -- | -- Module : Data.ByteString.Seach.BoyerMoore -- Copyright : Daniel Fischer -- Chris Kuklewicz -- License : BSD3 -- Maintainer : Bryan O'Sullivan <bos@serpentine.com> -- Stability : experimental -- Portability : portable -- -- Fast overlapping Boyer-Moore search of both strict and lazy -- 'S.ByteString' values. -- -- Descriptions of the algorithm can be found at -- <http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140> -- and -- <http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm> -- -- Original authors: Daniel Fischer (daniel.is.fischer at web.de) and -- Chris Kuklewicz (haskell at list.mightyreason.com). module Data.ByteString.Search.BoyerMoore ( -- * Overview -- $overview -- ** Parameter and return types -- $types -- ** Lazy ByteStrings -- $lazy -- ** Performance -- $performance -- ** Complexity -- $complexity -- ** Currying -- $currying -- ** Integer overflow -- $overflow -- * Functions matchLL , matchLS , matchSL , matchSS ) where import qualified Data.ByteString as S (ByteString,null,length,concat) import qualified Data.ByteString.Lazy as L (ByteString,toChunks) #if __GLASGOW_HASKELL__ >= 608 import qualified Data.ByteString.Unsafe as U (unsafeIndex) #else import qualified Data.ByteString.Base as U (unsafeIndex) #endif import Data.Array.Base (unsafeAt,unsafeRead,unsafeWrite) import Data.Array.ST (newArray,newArray_,runSTUArray) import Data.Array.IArray (array,accumArray) import Data.Array.Unboxed (UArray) import Data.Word (Word8) import Data.Int (Int64) -- $overview -- -- This module exports 4 search functions: 'matchLL', 'matchLS', -- 'matchSL', and 'matchSS'. -- -- If given an empty pattern, a search will always return an empty -- list. -- $types -- -- The first parameter is always the pattern string. The second -- parameter is always the target string to be searched. The returned -- list contains the offsets of all /overlapping/ patterns. 
--
-- A returned @Int@ or @Int64@ is an index into the target string
-- which is aligned to the head of the pattern string.  Strict targets
-- return @Int@ indices and lazy targets return @Int64@ indices.  All
-- returned lists are computed and returned in a lazy fashion.

-- $lazy
--
-- 'matchLL' and 'matchLS' take lazy bytestrings as patterns.  For
-- performance, if the pattern is not a single strict chunk then all
-- the pattern chunks will be copied into a concatenated strict
-- bytestring.  This limits the patterns to a length of (maxBound ::
-- Int).
--
-- 'matchLL' and 'matchSL' take lazy bytestrings as targets.
-- These are written so that while they work they will not retain a
-- reference to all the earlier parts of the lazy bytestring.
-- This means the garbage collector would be able to keep only a small
-- amount of the target string and free the rest.

-- $currying
-- These functions can all be usefully curried.  Given only a pattern
-- the curried version will compute the supporting lookup tables only
-- once, allowing for efficient re-use.  Similarly, the curried
-- 'matchLL' and 'matchLS' will compute the concatenated pattern only
-- once.

-- $complexity
--
-- Preprocessing the pattern string is O(@patternLength@).  The search
-- performance is O(@targetLength@\/@patternLength@) in the best case,
-- allowing it to go faster than a Knuth-Morris-Pratt algorithm.  With
-- a non-periodic pattern the worst case uses O(3\*@targetLength@)
-- comparisons.  The periodic pattern worst case is quadratic
-- O(@targetLength@\*@patternLength@) complexity.  Improvements
-- (e.g. Turbo-Boyer-Moore) to catch and linearize worst case
-- performance slow down the loop significantly.

-- $performance
--
-- Operating on a strict target string is faster than a lazy target
-- string.  It is unclear why the performance gap is as large as it is
-- (patches welcome).  To slightly ameliorate this, if the lazy string
-- is a single chunk then a copy of the strict algorithm is used.
-- $overflow
--
-- The current code uses @Int@ to keep track of the locations in the
-- target string.  If the length of the pattern plus the length of any
-- strict chunk of the target string is greater or equal to
-- @'maxBound'::Int@ then this will overflow causing an error.  We try
-- to detect this and call 'error' before a segfault occurs.

{-# INLINE matchLL #-}
-- | Offsets of all (possibly overlapping) occurrences of a lazy
-- pattern in a lazy target.  The pattern chunks are concatenated into
-- one strict 'S.ByteString' before the search tables are built.
matchLL :: L.ByteString         -- ^ lazy pattern
        -> L.ByteString         -- ^ lazy target string
        -> [Int64]              -- ^ offsets of matches
matchLL pat = let search = matchSSsd (S.concat (L.toChunks pat))
              in search . L.toChunks

{-# INLINE matchLS #-}
-- | Offsets of all (possibly overlapping) occurrences of a lazy
-- pattern in a strict target.  The pattern chunks are concatenated
-- into one strict 'S.ByteString' before the search tables are built.
matchLS :: L.ByteString         -- ^ lazy pattern
        -> S.ByteString         -- ^ strict target string
        -> [Int]                -- ^ offsets of matches
matchLS pat = matchSSd (S.concat (L.toChunks pat))

{-# INLINE matchSL #-}
-- | Offsets of all (possibly overlapping) occurrences of a strict
-- pattern in a lazy target, which is searched chunk by chunk.
matchSL :: S.ByteString         -- ^ strict pattern
        -> L.ByteString         -- ^ lazy target string
        -> [Int64]              -- ^ offsets of matches
matchSL pat = let search = matchSSsd pat
              in search . L.toChunks

{-# INLINE matchSS #-}
-- | Offsets of all (possibly overlapping) occurrences of a strict
-- pattern in a strict target.
matchSS :: S.ByteString         -- ^ strict pattern
        -> S.ByteString         -- ^ strict target string
        -> [Int]                -- ^ offsets of matches
matchSS pat = matchSSd pat

-- matchSSd is the worker for strict targets.  Applied to a pattern it
-- builds the bad-character table (occurs) and the good-suffix table
-- (suffShifts) once and returns a searcher closed over them.  An
-- empty pattern never matches.  The searcher calls 'error' when
-- pattern length plus target length could overflow an Int.
#ifndef __HADDOCK__
matchSSd :: S.ByteString -> S.ByteString -> [Int]
matchSSd pat
    | S.null pat = const []
    | otherwise  =
  let !patLen    = S.length pat
      !patEnd    = pred patLen
      !maxStrLen = maxBound - patLen
      !occT      = occurs pat       -- used to compute bad-character shift
      !suffT     = suffShifts pat   -- used to compute good-suffix shift
      !skip      = unsafeAt suffT 0 -- used after each matching position is found
      -- 0 < skip <= patLen

      {-# INLINE patAt #-}
      patAt :: Int -> Word8
      patAt !i = U.unsafeIndex pat i

      searcher str
          | maxStrLen <= S.length str = error "Overflow error in BoyerMoore.matchSSd"
          | otherwise =
        let !strLen  = S.length str
            !maxDiff = strLen-patLen  -- largest offset at which pat still fits in str
            {-# INLINE strAt #-}
            strAt :: Int -> Word8
            strAt !i = U.unsafeIndex str i
            -- diff is the candidate offset of pat in str, patI the pattern
            -- index being compared; comparison runs from patEnd down to 0.
            findMatch !diff !patI =
              case strAt (diff+patI) of
                c | c==patAt patI ->
                      if patI == 0
                        then diff : let diff' = diff + skip
                                    in if maxDiff < diff'
                                         then []
                                         else findMatch diff' patEnd
                        else findMatch diff (pred patI)
                  | otherwise ->
                      let {-# INLINE badShift #-}
                          badShift = patI - unsafeAt occT (fromIntegral c)
                          -- (-patEnd) < badShift <= patLen
                          {-# INLINE goodShift #-}
                          goodShift = unsafeAt suffT patI
                          -- 0 < goodShift <= patLen
                          diff' = diff + max badShift goodShift
                      in if maxDiff < diff'
                           then []
                           else findMatch diff' patEnd
        in if maxDiff < 0   -- target shorter than pattern: nothing to index
             then []
             else findMatch 0 patEnd
  in searcher
#endif

-- release is used to keep the zipper in matchSSsd from remembering
-- the leading part of the searched string.  The deep parameter is the
-- number of characters that the past needs to hold.  This ensures
-- lazy streaming consumption of the searched string.
--
-- It returns the shortest prefix of the chunk list whose total length
-- is at least deep, and calls 'error' if the list is too short.
{-# INLINE release #-}
release :: Int ->  [S.ByteString] -> [S.ByteString]
#ifndef __HADDOCK__
release !deep _ | deep <= 0 = []
release !deep (!x:xs) = let !rest = release (deep-S.length x) xs in x : rest
release _ [] = error "BoyerMoore 'release' could not find enough past of length deep!"
#endif

-- matchSSsd is the worker for lazy targets, given as a list of strict
-- chunks.  Applied to a pattern it builds the lookup tables once and
-- returns a searcher.  A single-chunk target uses a copy of the
-- strict algorithm; otherwise a zipper (past, str, future) of chunks
-- is walked so that matches spanning chunk boundaries are found.
-- Reported offsets are relative to the start of the whole target.
matchSSsd :: S.ByteString -> [S.ByteString] -> [Int64]
#ifndef __HADDOCK__
matchSSsd pat
    | S.null pat = const []
    | otherwise  =
  let !patLen = S.length pat
      !patEnd = pred patLen
      !occT   = occurs pat       -- used to compute bad-character shift
      !suffT  = suffShifts pat   -- used to compute good-suffix shift
      !skip   = unsafeAt suffT 0 -- used after each matching position is found
      -- 0 < skip <= patLen

      {-# INLINE patAt #-}
      patAt :: Int -> Word8
      patAt !i = U.unsafeIndex pat i

      searcher string =
        let -- seek is used to position the "zipper" of
            -- (past,str,future) to the correct S.ByteString to search
            -- with matcher.  This is done by ensuring 0 <= strPos <
            -- strLen where (strPos == diffPos+patPos).  Note that
            -- future is not a strict parameter.  The character being
            -- compared will then be (strAt strPos) and (patAt
            -- patPos).  Splitting this into specialized versions
            -- seems like going too far, and is only useful if pat is
            -- close to (or larger than) the chunk size.
            --
            -- prior is the number of target bytes that precede str.
            seek :: Int64 -> [S.ByteString] -> S.ByteString -> [S.ByteString] -> Int -> Int -> [Int64]
            seek !prior !past !str future !diffPos !patPos
                | (diffPos+patPos) < 0 = {-# SCC "seek/past" #-}
                    -- position lies before str: step back one chunk
                    case past of
                      [] -> error "seek back too far!"
                      (h:t) ->
                        let hLen = S.length h
                        in seek (prior - fromIntegral hLen) t h (str:future) (diffPos + hLen) patPos
                | strLen <= (diffPos+patPos) = {-# SCC "seek/future" #-}
                    -- position lies beyond str: step forward one chunk
                    case future of
                      [] -> []
                      (h:t) ->
                        let {-# INLINE prior' #-}
                            prior' = prior + fromIntegral strLen
                            !diffPos' = diffPos - strLen
                            {-# INLINE past' #-}
                            -- keep only as much past as the pattern still overlaps
                            past' = release (-diffPos') (str:past)
                        in if maxStrLen <= S.length h
                             then error "Overflow in BoyerMoore.matchSSsd"
                             else seek prior' past' h t diffPos' patPos
                | otherwise = {-# SCC "seek/str" #-}
                    -- matcher is the tight loop that walks backwards from the end
                    -- of the pattern checking for matching characters.  The upper
                    -- bound of strLen is checked only when strI is shifted
                    -- upwards to strI'.  The lower bound must be checked.
                    let matcher !diff !patI =
                          case strAt (diff+patI) of
                            c | c==patAt patI ->
                                  if patI == 0
                                    then prior + fromIntegral (diff+patI) :
                                           let !diff' = (diff+patI) + skip -- Assert : diff < diff'
                                           in if maxDiff < diff'
                                                then seek prior past str future diff' patEnd
                                                else if diff' < 0
                                                       then matcher diff' patEnd
                                                       else matcherF diff' patEnd
                                    else if (diff+patI) == 0 -- diff < 0 means need to check underflow
                                           then seek prior past str future diff (pred patI)
                                           else matcher diff (pred patI)
                              | otherwise ->
                                  let {-# INLINE badShift #-}
                                      badShift = patI - unsafeAt occT (fromIntegral c)
                                      -- (-patEnd) < badShift <= patLen
                                      {-# INLINE goodShift #-}
                                      goodShift = unsafeAt suffT patI
                                      -- 0 < goodShift <= patLen
                                      -- Assert : diff < diff'
                                      !diff' = diff + max badShift goodShift
                                  in if maxDiff < diff'
                                       then seek prior past str future diff' patEnd
                                       else if diff' < 0
                                              then matcher diff' patEnd
                                              else matcherF diff' patEnd
                        -- matcherF only needs to check overflow since 0<=diff
                        matcherF !diff !patI =
                          case strAt (diff+patI) of
                            c | c==patAt patI ->
                                  if patI == 0
                                    then prior + fromIntegral (diff+patI) :
                                           let !diff' = (diff+patI) + skip -- Assert : diff < diff'
                                           in if maxDiff < diff'
                                                then seek prior past str future diff' patEnd
                                                else matcherF diff' patEnd
                                    else matcherF diff (pred patI) -- 0 <= diff means no need to check underflow
                              | otherwise ->
                                  let {-# INLINE badShift #-}
                                      badShift = patI - unsafeAt occT (fromIntegral c)
                                      -- (-patEnd) < badShift <= patLen
                                      {-# INLINE goodShift #-}
                                      goodShift = unsafeAt suffT patI
                                      -- 0 < goodShift <= patLen
                                      -- Assert : diff < diff'
                                      !diff' = diff + max badShift goodShift
                                  in if maxDiff < diff'
                                       then seek prior past str future diff' patEnd
                                       else matcherF diff' patEnd
                    in if diffPos < 0
                         then matcher diffPos patPos
                         else matcherF diffPos patPos
              where
                !strLen = S.length str
                !maxDiff = strLen - patLen
                !maxStrLen = pred ((maxBound::Int) - patLen)
                {-# INLINE strAt #-}
                strAt :: Int -> Word8
                strAt !i = U.unsafeIndex str i
        in case string of
             [] -> []
             [str] -> -- Steal the quick findMatch from matchSSd for this case:
               let findMatch !diff !patI =
                     case strAt (diff+patI) of
                       c | c==patAt patI ->
                             if patI == 0
                               then fromIntegral diff : let diff' = diff + skip
                                                        in if maxDiff < diff'
                                                             then []
                                                             else findMatch diff' patEnd
                               else findMatch diff (pred patI)
                         | otherwise ->
                             let {-# INLINE badShift #-}
                                 badShift = patI - unsafeAt occT (fromIntegral c)
                                 -- (-patEnd) < badShift <= patLen
                                 {-# INLINE goodShift #-}
                                 goodShift = unsafeAt suffT patI
                                 -- 0 < goodShift <= patLen
                                 diff' = diff + max badShift goodShift
                             in if maxDiff < diff'
                                  then []
                                  else findMatch diff' patEnd
                   !strLen = S.length str
                   !maxDiff = strLen - patLen
                   !maxStrLen = ((maxBound::Int) - patLen)
                   {-# INLINE strAt #-}
                   strAt :: Int -> Word8
                   strAt !i = U.unsafeIndex str i
               in if maxStrLen <= strLen
                    then error "Overflow in BoyerMoore.matchSSsd"
                    else if maxDiff < 0
                           -- The chunk is shorter than the pattern.  As in
                           -- matchSSd, stop here: findMatch would otherwise
                           -- read (strAt patEnd) past the end of the chunk
                           -- through the unchecked U.unsafeIndex.
                           then []
                           else findMatch 0 patEnd
             (str:future) ->
               if ((maxBound::Int) - patLen) <= S.length str
                 then error "Overflow in BoyerMoore.matchSSsd"
                 else seek 0 [] str future 0 patEnd
  in searcher
#endif

{- Format of bad character table generated by occurs: Index is good
for Word8 / ASCII searching only.

The last character (at the last index) in pat is ignored.  Excluding
that last element, the value is the largest index of occurrences of
that Word8 in the pat.  The default value for Word8's not in the
pattern is (-1).

Range of values: -1 <= value < length of pattern
-}
{-# INLINE occurs #-}
occurs :: S.ByteString -> UArray Word8 Int
#ifndef __HADDOCK__
occurs !pat
    | lastIx < 0 = emptyOccurs
    | otherwise  = runSTUArray
        (do table <- newArray (minBound,maxBound) (-1)
            -- Indices ascend, so a later occurrence of a byte overwrites
            -- an earlier one; index lastIx itself is never recorded.
            mapM_ (\ix -> unsafeWrite table (fromEnum (U.unsafeIndex pat ix)) ix)
                  [0 .. lastIx - 1]
            return table)
  where
    !lastIx = S.length pat - 1
#endif

-- Table for the empty pattern: every byte maps to (-1).
emptyOccurs :: UArray Word8 Int
emptyOccurs = array (minBound,maxBound) [ (w, -1) | w <- [minBound .. maxBound] ]

{- Non ST variants of occurs

occurs' :: S.ByteString -> UArray Word8 Int
occurs' !pat = accumArray (flip const) (-1) (0,255)
                 [ (pat `U.unsafeIndex` i, i) | i <- [0..pred (S.length pat)] ]

occurs'' :: S.ByteString -> UArray Word8 Int
occurs'' !pat = accumArray (flip const) (-1) (minBound,maxBound) $ zip (init $ S.unpack pat) [0..]
-}

{- suffLengths uses a ST array to allow for strict querying of
previously filled in values during the fill loops.

Format for suffLengths array:

Valid index range is the same as for the pat.

The value at index k is used when there is a mismatch at index k in
pat after checking that all indices j where j > k correctly match.

For all indices consider the prefix of pat that ends with the
character at that index.  Now the value of suffLength is the number of
characters at the end of this prefix that are identical to the end of
pat.

By the above definition, the last index has the length of the pattern
as its value, since the whole pattern is compared to itself and the
overlap is always the whole pattern length.  And the maximum value at
index k is (k+1).

This value itself is a non-negative integer less than the length of
pat except for the last index, where the value is the length of pat.
For most positions the value will be 0.  Aside from the last index,
the value can be non-zero only at indices where the last character of
the pat occurs earlier in pat.
-}
-- Builds the suffix-length table described above for pat.  The result
-- has bounds (0, length pat - 1); for the empty pattern it is an empty
-- array with bounds (0,-1).
{-# INLINE suffLengths #-}
suffLengths :: S.ByteString -> UArray Int Int
#ifndef __HADDOCK__
suffLengths !pat | 0==patLen = array (0,-1) []
                 | otherwise = runSTUArray
    (do ar <- newArray_ (0,patEnd)
        -- the whole pattern is a suffix of itself
        unsafeWrite ar patEnd patLen
        let {-# INLINE matchSuffix #-}
            -- Compare explicitly for index idx: starting at position
            -- from, walk downwards while pat[i] equals pat[i+d], where
            -- d is the distance from idx to the last index.  pre' is
            -- the first position that failed to match (or -1), so
            -- (idx-pre') is the number of matching characters.
            matchSuffix !idx !from = do
              let !d = patEnd - idx
                  helper !i | i < 0 || (pat `U.unsafeIndex` i) /= (pat `U.unsafeIndex` (i+d)) = i
                            | otherwise = helper (pred i)
                  pre' = helper from
              unsafeWrite ar idx (idx-pre')
              -- continue below idx, restarting "end" at start
              idxLoop (pred idx) pre' start
            -- Fill the table from high idx to low.  pre is the failing
            -- position found by the latest matchSuffix; end is the index
            -- of an already written entry that may be copied.
            idxLoop !idx !pre !end
              | idx < 0   = return ar
              | pre < idx = do matching <- unsafeRead ar end -- try and reuse old result
                               if pre + matching < idx -- check if old matching length is too long for current idx
                                 then do unsafeWrite ar idx matching
                                         idxLoop (pred idx) pre (pred end)
                                 else matchSuffix idx pre
              | otherwise = matchSuffix idx idx
        idxLoop start start start) -- the third argument, the initial value of "end", is never used and does not matter.
  where
    !patLen = S.length pat
    !patEnd = pred patLen   -- last valid index of pat
    !start = pred patEnd    -- first index to fill; patEnd is written up front
#endif

{- Format for suffShifts:

The valid index range is the same as for pat.

The index k is used when there is a mismatch at pat index k and all
indices j where j > k have matched.

The value is the smallest number of characters one can advance the
pattern such that the shifted pattern agrees at the already checked
positions j>k.
Thus the value range is : 0 < value <= length of pattern
-}
-- Builds the good-suffix shift table for pat from its suffLengths
-- table.  The result has bounds (0, length pat - 1); for the empty
-- pattern it is an empty array with bounds (0,-1).
{-# INLINE suffShifts #-}
suffShifts :: S.ByteString -> UArray Int Int
#ifndef __HADDOCK__
suffShifts !pat
    | patLen == 0 = array (0,-1) []
    | otherwise   = runSTUArray
        (do shifts <- newArray (0,lastIx) patLen   -- default shift: whole pattern
            let -- First pass, k counting down: whenever the prefix ending
                -- at k equals a suffix of pat (suffix length k+1), every
                -- slot from filled up to (but excluding) the shift
                -- lastIx-k receives that shift.  filled starts at 0 and
                -- never decreases.
                prefixPass !k !filled
                    | k < 0 = return ()
                    | unsafeAt suffLens k == k + 1 = do
                        let !shift = lastIx - k
                        mapM_ (\slot -> unsafeWrite shifts slot shift)
                              [filled .. shift - 1]
                        prefixPass (k - 1) shift
                    | otherwise = prefixPass (k - 1) filled
                -- Second pass, k counting up to (excluding) the last
                -- index: the slot lastIx - suffLens[k] gets the shift
                -- lastIx - k, overwriting whatever was stored before.
                suffixPass !k
                    | k < lastIx = do
                        unsafeWrite shifts (lastIx - unsafeAt suffLens k) (lastIx - k)
                        suffixPass (k + 1)
                    | otherwise = return shifts
            prefixPass (lastIx - 1) 0
            suffixPass 0)
  where
    !patLen   = S.length pat
    !lastIx   = patLen - 1
    !suffLens = suffLengths pat
#endif