Data/ByteString/Search/BoyerMoore.hs

{-# OPTIONS_GHC -fbang-patterns #-}
-- |
-- Module      : Data.ByteString.Seach.BoyerMoore
-- Copyright   : Daniel Fischer
--               Chris Kuklewicz
-- License     : BSD3
-- Maintainer  : Bryan O'Sullivan <bos@serpentine.com>
-- Stability   : experimental
-- Portability : portable
--
-- Fast overlapping Boyer-Moore search of both strict and lazy
-- 'S.ByteString' values.
--
-- Descriptions of the algorithm can be found at
-- <http://www-igm.univ-mlv.fr/~lecroq/string/node14.html#SECTION00140>
-- and
-- <http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm>
--
-- Original authors: Daniel Fischer (daniel.is.fischer at web.de) and
-- Chris Kuklewicz (haskell at list.mightyreason.com).

module Data.ByteString.Search.BoyerMoore
    (
      -- * Overview
      -- $overview

      -- ** Parameter and return types
      -- $types

      -- ** Lazy ByteStrings
      -- $lazy

      -- ** Performance
      -- $performance

      -- ** Complexity
      -- $complexity

      -- ** Currying
      -- $currying

      -- ** Integer overflow
      -- $overflow

      -- * Functions
      matchLL
    , matchLS
    , matchSL
    , matchSS
    ) where

import qualified Data.ByteString as S (ByteString,null,length,concat)
import qualified Data.ByteString.Lazy as L (ByteString,toChunks)
#if __GLASGOW_HASKELL__ >= 608
import qualified Data.ByteString.Unsafe as U (unsafeIndex)
#else
import qualified Data.ByteString.Base as U (unsafeIndex)
#endif

import Data.Array.Base (unsafeAt,unsafeRead,unsafeWrite)
import Data.Array.ST (newArray,newArray_,runSTUArray)
import Data.Array.IArray (array,accumArray)
import Data.Array.Unboxed (UArray)
import Data.Word (Word8)
import Data.Int (Int64)

-- $overview
--
-- This module exports 4 search functions: 'matchLL', 'matchLS',
-- 'matchSL', and 'matchSS'.
--
-- If given an empty pattern, a search will always return an empty
-- list.

-- $types
--
-- The first parameter is always the pattern string.  The second
-- parameter is always the target string to be searched.  The returned
-- list contains the offsets of all /overlapping/ patterns.
--
-- A returned @Int@ or @Int64@ is an index into the target string
-- which is aligned to the head of the pattern string.  Strict targets
-- return @Int@ indices and lazy targets return @Int64@ indices.  All
-- returned lists are computed and returned in a lazy fashion.

-- $lazy
--
-- 'matchLL' and 'matchLS' take lazy bytestrings as patterns.  For
-- performance, if the pattern is not a single strict chunk then all
-- the the pattern chunks will copied into a concatenated strict
-- bytestring.  This limits the patterns to a length of (maxBound ::
-- Int).
--
-- 'matchLL' and 'matchSL' take lazy bytestrings as targets.
-- These are written so that while they work they will not retain a
-- reference to all the earlier parts of the the lazy bytestring.
-- This means the garbage collector would be able to keep only a small
-- amount of the target string and free the rest.

-- $currying
-- These functions can all be usefully curried.  Given only a pattern
-- the curried version will compute the supporting lookup tables only
-- once, allowing for efficient re-use.  Similarly, the curried
-- 'matchLL' and 'matchLS' will compute the concatenated pattern only
-- once.

-- $complexity
--
-- Preprocessing the pattern string is O(@patternLength@).  The search
-- performance is O(@targetLength@\/@patternLength@) in the best case,
-- allowing it to go faster than a Knuth-Morris-Pratt algorithm.  With
-- a non-periodic pattern the worst case uses O(3\*@targetLength@)
-- comparisons.  The periodic pattern worst case is quadratic
-- O(@targetLength@\*@patternLength@) complexity.  Improvements
-- (e.g. Turbo-Boyer-Moore) to catch and linearize worst case
-- performance slow down the loop significantly.

-- $performance
--
-- Operating on a strict target string is faster than a lazy target
-- string.  It is unclear why the performance gap is as large as it is
-- (patches welcome).  To slightly ameliorate this, if the lazy string
-- is a single chunk then a copy of the strict algorithm is used.

-- $overflow
--
-- The current code uses @Int@ to keep track of the locations in the
-- target string.  If the length of the pattern plus the length of any
-- strict chunk of the target string is greater or equal to
-- @'maxBound'::Int@ then this will overflow causing an error.  We try
-- to detect this and call 'error' before a segfault occurs.

{-# INLINE matchLL #-}
matchLL :: L.ByteString         -- ^ lazy pattern
        -> L.ByteString         -- ^ lazy target string
        -> [Int64]              -- ^ offsets of matches
matchLL pat = let search = matchSSsd (S.concat (L.toChunks pat))
                in search . L.toChunks

{-# INLINE matchLS #-}
matchLS :: L.ByteString         -- ^ lazy pattern
        -> S.ByteString         -- ^ strict target string
        -> [Int]                -- ^ offsets of matches
matchLS pat = matchSSd (S.concat (L.toChunks pat))

{-# INLINE matchSL #-}
matchSL :: S.ByteString         -- ^ strict pattern
        -> L.ByteString         -- ^ lazy target string
        -> [Int64]              -- ^ offsets of matches
matchSL pat = let search = matchSSsd pat
                in search . L.toChunks

{-# INLINE matchSS #-}
matchSS :: S.ByteString         -- ^ strict pattern
        -> S.ByteString         -- ^ strict target string
        -> [Int]                -- ^ offsets of matches
matchSS pat = matchSSd pat

#ifndef __HADDOCK__
matchSSd :: S.ByteString -> S.ByteString -> [Int]
matchSSd pat | S.null pat = const []
               | otherwise = 
  let !patLen = S.length pat
      !patEnd = pred patLen
      !maxStrLen = maxBound - patLen
      !occT   = occurs pat       -- used to compute bad-character shift
      !suffT  = suffShifts pat   -- used to compute good-suffix shift
      !skip   = unsafeAt suffT 0 -- used after each matching position is found
      -- 0 < skip <= patLen

      {-# INLINE patAt #-}
      patAt :: Int -> Word8
      patAt !i = U.unsafeIndex pat i

      searcher str | maxStrLen <= S.length str = error "Overflow error in BoyerMoore.matchSSd"
                   | otherwise =
        let !strLen = S.length str
            !maxDiff = strLen-patLen
            {-# INLINE strAt #-}
            strAt :: Int -> Word8
            strAt !i = U.unsafeIndex str i

            findMatch !diff !patI =
              case strAt (diff+patI) of
                c | c==patAt patI -> if patI == 0
                                       then diff :
                                              let diff' = diff + skip
                                              in if maxDiff < diff'
                                                   then []
                                                   else findMatch diff' patEnd
                                       else findMatch diff (pred patI)
                  | otherwise -> let {-# INLINE badShift #-}
                                     badShift = patI - unsafeAt occT (fromIntegral c)
                                     -- (-patEnd) < badShift <= patLen
                                     {-# INLINE goodShift #-}
                                     goodShift = unsafeAt suffT patI
                                     -- 0 < goodShift <= patLen
                                     diff' = diff + max badShift goodShift
                                 in if maxDiff < diff'
                                      then []
                                      else findMatch diff' patEnd
        in if maxDiff < 0
             then []
             else findMatch 0 patEnd
  in searcher
#endif

-- release is used to keep the zipper in matchSSs from remembering
-- the leading part of the searched string.  The deep parameter is the
-- number of characters that the past needs to hold.  This ensures
-- lazy streaming consumption of the searched string.
{-# INLINE release #-}
release :: Int ->  [S.ByteString] -> [S.ByteString]
#ifndef __HADDOCK__
release !deep _ | deep <= 0 = []
release !deep (!x:xs) = let !rest = release (deep-S.length x) xs in x : rest
release _ [] = error "BoyerMoore 'release' could not find enough past of length deep!"
#endif

matchSSsd :: S.ByteString -> [S.ByteString] -> [Int64]
#ifndef __HADDOCK__
matchSSsd pat | S.null pat = const []
               | otherwise =
  let !patLen = S.length pat
      !patEnd = pred patLen
      !occT   = occurs pat       -- used to compute bad-character shift
      !suffT  = suffShifts pat   -- used to compute good-suffix shift
      !skip   = unsafeAt suffT 0 -- used after each matching position is found
      -- 0 < skip <= patLen

      {-# INLINE patAt #-}
      patAt :: Int -> Word8
      patAt !i = U.unsafeIndex pat i

      searcher string =
        let -- seek is used to position the "zipper" of
            -- (past,str,future) to the correct S.ByteString to search
            -- with matcher.  This is done by ensuring 0 <= strPos <
            -- strLen where (strPos == diffPos+patPos). Note that
            -- future is not a strict parameter.  The character being
            -- compared will then be (strAt strPos) and (patAt
            -- patPos).  Splitting this into specialized versions
            -- seems like going too, and is only useful if pat is
            -- close to (or larger than) the chunk size.
            seek :: Int64 -> [S.ByteString] -> S.ByteString -> [S.ByteString] -> Int -> Int -> [Int64]
            seek !prior !past !str future !diffPos !patPos | (diffPos+patPos) < 0 = {-# SCC "seek/past" #-}
              case past of
                [] -> error "seek back too far!"
                (h:t) -> let hLen = S.length h
                         in seek (prior - fromIntegral hLen) t h (str:future) (diffPos + hLen) patPos
                                                           | strLen <= (diffPos+patPos) = {-# SCC "seek/future" #-}
              case future of
                [] -> []
                (h:t) -> let {-# INLINE prior' #-}
                             prior' = prior + fromIntegral strLen
                             !diffPos' = diffPos - strLen
                             {-# INLINE past' #-}
                             past' = release (-diffPos') (str:past)
                         in if maxStrLen <= S.length h
                              then error "Overflow in BoyerMoore.matchSSsd"
                              else seek prior' past' h t diffPos' patPos
                                                          | otherwise = {-# SCC "seek/str" #-}
              -- matcher is the tight loop that walks backwards from the end
              -- of the pattern checking for matching characters.  The upper
              -- bound of strLen is checked only when strI is shifted
              -- upwards to strI'.  The lower bound must be checked.
              let matcher !diff !patI =
                    case strAt (diff+patI) of
                      c | c==patAt patI ->
                            if patI == 0
                              then prior + fromIntegral (diff+patI) :
                                     let !diff' = (diff+patI) + skip -- Assert : diff < diff'
                                     in if maxDiff < diff'
                                          then seek prior past str future diff' patEnd
                                          else if diff' < 0
                                                 then matcher diff' patEnd
                                                 else matcherF diff' patEnd
                              else if (diff+patI) == 0 -- diff < 0 means need to check underflow
                                     then seek prior past str future diff (pred patI) 
                                     else matcher diff (pred patI)
                        | otherwise ->
                            let {-# INLINE badShift #-}
                                badShift = patI - unsafeAt occT (fromIntegral c)
                                -- (-patEnd) < badShift <= patLen
                                {-# INLINE goodShift #-}
                                goodShift = unsafeAt suffT patI
                                -- 0 < goodShift <= patLen
                                -- Assert : diff < diff'
                                !diff' = diff + max badShift goodShift
                            in if maxDiff < diff'
                                 then seek prior past str future diff' patEnd
                                 else if diff' < 0
                                        then matcher diff' patEnd
                                        else matcherF diff' patEnd

              -- mathcherF only needs to check overflow since 0<=diff
                  matcherF !diff !patI =
                    case strAt (diff+patI) of
                      c | c==patAt patI ->
                            if patI == 0
                              then prior + fromIntegral (diff+patI) :
                                     let !diff' = (diff+patI) + skip -- Assert : diff < diff'
                                     in if maxDiff < diff'
                                          then seek prior past str future diff' patEnd
                                          else matcherF diff' patEnd
                              else matcherF diff (pred patI) -- 0 <= diff means no need to check underflow
                        | otherwise ->
                            let {-# INLINE badShift #-}
                                badShift = patI - unsafeAt occT (fromIntegral c)
                                -- (-patEnd) < badShift <= patLen
                                {-# INLINE goodShift #-}
                                goodShift = unsafeAt suffT patI
                                -- 0 < goodShift <= patLen
                                -- Assert : diff < diff'
                                !diff' = diff + max badShift goodShift
                            in if maxDiff < diff'
                                 then seek prior past str future diff' patEnd
                                 else matcherF diff' patEnd
              in if diffPos < 0
                   then matcher diffPos patPos
                   else matcherF diffPos patPos

             where !strLen = S.length str
                   !maxDiff = strLen - patLen
                   !maxStrLen = pred ((maxBound::Int) - patLen)
                   {-# INLINE strAt #-}
                   strAt :: Int -> Word8
                   strAt !i = U.unsafeIndex str i
        in case string of
             [] -> []
             [str] -> -- Steal the quick findMatch from matchSSd for this case:
               let findMatch !diff !patI =
                     case strAt (diff+patI) of
                       c | c==patAt patI -> if patI == 0
                                              then fromIntegral diff :
                                                     let diff' = diff + skip
                                                     in if maxDiff < diff'
                                                          then []
                                                          else findMatch diff' patEnd
                                              else findMatch diff (pred patI)
                         | otherwise -> let {-# INLINE badShift #-}
                                            badShift = patI - unsafeAt occT (fromIntegral c)
                                            -- (-patEnd) < badShift <= patLen
                                            {-# INLINE goodShift #-}
                                            goodShift = unsafeAt suffT patI
                                            -- 0 < goodShift <= patLen
                                            diff' = diff + max badShift goodShift
                                        in if maxDiff < diff'
                                             then []
                                             else findMatch diff' patEnd
                   !strLen = S.length str
                   !maxDiff = strLen - patLen
                   !maxStrLen = ((maxBound::Int) - patLen)
                   {-# INLINE strAt #-}
                   strAt :: Int -> Word8
                   strAt !i = U.unsafeIndex str i
               in if maxStrLen <= strLen
                    then error "Overflow in BoyerMoore.matchSSsd"
                    else findMatch 0 patEnd
             (str:future) -> if ((maxBound::Int) - patLen) <= S.length str
                               then error "Overflow in BoyerMoore.matchSSsd"
                               else seek 0 [] str future 0 patEnd
  in searcher
#endif

{- Format of bad character table generated by occurs:

Index is good for Word8 / ASCII searching only.
The last character (at the last index) in pat is ignored.
Excluding that last element, the value is largest index of occurances of that Word8 in the pat.
The default value for Word8's not in the pattern is (-1).

Range of values: -1 <= value < length of pattern

-}
{-# INLINE occurs #-}
occurs :: S.ByteString -> UArray Word8 Int
#ifndef __HADDOCK__
occurs !pat | patEnd < 0 = emptyOccurs
            | otherwise  = runSTUArray
    (do ar <- newArray (minBound,maxBound) (-1)
        let loop !i | i == patEnd = return ar
                    | otherwise   = do unsafeWrite ar (fromEnum $ pat `U.unsafeIndex` i) i
                                       loop (succ i)
        loop 0)
  where
    !patEnd = pred (S.length pat)
#endif

emptyOccurs :: UArray Word8 Int
emptyOccurs = accumArray const (-1) (minBound,maxBound) []

{- Non ST variants of occurs

occurs' :: S.ByteString -> UArray Word8 Int
occurs' !pat = accumArray (flip const) (-1) (0,255)
  [ (pat `U.unsafeIndex` i, i) | i <- [0..pred (S.length pat)] ]

occurs'' :: S.ByteString -> UArray Word8 Int
occurs'' !pat = accumArray (flip const) (-1) (minBound,maxBound) $ zip (init $ S.unpack pat) [0..]
-}

{-
suffLengths uses a ST array to allow for strict querying of previously
filled in values durring the fill loops.

Format for suffLengths array:

Valid index range is the same as for the pat.

The value at index k is used when there is a mismatch at index k in
pat after checking that all indices j where j > k correctly match.

For all indices consider the prefix of pat that ends with the
character at that index.  Now the value of suffLength is the number of
character at the end of this prefix that are identical to the end of
pat.

By the above definition, the last index has the length of the pattern
as its value, since the whole pattern is compared to itself and the
overlap is always the whole pattern length.  And the maximum value at
index k is (k+1).

This value itself is a non-negative integer less than the length of
pat except for the last index, where the value is the length of pat.

For most positions the value will be 0.  Aside from the at the last
index the value can be non-zero only at indices where the last
character of the pat occurs earlier in pat.
-}
{-# INLINE suffLengths #-}
suffLengths :: S.ByteString -> UArray Int Int
#ifndef __HADDOCK__
suffLengths !pat | 0==patLen = array (0,-1) []
                 | otherwise = runSTUArray
    (do ar <- newArray_ (0,patEnd)
        unsafeWrite ar patEnd patLen
        let {-# INLINE matchSuffix #-}
            matchSuffix !idx !from = do
                let !d = patEnd - idx
                    helper !i | i < 0 || (pat `U.unsafeIndex` i) /= (pat `U.unsafeIndex` (i+d)) = i
                              | otherwise = helper (pred i)
                    pre' = helper from
                unsafeWrite ar idx (idx-pre')
                idxLoop (pred idx) pre' start
            idxLoop !idx !pre !end
                | idx < 0   = return ar
                | pre < idx = do matching <- unsafeRead ar end  -- try and reuse old result
                                 if pre + matching < idx        -- check if old matching length is too long for current idx
                                   then do unsafeWrite ar idx matching
                                           idxLoop (pred idx) pre (pred end)
                                   else matchSuffix idx pre
                | otherwise = matchSuffix idx idx
        idxLoop start start start) -- the third argument, the initial value of "end", is never used and does not matter.
  where
    !patLen = S.length pat
    !patEnd = pred patLen
    !start  = pred patEnd
#endif

{- Format for suffShifts:

The valid index range is the same as for pat.

The index k is used when there is a mismatch at pat index k and all
indices j where j > k have matched.

The value is the smallest number of characters one can advance the
pattern such that there the shifted pattern agrees at the already
checked positions j>k.

Thus the value range is : 0 < value <= length of pattern

-}
{-# INLINE suffShifts #-}
suffShifts :: S.ByteString -> UArray Int Int
#ifndef __HADDOCK__
suffShifts !pat | patLen == 0 = array (0,-1) []
                | otherwise = runSTUArray
    (do ar <- newArray (0,patEnd) patLen
        let preShift !idx !j -- idx counts down and j starts at 0 and is non-decreasing
                | idx < 0   = return ()
                | suff `unsafeAt` idx == idx+1  =
              do let !shf = patEnd - idx
                     fill_to_shf !i | i==shf = return ()
                                    | otherwise = do unsafeWrite ar i shf
                                                     fill_to_shf (succ i)
                 fill_to_shf j
                 preShift (pred idx) shf
                | otherwise = preShift (pred idx) j
            sufShift !idx
                | idx == patEnd = return ar
                | otherwise = do unsafeWrite ar (patEnd - (suff `unsafeAt` idx)) (patEnd - idx)
                                 sufShift (succ idx)
        preShift start 0
        sufShift 0)
      where
        !patLen = S.length pat
        !patEnd = pred patLen
        !start = pred patEnd
        !suff = suffLengths pat
#endif