Haskell Code by HsColour

{-# OPTIONS_GHC -fbang-patterns #-}
-- |
-- Module      : Data.ByteString.Seach.KnuthMorrisPratt
-- Copyright   : Justin Bailey
--               Chris Kuklewicz
-- License     : BSD3
-- Maintainer  : Bryan O'Sullivan <bos@serpentine.com>
-- Stability   : experimental
-- Portability : portable
--  
-- Fast non-overlapping Knuth-Morris-Pratt search of both strict and
-- lazy 'S.ByteString' values.
--
-- A description of the algorithm can be found at
-- <http://en.wikipedia.org/wiki/Knuth-Morris-Pratt_algorithm>.

-- Original authors: Justin Bailey (jgbailey at gmail.com) and
-- Chris Kuklewicz (haskell at list.mightyreason.com).

module Data.ByteString.Search.KnuthMorrisPratt
    (
      -- * Overview
      -- $overview

      -- ** Parameter and return types
      -- $types

      -- ** Lazy ByteStrings
      -- $lazy

      -- * Functions
      matchLL
    , matchLS
    , matchSS
    , matchSL
    ) where

import qualified Data.Array.Base as Base (unsafeAt)
import qualified Data.Array.Unboxed as Unboxed (UArray)
import qualified Data.Array.IArray as IArray (array)
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString as S
#if __GLASGOW_HASKELL__ >= 608
import qualified Data.ByteString.Unsafe as U (unsafeIndex)
#else
import qualified Data.ByteString.Base as U (unsafeIndex)
#endif
import Data.Int (Int64)

-- $overview
--
-- This module exports 4 search functions: 'matchLL', 'matchLS',
-- 'matchSL', and 'matchSS'.
--
-- If given an empty pattern, a search will always return an empty
-- list.

-- $types
--
-- The first parameter is always the pattern string.  The second
-- parameter is always the target string to be searched.  The returned
-- list contains the offsets of all /non-overlapping/ patterns.
--
-- A returned @Int@ or @Int64@ is an index into the target string
-- which is aligned to the head of the pattern string.  Strict targets
-- return @Int@ indices and lazy targets return @Int64@ indices.  All
-- returned lists are computed and returned in a lazy fashion.

-- $lazy
--
-- 'matchLL' and 'matchLS' take lazy bytestrings as patterns.  For
-- performance, if the pattern is not a single strict chunk then all
-- the the pattern chunks will copied into a concatenated strict
-- bytestring.  This limits the patterns to a length of (maxBound ::
-- Int).
--
-- 'matchLL' and 'matchSL' take lazy bytestrings as targets.
-- These are written so that while they work they will not retain a
-- reference to all the earlier parts of the the lazy bytestring.
-- This means the garbage collector would be able to keep only a small
-- amount of the target string and free the rest.

{-# INLINE matchLL #-}
matchLL :: L.ByteString         -- ^ lazy pattern
        -> L.ByteString         -- ^ lazy target string
        -> [Int64]              -- ^ offsets of matches
matchLL pat = let search = matchSSs' (S.concat (L.toChunks pat)) in search . L.toChunks

{-# INLINE matchLS #-}
matchLS :: L.ByteString         -- ^ lazy pattern
        -> S.ByteString         -- ^ strict target string
        -> [Int64]              -- ^ offsets of matches
matchLS pat = let search = matchSSs' (S.concat (L.toChunks pat)) in search . (:[])

{-# INLINE matchSS #-}
matchSS :: S.ByteString         -- ^ strict pattern
        -> S.ByteString         -- ^ strict target string
        -> [Int64]              -- ^ offsets of matches
matchSS pat = let search = matchSSs' pat in search . (:[])

{-# INLINE matchSL #-}
matchSL :: S.ByteString         -- ^ strict pattern
        -> L.ByteString         -- ^ lazy target string
        -> [Int64]              -- ^ offsets of matches
matchSL pat = let search = matchSSs' pat in search . L.toChunks

matchSSs' :: S.ByteString -> [S.ByteString] -> [Int64]
#ifndef __HADDOCK__
matchSSs' pat | S.null pat = const []
                 | otherwise =
  let !patLen = S.length pat -- Evaluate S.length once; 
      !lookupTable = computeLookup pat -- lower bound of UArray must be 0 for Base.unsafeAt, but index 0 will never be looked up
      searcher :: Int64 -> Int -> [S.ByteString] -> [Int64]
      searcher _ _ [] = []
      searcher !prior !patStart (!str:strRest) =
        let !strLen = S.length str -- Evaluate S.length once; 
            findMatch :: Int -> Int -> [Int64]
            findMatch !strIndex !patIndex | patIndex == patLen = (prior + fromIntegral strIndex - fromIntegral patLen) : findMatch strIndex 0
                                          | strIndex == strLen = searcher (prior + fromIntegral strLen) patIndex strRest
                                          | otherwise =
              if (U.unsafeIndex str strIndex) == (U.unsafeIndex pat patIndex)
                then findMatch (succ strIndex) (succ patIndex)
                else if patIndex == 0
                       then findMatch (succ strIndex) 0
                       else findMatch strIndex (Base.unsafeAt lookupTable patIndex) -- here 1 <= patIndex <= patLen-1
        in
          findMatch 0 patStart
  in searcher 0 0
#endif

{-|

 Given our pattern, get all the prefixes of the pattern. For each of those
 prefixes, find the longest prefix from the original pattern that is also a
 suffix of the prefix segment being considered, and is not equal to it. The
 argument given to overlap is the length of the prefix matched so far, and the
 length of the longest prefix, which is a suffix and is not equal to it, is the
 value overlap returns.

 If a given prefix has no possible overlap, it is mapped to -1.

-}
overlap :: S.ByteString -> [(Int, Int)]
#ifndef __HADDOCK__
overlap pat =
 let patternLength = S.length pat
     -- Given an index into the pattern (representing a substring), find the longest prefix of
     -- the pattern which is a suffix of the substring given, without being
     -- equal to it.
     --
     -- patIdx represents the index of the last character in the prefix, not the
     -- character after it. Therefore, compare the pattern starting at the first
     -- character of the prefix, not the zeroth.
     longestSuffix !patIdx =
      let longestSuffix' !shiftPrefix !prefixIdx 
            | shiftPrefix == patIdx = 0 -- No match
            | shiftPrefix + prefixIdx == patIdx = prefixIdx -- Suffix found.
            -- Compare pattern to itself, but shifted, here.
            | U.unsafeIndex pat (shiftPrefix + prefixIdx) == U.unsafeIndex pat prefixIdx = longestSuffix' shiftPrefix (prefixIdx + 1)
            | otherwise = longestSuffix' (shiftPrefix + 1) 0
      in
        longestSuffix' 1 0
 in
  (0, 0) : [(matchLen, longestSuffix matchLen) | matchLen <- [1 .. patternLength - 1]]
  -- List.map (\prefix -> (fromIntegral $ S.length prefix, fromIntegral $ longestPreSuffix prefix)) prefixes
#endif

 
{-|
 Given a string representing a search pattern, this function
 returns a function which represents, for each prefix of that
 pattern, the maximally long prefix of the pattern which is a suffix
 of the indicated pattern segment.

 If there is no such prefix, 0 is returned.
 -}
computeLookup :: S.ByteString -> Unboxed.UArray Int Int
computeLookup pat =
 let patLen = fromIntegral $ S.length pat
     table :: Unboxed.UArray Int Int
     table = {-# SCC "computeLookup_table" #-} IArray.array (0, patLen - 1) (overlap pat)
 in table