{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnliftedFFITypes #-}

-- |
-- Module      : Data.Text.Internal.Search
-- Copyright   : (c) Bryan O'Sullivan 2009
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- Fast substring search for 'Text', based on work by Boyer, Moore,
-- Horspool, Sunday, and Lundh.
--
-- References:
--
-- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm.
--   Communications of the ACM, 20, 10, 762-772 (1977)
--
-- * R. N. Horspool: Practical Fast Searching in Strings.  Software -
--   Practice and Experience 10, 501-506 (1980)
--
-- * D. M. Sunday: A Very Fast Substring Search Algorithm.
--   Communications of the ACM, 33, 8, 132-142 (1990)
--
-- * F. Lundh: The Fast Search Algorithm.
--   <http://effbot.org/zone/stringlib.htm> (2006)

module Data.Text.Internal.Search
    (
      indices
    ) where

import qualified Data.Text.Array as A
import Data.Word (Word64, Word8)
import Data.Text.Internal (Text(..))
import Data.Bits ((.|.), (.&.), unsafeShiftL)
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Foreign.C.Types
import GHC.Exts (ByteArray#)
import System.Posix.Types (CSsize(..))

data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int

-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text                -- ^ Substring to search for (@needle@)
        -> Text                -- ^ Text to search in (@haystack@)
        -> [Int]
indices :: Text -> Text -> [Int]
indices needle :: Text
needle@(Text Array
narr Int
noff Int
nlen)
  | Int
nlen Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1 = Word8 -> Text -> [Int]
scanOne (Array -> Int -> Word8
A.unsafeIndex Array
narr Int
noff)
  | Int
nlen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0 = [Int] -> Text -> [Int]
forall a b. a -> b -> a
const []
  | Bool
otherwise = Text -> Text -> [Int]
indices' Text
needle
{-# INLINE indices #-}

-- | nlen must be >= 2, otherwise nindex causes access violation
indices' :: Text -> Text -> [Int]
indices' :: Text -> Text -> [Int]
indices' (Text Array
narr Int
noff Int
nlen) (Text harr :: Array
harr@(A.ByteArray ByteArray#
harr#) Int
hoff Int
hlen) = Int -> [Int]
loop (Int
hoff Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen)
  where
    nlast :: Int
nlast    = Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
    !z :: Word8
z       = Int -> Word8
nindex Int
nlast
    nindex :: Int -> Word8
nindex Int
k = Array -> Int -> Word8
A.unsafeIndex Array
narr (Int
noffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
k)
    buildTable :: Int -> Word64 -> Int -> T
buildTable !Int
i !Word64
msk !Int
skp
        | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
nlast           = (Word64
msk Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.|. Word8 -> Word64
swizzle Word8
z) Word64 -> Int -> T
:* Int
skp
        | Bool
otherwise            = Int -> Word64 -> Int -> T
buildTable (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1) (Word64
msk Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.|. Word8 -> Word64
swizzle Word8
c) Int
skp'
        where !c :: Word8
c               = Int -> Word8
nindex Int
i
              skp' :: Int
skp' | Word8
c Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
z    = Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
2
                   | Bool
otherwise = Int
skp
    !(Word64
mask :* Int
skip) = Int -> Word64 -> Int -> T
buildTable Int
0 Word64
0 (Int
nlenInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
2)

    swizzle :: Word8 -> Word64
    swizzle :: Word8 -> Word64
swizzle !Word8
k = Word64
1 Word64 -> Int -> Word64
forall a. Bits a => a -> Int -> a
`unsafeShiftL` (Word8 -> Int
word8ToInt Word8
k Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f)

    loop :: Int -> [Int]
loop !Int
i
      | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
hlen Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
hoff
      = []
      | Array -> Int -> Word8
A.unsafeIndex Array
harr (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
z
      = if Array -> Int -> Array -> Int -> Int -> Bool
A.equal Array
narr Int
noff Array
harr (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nlen) Int
nlen
        then Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
hoff Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: Int -> [Int]
loop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen)
        else                   Int -> [Int]
loop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
skip Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
      | Int
i Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
hlen Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
hoff
      = []
      | Word64
mask Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.&. Word8 -> Word64
swizzle (Array -> Int -> Word8
A.unsafeIndex Array
harr Int
i) Word64 -> Word64 -> Bool
forall a. Eq a => a -> a -> Bool
== Word64
0
      = Int -> [Int]
loop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
nlen Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
      | Bool
otherwise
      = case IO CSsize -> CSsize
forall a. IO a -> a
unsafeDupablePerformIO (IO CSsize -> CSsize) -> IO CSsize -> CSsize
forall a b. (a -> b) -> a -> b
$ ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize
memchr ByteArray#
harr# (Int -> CSize
intToCSize Int
i) (Int -> CSize
intToCSize (Int
hlen Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
hoff Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i)) Word8
z of
        -1 -> []
        CSsize
x  -> Int -> [Int]
loop (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ CSsize -> Int
cSsizeToInt CSsize
x Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
{-# INLINE indices' #-}

scanOne :: Word8 -> Text -> [Int]
scanOne :: Word8 -> Text -> [Int]
scanOne Word8
c (Text Array
harr Int
hoff Int
hlen) = Int -> [Int]
loop Int
0
  where
    loop :: Int -> [Int]
loop !Int
i
      | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
hlen                        = []
      | Array -> Int -> Word8
A.unsafeIndex Array
harr (Int
hoffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
i) Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== Word8
c = Int
i Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: Int -> [Int]
loop (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
      | Bool
otherwise                        = Int -> [Int]
loop (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1)
{-# INLINE scanOne #-}

word8ToInt :: Word8 -> Int
word8ToInt :: Word8 -> Int
word8ToInt = Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

intToCSize :: Int -> CSize
intToCSize :: Int -> CSize
intToCSize = Int -> CSize
forall a b. (Integral a, Num b) => a -> b
fromIntegral

cSsizeToInt :: CSsize -> Int
cSsizeToInt :: CSsize -> Int
cSsizeToInt = CSsize -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral

foreign import ccall unsafe "_hs_text_memchr" memchr
    :: ByteArray# -> CSize -> CSize -> Word8 -> IO CSsize