{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Data.Text.BoyerMooreCI.Automaton
( Automaton
, CaseSensitivity (..)
, CodeUnitIndex (..)
, Next (..)
, buildAutomaton
, patternLength
, patternText
, runText
, minimumSkipForCodePoint
) where
import Control.DeepSeq (NFData)
import Control.Monad.ST (runST)
import Data.Hashable (Hashable (..))
import Data.Text.Internal (Text (..))
import GHC.Generics (Generic)
#if defined(HAS_AESON)
import qualified Data.Aeson as AE
#endif
import Data.Text.CaseSensitivity (CaseSensitivity (..))
import Data.Text.Utf8 (BackwardsIter (..), CodePoint, CodeUnitIndex (..))
import Data.TypedByteArray (Prim, TypedByteArray)
import qualified Data.Char as Char
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text
import qualified Data.Text.Utf8 as Utf8
import qualified Data.TypedByteArray as TBA
data Next a
= Done !a
| Step !a
data Automaton = Automaton
{ Automaton -> TypedByteArray CodePoint
automatonPattern :: !(TypedByteArray CodePoint)
, Automaton -> Int
automatonPatternHash :: !Int
, Automaton -> SuffixTable
automatonSuffixTable :: !SuffixTable
, Automaton -> BadCharLookup
automatonBadCharLookup :: !BadCharLookup
, Automaton -> CodeUnitIndex
automatonMinPatternSkip :: !CodeUnitIndex
}
deriving stock ((forall x. Automaton -> Rep Automaton x)
-> (forall x. Rep Automaton x -> Automaton) -> Generic Automaton
forall x. Rep Automaton x -> Automaton
forall x. Automaton -> Rep Automaton x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. Automaton -> Rep Automaton x
from :: forall x. Automaton -> Rep Automaton x
$cto :: forall x. Rep Automaton x -> Automaton
to :: forall x. Rep Automaton x -> Automaton
Generic, Int -> Automaton -> ShowS
[Automaton] -> ShowS
Automaton -> String
(Int -> Automaton -> ShowS)
-> (Automaton -> String)
-> ([Automaton] -> ShowS)
-> Show Automaton
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Automaton -> ShowS
showsPrec :: Int -> Automaton -> ShowS
$cshow :: Automaton -> String
show :: Automaton -> String
$cshowList :: [Automaton] -> ShowS
showList :: [Automaton] -> ShowS
Show)
deriving anyclass (Automaton -> ()
(Automaton -> ()) -> NFData Automaton
forall a. (a -> ()) -> NFData a
$crnf :: Automaton -> ()
rnf :: Automaton -> ()
NFData)
instance Hashable Automaton where
hashWithSalt :: Int -> Automaton -> Int
hashWithSalt Int
salt = Int -> Int -> Int
forall a. Hashable a => Int -> a -> Int
hashWithSalt Int
salt (Int -> Int) -> (Automaton -> Int) -> Automaton -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Automaton -> Int
automatonPatternHash
instance Eq Automaton where
Automaton
x == :: Automaton -> Automaton -> Bool
== Automaton
y = Automaton -> TypedByteArray CodePoint
automatonPattern Automaton
x TypedByteArray CodePoint -> TypedByteArray CodePoint -> Bool
forall a. Eq a => a -> a -> Bool
== Automaton -> TypedByteArray CodePoint
automatonPattern Automaton
y
#if defined(HAS_AESON)
instance AE.FromJSON Automaton where
parseJSON :: Value -> Parser Automaton
parseJSON Value
v = Text -> Automaton
buildAutomaton (Text -> Automaton) -> Parser Text -> Parser Automaton
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Value -> Parser Text
forall a. FromJSON a => Value -> Parser a
AE.parseJSON Value
v
instance AE.ToJSON Automaton where
toJSON :: Automaton -> Value
toJSON = Text -> Value
forall a. ToJSON a => a -> Value
AE.toJSON (Text -> Value) -> (Automaton -> Text) -> Automaton -> Value
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Automaton -> Text
patternText
#endif
buildAutomaton :: Text -> Automaton
buildAutomaton :: Text -> Automaton
buildAutomaton Text
pattern_ =
Automaton
{ automatonPattern :: TypedByteArray CodePoint
automatonPattern = TypedByteArray CodePoint
patternVec
, automatonPatternHash :: Int
automatonPatternHash = Text -> Int
forall a. Hashable a => a -> Int
hash Text
pattern_
, automatonSuffixTable :: SuffixTable
automatonSuffixTable = TypedByteArray CodePoint -> SuffixTable
buildSuffixTable TypedByteArray CodePoint
patternVec
, automatonBadCharLookup :: BadCharLookup
automatonBadCharLookup = TypedByteArray CodePoint -> BadCharLookup
buildBadCharLookup TypedByteArray CodePoint
patternVec
, automatonMinPatternSkip :: CodeUnitIndex
automatonMinPatternSkip = TypedByteArray CodePoint -> CodeUnitIndex
minimumSkipForVector TypedByteArray CodePoint
patternVec
}
where
patternVec :: TypedByteArray CodePoint
patternVec = String -> TypedByteArray CodePoint
forall a. Prim a => [a] -> TypedByteArray a
TBA.fromList (Text -> String
Text.unpack Text
pattern_)
runText :: forall a
. a
-> (a -> CodeUnitIndex -> CodeUnitIndex -> Next a)
-> Automaton
-> Text
-> a
{-# INLINE runText #-}
runText :: forall a.
a
-> (a -> CodeUnitIndex -> CodeUnitIndex -> Next a)
-> Automaton
-> Text
-> a
runText a
seed a -> CodeUnitIndex -> CodeUnitIndex -> Next a
f Automaton
automaton !Text
text
| TypedByteArray CodePoint -> Bool
forall a. TypedByteArray a -> Bool
TBA.null TypedByteArray CodePoint
pattern_ = a
seed
| Bool
otherwise = a -> CodeUnitIndex -> CodeUnitIndex -> a
alignPattern a
seed CodeUnitIndex
initialHaystackMin (CodeUnitIndex
initialHaystackMin CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
minPatternSkip CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1)
where
Automaton TypedByteArray CodePoint
pattern_ Int
_ SuffixTable
suffixTable BadCharLookup
badCharTable CodeUnitIndex
minPatternSkip = Automaton
automaton
haystackMax :: CodeUnitIndex
haystackMax = case Text
text of Text Array
_ Int
offset Int
len -> Int -> CodeUnitIndex
CodeUnitIndex (Int
offset Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
initialHaystackMin :: CodeUnitIndex
initialHaystackMin = case Text
text of Text Array
_ Int
offset Int
_ -> Int -> CodeUnitIndex
CodeUnitIndex Int
offset
alignPattern
:: a
-> CodeUnitIndex
-> CodeUnitIndex
-> a
{-# INLINE alignPattern #-}
alignPattern :: a -> CodeUnitIndex -> CodeUnitIndex -> a
alignPattern !a
result !CodeUnitIndex
haystackMin !CodeUnitIndex
alignmentEnd
| CodeUnitIndex
alignmentEnd CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
> CodeUnitIndex
haystackMax = a
result
| Bool
otherwise =
let
!iter :: BackwardsIter
iter = Array -> CodeUnitIndex -> BackwardsIter
Utf8.unsafeIndexAnywhereInCodePoint' (case Text
text of Text Array
d Int
_ Int
_ -> Array
d) CodeUnitIndex
alignmentEnd
!patternIndex :: Int
patternIndex = TypedByteArray CodePoint -> Int
forall a. Prim a => TypedByteArray a -> Int
TBA.length TypedByteArray CodePoint
pattern_ Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1
!alignmentEnd' :: CodeUnitIndex
alignmentEnd' = BackwardsIter -> CodeUnitIndex
backwardsIterEndOfChar BackwardsIter
iter
in
a -> CodeUnitIndex -> CodeUnitIndex -> BackwardsIter -> Int -> a
matchLoop a
result CodeUnitIndex
haystackMin CodeUnitIndex
alignmentEnd' BackwardsIter
iter Int
patternIndex
matchLoop
:: a
-> CodeUnitIndex
-> CodeUnitIndex
-> BackwardsIter
-> Int
-> a
matchLoop :: a -> CodeUnitIndex -> CodeUnitIndex -> BackwardsIter -> Int -> a
matchLoop !a
result !CodeUnitIndex
haystackMin !CodeUnitIndex
alignmentEnd !BackwardsIter
iter !Int
patternIndex =
let
!haystackCodePointLower :: CodePoint
haystackCodePointLower = CodePoint -> CodePoint
Utf8.lowerCodePoint (BackwardsIter -> CodePoint
backwardsIterChar BackwardsIter
iter)
in
case CodePoint
haystackCodePointLower CodePoint -> CodePoint -> Bool
forall a. Eq a => a -> a -> Bool
== TypedByteArray CodePoint -> Int -> CodePoint
forall a. Prim a => TypedByteArray a -> Int -> a
TBA.unsafeIndex TypedByteArray CodePoint
pattern_ Int
patternIndex of
Bool
True | Int
patternIndex Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 ->
let !from :: CodeUnitIndex
from = BackwardsIter -> CodeUnitIndex
backwardsIterNext BackwardsIter
iter CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
initialHaystackMin
!to :: CodeUnitIndex
to = CodeUnitIndex
alignmentEnd CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
initialHaystackMin
in
case a -> CodeUnitIndex -> CodeUnitIndex -> Next a
f a
result CodeUnitIndex
from CodeUnitIndex
to of
Done a
final -> a
final
Step a
intermediate ->
let haystackMin' :: CodeUnitIndex
haystackMin' = CodeUnitIndex
alignmentEnd CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1
alignmentEnd' :: CodeUnitIndex
alignmentEnd' = CodeUnitIndex
alignmentEnd CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
minPatternSkip
in a -> CodeUnitIndex -> CodeUnitIndex -> a
alignPattern a
intermediate CodeUnitIndex
haystackMin' CodeUnitIndex
alignmentEnd'
Bool
True | BackwardsIter -> CodeUnitIndex
backwardsIterNext BackwardsIter
iter CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
haystackMin ->
a -> CodeUnitIndex -> CodeUnitIndex -> a
alignPattern a
result CodeUnitIndex
haystackMin (CodeUnitIndex
alignmentEnd CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1)
Bool
True ->
let
next :: CodeUnitIndex
next = BackwardsIter -> CodeUnitIndex
backwardsIterNext BackwardsIter
iter
!iter' :: BackwardsIter
iter' = Array -> CodeUnitIndex -> BackwardsIter
Utf8.unsafeIndexEndOfCodePoint' (case Text
text of Text Array
d Int
_ Int
_ -> Array
d) CodeUnitIndex
next
in
a -> CodeUnitIndex -> CodeUnitIndex -> BackwardsIter -> Int -> a
matchLoop a
result CodeUnitIndex
haystackMin CodeUnitIndex
alignmentEnd BackwardsIter
iter' (Int
patternIndex Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
Bool
False ->
let
!fromBadChar :: CodeUnitIndex
fromBadChar =
BackwardsIter -> CodeUnitIndex
backwardsIterEndOfChar BackwardsIter
iter CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ BadCharLookup -> CodePoint -> CodeUnitIndex
badCharLookup BadCharLookup
badCharTable CodePoint
haystackCodePointLower
!fromSuffixLookup :: CodeUnitIndex
fromSuffixLookup =
CodeUnitIndex
alignmentEnd CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ SuffixTable -> Int -> CodeUnitIndex
suffixLookup SuffixTable
suffixTable Int
patternIndex
!alignmentEnd' :: CodeUnitIndex
alignmentEnd' = CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Ord a => a -> a -> a
max CodeUnitIndex
fromBadChar CodeUnitIndex
fromSuffixLookup
in
a -> CodeUnitIndex -> CodeUnitIndex -> a
alignPattern a
result CodeUnitIndex
haystackMin CodeUnitIndex
alignmentEnd'
patternLength :: Automaton -> CodeUnitIndex
patternLength :: Automaton -> CodeUnitIndex
patternLength = Text -> CodeUnitIndex
Utf8.lengthUtf8 (Text -> CodeUnitIndex)
-> (Automaton -> Text) -> Automaton -> CodeUnitIndex
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Automaton -> Text
patternText
patternText :: Automaton -> Text
patternText :: Automaton -> Text
patternText = String -> Text
Text.pack (String -> Text) -> (Automaton -> String) -> Automaton -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. TypedByteArray CodePoint -> String
forall a. Prim a => TypedByteArray a -> [a]
TBA.toList (TypedByteArray CodePoint -> String)
-> (Automaton -> TypedByteArray CodePoint) -> Automaton -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Automaton -> TypedByteArray CodePoint
automatonPattern
minimumSkipForCodePoint :: CodePoint -> CodeUnitIndex
minimumSkipForCodePoint :: CodePoint -> CodeUnitIndex
minimumSkipForCodePoint CodePoint
cp =
case CodePoint -> Int
Char.ord CodePoint
cp of
Int
c | Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x80 -> CodeUnitIndex
1
Int
c | Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x800 -> CodeUnitIndex
2
Int
0x2C65 -> CodeUnitIndex
2
Int
0x2C66 -> CodeUnitIndex
2
Int
c | Int
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0x10000 -> CodeUnitIndex
3
Int
_ -> CodeUnitIndex
4
minimumSkipForVector :: TypedByteArray CodePoint -> CodeUnitIndex
minimumSkipForVector :: TypedByteArray CodePoint -> CodeUnitIndex
minimumSkipForVector = (CodePoint -> CodeUnitIndex -> CodeUnitIndex)
-> CodeUnitIndex -> TypedByteArray CodePoint -> CodeUnitIndex
forall a b. Prim a => (a -> b -> b) -> b -> TypedByteArray a -> b
TBA.foldr (\CodePoint
cp CodeUnitIndex
s -> CodeUnitIndex
s CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodePoint -> CodeUnitIndex
minimumSkipForCodePoint CodePoint
cp) CodeUnitIndex
0
newtype SuffixTable = SuffixTable (TypedByteArray CodeUnitIndex)
deriving stock ((forall x. SuffixTable -> Rep SuffixTable x)
-> (forall x. Rep SuffixTable x -> SuffixTable)
-> Generic SuffixTable
forall x. Rep SuffixTable x -> SuffixTable
forall x. SuffixTable -> Rep SuffixTable x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. SuffixTable -> Rep SuffixTable x
from :: forall x. SuffixTable -> Rep SuffixTable x
$cto :: forall x. Rep SuffixTable x -> SuffixTable
to :: forall x. Rep SuffixTable x -> SuffixTable
Generic)
deriving anyclass (SuffixTable -> ()
(SuffixTable -> ()) -> NFData SuffixTable
forall a. (a -> ()) -> NFData a
$crnf :: SuffixTable -> ()
rnf :: SuffixTable -> ()
NFData)
instance Show SuffixTable where
show :: SuffixTable -> String
show (SuffixTable TypedByteArray CodeUnitIndex
table) = String
"SuffixTable (TBA.toList " String -> ShowS
forall a. Semigroup a => a -> a -> a
<> [CodeUnitIndex] -> String
forall a. Show a => a -> String
show (TypedByteArray CodeUnitIndex -> [CodeUnitIndex]
forall a. Prim a => TypedByteArray a -> [a]
TBA.toList TypedByteArray CodeUnitIndex
table) String -> ShowS
forall a. Semigroup a => a -> a -> a
<> String
")"
suffixLookup :: SuffixTable -> Int -> CodeUnitIndex
{-# INLINE suffixLookup #-}
suffixLookup :: SuffixTable -> Int -> CodeUnitIndex
suffixLookup (SuffixTable TypedByteArray CodeUnitIndex
table) = TypedByteArray CodeUnitIndex -> Int -> CodeUnitIndex
forall a. Prim a => TypedByteArray a -> Int -> a
indexTable TypedByteArray CodeUnitIndex
table
buildSuffixTable :: TypedByteArray CodePoint -> SuffixTable
buildSuffixTable :: TypedByteArray CodePoint -> SuffixTable
buildSuffixTable TypedByteArray CodePoint
pattern_ = (forall s. ST s SuffixTable) -> SuffixTable
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s SuffixTable) -> SuffixTable)
-> (forall s. ST s SuffixTable) -> SuffixTable
forall a b. (a -> b) -> a -> b
$ do
let
patLen :: Int
patLen = TypedByteArray CodePoint -> Int
forall a. Prim a => TypedByteArray a -> Int
TBA.length TypedByteArray CodePoint
pattern_
wholePatternSkip :: CodeUnitIndex
wholePatternSkip = TypedByteArray CodePoint -> CodeUnitIndex
minimumSkipForVector TypedByteArray CodePoint
pattern_
MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
table <- Int
-> ST s (MutableTypedByteArray CodeUnitIndex (PrimState (ST s)))
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
Int -> m (MutableTypedByteArray a (PrimState m))
TBA.newTypedByteArray Int
patLen
let
init1 :: CodeUnitIndex -> Int -> m ()
init1 CodeUnitIndex
lastSkipBytes Int
p
| Int
p Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0 = do
let
skipBytes :: CodeUnitIndex
skipBytes = case TypedByteArray CodePoint -> Int -> Maybe CodeUnitIndex
suffixIsPrefix TypedByteArray CodePoint
pattern_ (Int
p Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) of
Maybe CodeUnitIndex
Nothing -> CodeUnitIndex
lastSkipBytes
Just CodeUnitIndex
nonSkippableBytes -> CodeUnitIndex
wholePatternSkip CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
nonSkippableBytes
MutableTypedByteArray CodeUnitIndex (PrimState m)
-> Int -> CodeUnitIndex -> m ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutableTypedByteArray a (PrimState m) -> Int -> a -> m ()
TBA.writeTypedByteArray MutableTypedByteArray CodeUnitIndex (PrimState m)
MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
table Int
p CodeUnitIndex
skipBytes
CodeUnitIndex -> Int -> m ()
init1 CodeUnitIndex
skipBytes (Int
p Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
| Bool
otherwise = () -> m ()
forall a. a -> m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
init2 :: Int -> CodeUnitIndex -> ST s ()
init2 Int
p CodeUnitIndex
skipBytes
| Int
p Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
patLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1 = do
let skipBytes' :: CodeUnitIndex
skipBytes' = CodeUnitIndex
skipBytes CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodePoint -> CodeUnitIndex
minimumSkipForCodePoint (TypedByteArray CodePoint -> Int -> CodePoint
forall a. Prim a => TypedByteArray a -> Int -> a
TBA.unsafeIndex TypedByteArray CodePoint
pattern_ Int
p)
case TypedByteArray CodePoint -> Int -> Maybe Int
substringIsSuffix TypedByteArray CodePoint
pattern_ Int
p of
Maybe Int
Nothing -> () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
Just Int
suffixLen -> do
MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
-> Int -> CodeUnitIndex -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutableTypedByteArray a (PrimState m) -> Int -> a -> m ()
TBA.writeTypedByteArray MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
table (Int
patLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
suffixLen) CodeUnitIndex
skipBytes'
Int -> CodeUnitIndex -> ST s ()
init2 (Int
p Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) CodeUnitIndex
skipBytes'
| Bool
otherwise = () -> ST s ()
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
CodeUnitIndex -> Int -> ST s ()
forall {m :: * -> *}.
(PrimState m ~ PrimState (ST s), PrimMonad m) =>
CodeUnitIndex -> Int -> m ()
init1 (CodeUnitIndex
wholePatternSkipCodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
-CodeUnitIndex
1) (Int
patLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)
Int -> CodeUnitIndex -> ST s ()
init2 Int
0 CodeUnitIndex
wholePatternSkip
MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
-> Int -> CodeUnitIndex -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutableTypedByteArray a (PrimState m) -> Int -> a -> m ()
TBA.writeTypedByteArray MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
table (Int
patLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1) (Int -> CodeUnitIndex
CodeUnitIndex Int
1)
TypedByteArray CodeUnitIndex -> SuffixTable
SuffixTable (TypedByteArray CodeUnitIndex -> SuffixTable)
-> ST s (TypedByteArray CodeUnitIndex) -> ST s SuffixTable
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
-> ST s (TypedByteArray CodeUnitIndex)
forall (m :: * -> *) a.
PrimMonad m =>
MutableTypedByteArray a (PrimState m) -> m (TypedByteArray a)
TBA.unsafeFreezeTypedByteArray MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
table
suffixIsPrefix :: TypedByteArray CodePoint -> Int -> Maybe CodeUnitIndex
suffixIsPrefix :: TypedByteArray CodePoint -> Int -> Maybe CodeUnitIndex
suffixIsPrefix TypedByteArray CodePoint
pattern_ Int
pos = Int -> CodeUnitIndex -> Maybe CodeUnitIndex
go Int
0 (Int -> CodeUnitIndex
CodeUnitIndex Int
0)
where
suffixLen :: Int
suffixLen = TypedByteArray CodePoint -> Int
forall a. Prim a => TypedByteArray a -> Int
TBA.length TypedByteArray CodePoint
pattern_ Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
pos
go :: Int -> CodeUnitIndex -> Maybe CodeUnitIndex
go !Int
i !CodeUnitIndex
skipBytes
| Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
suffixLen =
let prefixChar :: CodePoint
prefixChar = TypedByteArray CodePoint -> Int -> CodePoint
forall a. Prim a => TypedByteArray a -> Int -> a
TBA.unsafeIndex TypedByteArray CodePoint
pattern_ Int
i in
if CodePoint
prefixChar CodePoint -> CodePoint -> Bool
forall a. Eq a => a -> a -> Bool
== TypedByteArray CodePoint -> Int -> CodePoint
forall a. Prim a => TypedByteArray a -> Int -> a
TBA.unsafeIndex TypedByteArray CodePoint
pattern_ (Int
pos Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i)
then Int -> CodeUnitIndex -> Maybe CodeUnitIndex
go (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (CodeUnitIndex
skipBytes CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodePoint -> CodeUnitIndex
minimumSkipForCodePoint CodePoint
prefixChar)
else Maybe CodeUnitIndex
forall a. Maybe a
Nothing
| Bool
otherwise = CodeUnitIndex -> Maybe CodeUnitIndex
forall a. a -> Maybe a
Just CodeUnitIndex
skipBytes
substringIsSuffix :: TypedByteArray CodePoint -> Int -> Maybe Int
substringIsSuffix :: TypedByteArray CodePoint -> Int -> Maybe Int
substringIsSuffix TypedByteArray CodePoint
pattern_ Int
pos = Int -> Maybe Int
go Int
0
where
patLen :: Int
patLen = TypedByteArray CodePoint -> Int
forall a. Prim a => TypedByteArray a -> Int
TBA.length TypedByteArray CodePoint
pattern_
go :: Int -> Maybe Int
go Int
i | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
pos = Maybe Int
forall a. Maybe a
Nothing
| TypedByteArray CodePoint -> Int -> CodePoint
forall a. Prim a => TypedByteArray a -> Int -> a
TBA.unsafeIndex TypedByteArray CodePoint
pattern_ (Int
pos Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i) CodePoint -> CodePoint -> Bool
forall a. Eq a => a -> a -> Bool
== TypedByteArray CodePoint -> Int -> CodePoint
forall a. Prim a => TypedByteArray a -> Int -> a
TBA.unsafeIndex TypedByteArray CodePoint
pattern_ (Int
patLen Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1 Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
i) =
Int -> Maybe Int
go (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1)
| Int
i Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0 = Maybe Int
forall a. Maybe a
Nothing
| Bool
otherwise = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
i
data BadCharLookup = BadCharLookup
{ BadCharLookup -> TypedByteArray CodeUnitIndex
badCharLookupTable :: {-# UNPACK #-} !(TypedByteArray CodeUnitIndex)
, BadCharLookup -> HashMap CodePoint CodeUnitIndex
badCharLookupMap :: !(HashMap.HashMap CodePoint CodeUnitIndex)
, BadCharLookup -> CodeUnitIndex
badCharLookupDefault :: !CodeUnitIndex
}
deriving stock ((forall x. BadCharLookup -> Rep BadCharLookup x)
-> (forall x. Rep BadCharLookup x -> BadCharLookup)
-> Generic BadCharLookup
forall x. Rep BadCharLookup x -> BadCharLookup
forall x. BadCharLookup -> Rep BadCharLookup x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. BadCharLookup -> Rep BadCharLookup x
from :: forall x. BadCharLookup -> Rep BadCharLookup x
$cto :: forall x. Rep BadCharLookup x -> BadCharLookup
to :: forall x. Rep BadCharLookup x -> BadCharLookup
Generic, Int -> BadCharLookup -> ShowS
[BadCharLookup] -> ShowS
BadCharLookup -> String
(Int -> BadCharLookup -> ShowS)
-> (BadCharLookup -> String)
-> ([BadCharLookup] -> ShowS)
-> Show BadCharLookup
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> BadCharLookup -> ShowS
showsPrec :: Int -> BadCharLookup -> ShowS
$cshow :: BadCharLookup -> String
show :: BadCharLookup -> String
$cshowList :: [BadCharLookup] -> ShowS
showList :: [BadCharLookup] -> ShowS
Show)
deriving anyclass (BadCharLookup -> ()
(BadCharLookup -> ()) -> NFData BadCharLookup
forall a. (a -> ()) -> NFData a
$crnf :: BadCharLookup -> ()
rnf :: BadCharLookup -> ()
NFData)
badCharTableSize :: Int
{-# INLINE badCharTableSize #-}
badCharTableSize :: Int
badCharTableSize = Int
256
badCharLookup :: BadCharLookup -> CodePoint -> CodeUnitIndex
{-# INLINE badCharLookup #-}
badCharLookup :: BadCharLookup -> CodePoint -> CodeUnitIndex
badCharLookup (BadCharLookup TypedByteArray CodeUnitIndex
bclTable HashMap CodePoint CodeUnitIndex
bclMap CodeUnitIndex
bclDefault) CodePoint
char
| Int
intChar Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
badCharTableSize = TypedByteArray CodeUnitIndex -> Int -> CodeUnitIndex
forall a. Prim a => TypedByteArray a -> Int -> a
indexTable TypedByteArray CodeUnitIndex
bclTable Int
intChar
| Bool
otherwise = CodeUnitIndex
-> CodePoint -> HashMap CodePoint CodeUnitIndex -> CodeUnitIndex
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
HashMap.lookupDefault CodeUnitIndex
bclDefault CodePoint
char HashMap CodePoint CodeUnitIndex
bclMap
where
intChar :: Int
intChar = CodePoint -> Int
forall a. Enum a => a -> Int
fromEnum CodePoint
char
buildBadCharLookup :: TypedByteArray CodePoint -> BadCharLookup
buildBadCharLookup :: TypedByteArray CodePoint -> BadCharLookup
buildBadCharLookup TypedByteArray CodePoint
pattern_ = (forall s. ST s BadCharLookup) -> BadCharLookup
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s BadCharLookup) -> BadCharLookup)
-> (forall s. ST s BadCharLookup) -> BadCharLookup
forall a b. (a -> b) -> a -> b
$ do
let
defaultSkip :: CodeUnitIndex
defaultSkip = TypedByteArray CodePoint -> CodeUnitIndex
minimumSkipForVector TypedByteArray CodePoint
pattern_
MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
table <- (Int
-> CodeUnitIndex
-> ST s (MutableTypedByteArray CodeUnitIndex (PrimState (ST s)))
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
Int -> a -> m (MutableTypedByteArray a (PrimState m))
TBA.replicate Int
badCharTableSize CodeUnitIndex
defaultSkip)
let
fillTable :: HashMap CodePoint CodeUnitIndex
-> CodeUnitIndex
-> String
-> ST s (HashMap CodePoint CodeUnitIndex)
fillTable !HashMap CodePoint CodeUnitIndex
badCharMap !CodeUnitIndex
skipBytes = \case
[] -> HashMap CodePoint CodeUnitIndex
-> ST s (HashMap CodePoint CodeUnitIndex)
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure HashMap CodePoint CodeUnitIndex
badCharMap
[CodePoint
_] -> HashMap CodePoint CodeUnitIndex
-> ST s (HashMap CodePoint CodeUnitIndex)
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure HashMap CodePoint CodeUnitIndex
badCharMap
(!CodePoint
patChar : !String
patChars) ->
let skipBytes' :: CodeUnitIndex
skipBytes' = CodeUnitIndex
skipBytes CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodePoint -> CodeUnitIndex
minimumSkipForCodePoint CodePoint
patChar in
if CodePoint -> Int
forall a. Enum a => a -> Int
fromEnum CodePoint
patChar Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
badCharTableSize
then do
MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
-> Int -> CodeUnitIndex -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutableTypedByteArray a (PrimState m) -> Int -> a -> m ()
TBA.writeTypedByteArray MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
table (CodePoint -> Int
forall a. Enum a => a -> Int
fromEnum CodePoint
patChar) CodeUnitIndex
skipBytes'
HashMap CodePoint CodeUnitIndex
-> CodeUnitIndex
-> String
-> ST s (HashMap CodePoint CodeUnitIndex)
fillTable HashMap CodePoint CodeUnitIndex
badCharMap CodeUnitIndex
skipBytes' String
patChars
else
let badCharMap' :: HashMap CodePoint CodeUnitIndex
badCharMap' = CodePoint
-> CodeUnitIndex
-> HashMap CodePoint CodeUnitIndex
-> HashMap CodePoint CodeUnitIndex
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
HashMap.insert CodePoint
patChar CodeUnitIndex
skipBytes' HashMap CodePoint CodeUnitIndex
badCharMap
in HashMap CodePoint CodeUnitIndex
-> CodeUnitIndex
-> String
-> ST s (HashMap CodePoint CodeUnitIndex)
fillTable HashMap CodePoint CodeUnitIndex
badCharMap' CodeUnitIndex
skipBytes' String
patChars
HashMap CodePoint CodeUnitIndex
badCharMap <- HashMap CodePoint CodeUnitIndex
-> CodeUnitIndex
-> String
-> ST s (HashMap CodePoint CodeUnitIndex)
fillTable HashMap CodePoint CodeUnitIndex
forall k v. HashMap k v
HashMap.empty CodeUnitIndex
defaultSkip (TypedByteArray CodePoint -> String
forall a. Prim a => TypedByteArray a -> [a]
TBA.toList TypedByteArray CodePoint
pattern_)
TypedByteArray CodeUnitIndex
tableFrozen <- MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
-> ST s (TypedByteArray CodeUnitIndex)
forall (m :: * -> *) a.
PrimMonad m =>
MutableTypedByteArray a (PrimState m) -> m (TypedByteArray a)
TBA.unsafeFreezeTypedByteArray MutableTypedByteArray CodeUnitIndex (PrimState (ST s))
table
BadCharLookup -> ST s BadCharLookup
forall a. a -> ST s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure BadCharLookup
{ badCharLookupTable :: TypedByteArray CodeUnitIndex
badCharLookupTable = TypedByteArray CodeUnitIndex
tableFrozen
, badCharLookupMap :: HashMap CodePoint CodeUnitIndex
badCharLookupMap = HashMap CodePoint CodeUnitIndex
badCharMap
, badCharLookupDefault :: CodeUnitIndex
badCharLookupDefault = CodeUnitIndex
defaultSkip
}
indexTable :: Prim a => TypedByteArray a -> Int -> a
{-# INLINE indexTable #-}
indexTable :: forall a. Prim a => TypedByteArray a -> Int -> a
indexTable = TypedByteArray a -> Int -> a
forall a. Prim a => TypedByteArray a -> Int -> a
TBA.unsafeIndex