{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Data.Text.BoyerMoore.Automaton
( Automaton
, CaseSensitivity (..)
, CodeUnitIndex (..)
, Next (..)
, buildAutomaton
, patternLength
, patternText
, runLower
, runText
) where
import Prelude hiding (length)
import Control.DeepSeq (NFData)
import Control.Monad (when)
import Control.Monad.ST (runST)
import Data.Hashable (Hashable (..), Hashed, hashed, unhashed)
import Data.Text.Internal (Text (..))
import Data.TypedByteArray (Prim, TypedByteArray)
import GHC.Generics (Generic)
#if defined(HAS_AESON)
import qualified Data.Aeson as AE
#endif
import qualified Data.HashMap.Strict as HashMap
import Data.Text.AhoCorasick.Automaton (Next (..))
import Data.Text.CaseSensitivity (CaseSensitivity (..))
import Data.Text.Utf16 (CodeUnit, CodeUnitIndex (..), lengthUtf16, lowerCodeUnit, unsafeIndexUtf16)
import qualified Data.TypedByteArray as TBA
data Automaton = Automaton
{ Automaton -> Hashed Text
automatonPattern :: Hashed Text
, Automaton -> SuffixTable
automatonSuffixTable :: SuffixTable
, Automaton -> BadCharTable
automatonBadCharTable :: BadCharTable
}
deriving stock ((forall x. Automaton -> Rep Automaton x)
-> (forall x. Rep Automaton x -> Automaton) -> Generic Automaton
forall x. Rep Automaton x -> Automaton
forall x. Automaton -> Rep Automaton x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep Automaton x -> Automaton
$cfrom :: forall x. Automaton -> Rep Automaton x
Generic, Int -> Automaton -> ShowS
[Automaton] -> ShowS
Automaton -> String
(Int -> Automaton -> ShowS)
-> (Automaton -> String)
-> ([Automaton] -> ShowS)
-> Show Automaton
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Automaton] -> ShowS
$cshowList :: [Automaton] -> ShowS
show :: Automaton -> String
$cshow :: Automaton -> String
showsPrec :: Int -> Automaton -> ShowS
$cshowsPrec :: Int -> Automaton -> ShowS
Show)
deriving anyclass (Automaton -> ()
(Automaton -> ()) -> NFData Automaton
forall a. (a -> ()) -> NFData a
rnf :: Automaton -> ()
$crnf :: Automaton -> ()
NFData)
instance Hashable Automaton where
hashWithSalt :: Int -> Automaton -> Int
hashWithSalt Int
salt (Automaton Hashed Text
pattern SuffixTable
_ BadCharTable
_) = Int -> Hashed Text -> Int
forall a. Hashable a => Int -> a -> Int
hashWithSalt Int
salt Hashed Text
pattern
instance Eq Automaton where
(Automaton Hashed Text
pat1 SuffixTable
_ BadCharTable
_) == :: Automaton -> Automaton -> Bool
== (Automaton Hashed Text
pat2 SuffixTable
_ BadCharTable
_) = Hashed Text
pat1 Hashed Text -> Hashed Text -> Bool
forall a. Eq a => a -> a -> Bool
== Hashed Text
pat2
#if defined(HAS_AESON)
instance AE.FromJSON Automaton where
parseJSON :: Value -> Parser Automaton
parseJSON Value
v = Text -> Automaton
buildAutomaton (Text -> Automaton) -> Parser Text -> Parser Automaton
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Value -> Parser Text
forall a. FromJSON a => Value -> Parser a
AE.parseJSON Value
v
instance AE.ToJSON Automaton where
toJSON :: Automaton -> Value
toJSON = Text -> Value
forall a. ToJSON a => a -> Value
AE.toJSON (Text -> Value) -> (Automaton -> Text) -> Automaton -> Value
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Hashed Text -> Text
forall a. Hashed a -> a
unhashed (Hashed Text -> Text)
-> (Automaton -> Hashed Text) -> Automaton -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Automaton -> Hashed Text
automatonPattern
#endif
buildAutomaton :: Text -> Automaton
buildAutomaton :: Text -> Automaton
buildAutomaton Text
pattern = Hashed Text -> SuffixTable -> BadCharTable -> Automaton
Automaton (Text -> Hashed Text
forall a. Hashable a => a -> Hashed a
hashed Text
pattern) (Text -> SuffixTable
buildSuffixTable Text
pattern) (Text -> BadCharTable
buildBadCharTable Text
pattern)
runWithCase
:: forall a
. CaseSensitivity
-> a
-> (a -> CodeUnitIndex -> Next a)
-> Automaton
-> Text
-> a
{-# INLINE runWithCase #-}
runWithCase :: CaseSensitivity
-> a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
runWithCase CaseSensitivity
caseSensitivity a
seed a -> CodeUnitIndex -> Next a
f Automaton
automaton Text
text
| CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Eq a => a -> a -> Bool
== CodeUnitIndex
0 = a
seed
| Bool
otherwise = a -> CodeUnitIndex -> a
go a
seed (CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1)
where
Automaton Hashed Text
patternHashed SuffixTable
suffixTable BadCharTable
badCharTable = Automaton
automaton
pattern :: Text
pattern = Hashed Text -> Text
forall a. Hashed a -> a
unhashed Hashed Text
patternHashed
patLen :: CodeUnitIndex
patLen = Text -> CodeUnitIndex
lengthUtf16 Text
pattern
stringLen :: CodeUnitIndex
stringLen = Text -> CodeUnitIndex
lengthUtf16 Text
text
inputCasedAt :: CodeUnitIndex -> CodeUnit
inputCasedAt = case CaseSensitivity
caseSensitivity of
CaseSensitivity
CaseSensitive -> Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
text
CaseSensitivity
IgnoreCase -> CodeUnit -> CodeUnit
lowerCodeUnit (CodeUnit -> CodeUnit)
-> (CodeUnitIndex -> CodeUnit) -> CodeUnitIndex -> CodeUnit
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
text
go :: a -> CodeUnitIndex -> a
go a
result CodeUnitIndex
haystackIndex
| CodeUnitIndex
haystackIndex CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
stringLen = a -> CodeUnitIndex -> CodeUnitIndex -> a
matchLoop a
result CodeUnitIndex
haystackIndex (CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1)
| Bool
otherwise = a
result
{-# INLINE go #-}
matchLoop :: a -> CodeUnitIndex -> CodeUnitIndex -> a
matchLoop a
result CodeUnitIndex
haystackIndex CodeUnitIndex
needleIndex
| CodeUnitIndex
needleIndex CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnitIndex
0 Bool -> Bool -> Bool
&& CodeUnitIndex -> CodeUnit
inputCasedAt CodeUnitIndex
haystackIndex CodeUnit -> CodeUnit -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
pattern CodeUnitIndex
needleIndex =
a -> CodeUnitIndex -> CodeUnitIndex -> a
matchLoop a
result (CodeUnitIndex
haystackIndex CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1) (CodeUnitIndex
needleIndex CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1)
| CodeUnitIndex
needleIndex CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
0 =
case a -> CodeUnitIndex -> Next a
f a
result (CodeUnitIndex
haystackIndex CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1) of
Done a
final -> a
final
Step a
intermediate -> a -> CodeUnitIndex -> a
go a
intermediate (CodeUnitIndex
haystackIndex CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
2 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
* CodeUnitIndex
patLen)
| Bool
otherwise =
let
badCharSkip :: CodeUnitIndex
badCharSkip = BadCharTable -> CodeUnit -> CodeUnitIndex
badCharLookup BadCharTable
badCharTable (CodeUnitIndex -> CodeUnit
inputCasedAt CodeUnitIndex
haystackIndex)
suffixSkip :: CodeUnitIndex
suffixSkip = SuffixTable -> CodeUnitIndex -> CodeUnitIndex
suffixLookup SuffixTable
suffixTable CodeUnitIndex
needleIndex
skip :: CodeUnitIndex
skip = CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Ord a => a -> a -> a
max CodeUnitIndex
badCharSkip CodeUnitIndex
suffixSkip
in
a -> CodeUnitIndex -> a
go a
result (CodeUnitIndex
haystackIndex CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
skip)
{-# INLINE runText #-}
runText :: forall a. a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
runText :: a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
runText = CaseSensitivity
-> a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
forall a.
CaseSensitivity
-> a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
runWithCase CaseSensitivity
CaseSensitive
{-# INLINE runLower #-}
runLower :: forall a. a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
runLower :: a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
runLower = CaseSensitivity
-> a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
forall a.
CaseSensitivity
-> a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
runWithCase CaseSensitivity
IgnoreCase
patternLength :: Automaton -> CodeUnitIndex
patternLength :: Automaton -> CodeUnitIndex
patternLength = Text -> CodeUnitIndex
lengthUtf16 (Text -> CodeUnitIndex)
-> (Automaton -> Text) -> Automaton -> CodeUnitIndex
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Automaton -> Text
patternText
patternText :: Automaton -> Text
patternText :: Automaton -> Text
patternText (Automaton Hashed Text
pattern SuffixTable
_ BadCharTable
_) = Hashed Text -> Text
forall a. Hashed a -> a
unhashed Hashed Text
pattern
newtype SuffixTable = SuffixTable (TypedByteArray Int)
deriving stock ((forall x. SuffixTable -> Rep SuffixTable x)
-> (forall x. Rep SuffixTable x -> SuffixTable)
-> Generic SuffixTable
forall x. Rep SuffixTable x -> SuffixTable
forall x. SuffixTable -> Rep SuffixTable x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep SuffixTable x -> SuffixTable
$cfrom :: forall x. SuffixTable -> Rep SuffixTable x
Generic, Int -> SuffixTable -> ShowS
[SuffixTable] -> ShowS
SuffixTable -> String
(Int -> SuffixTable -> ShowS)
-> (SuffixTable -> String)
-> ([SuffixTable] -> ShowS)
-> Show SuffixTable
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [SuffixTable] -> ShowS
$cshowList :: [SuffixTable] -> ShowS
show :: SuffixTable -> String
$cshow :: SuffixTable -> String
showsPrec :: Int -> SuffixTable -> ShowS
$cshowsPrec :: Int -> SuffixTable -> ShowS
Show)
deriving anyclass (SuffixTable -> ()
(SuffixTable -> ()) -> NFData SuffixTable
forall a. (a -> ()) -> NFData a
rnf :: SuffixTable -> ()
$crnf :: SuffixTable -> ()
NFData)
suffixLookup :: SuffixTable -> CodeUnitIndex -> CodeUnitIndex
{-# INLINE suffixLookup #-}
suffixLookup :: SuffixTable -> CodeUnitIndex -> CodeUnitIndex
suffixLookup (SuffixTable TypedByteArray Int
table) = Int -> CodeUnitIndex
CodeUnitIndex (Int -> CodeUnitIndex)
-> (CodeUnitIndex -> Int) -> CodeUnitIndex -> CodeUnitIndex
forall b c a. (b -> c) -> (a -> b) -> a -> c
. TypedByteArray Int -> Int -> Int
forall a. Prim a => TypedByteArray a -> Int -> a
indexTable TypedByteArray Int
table (Int -> Int) -> (CodeUnitIndex -> Int) -> CodeUnitIndex -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CodeUnitIndex -> Int
codeUnitIndex
buildSuffixTable :: Text -> SuffixTable
buildSuffixTable :: Text -> SuffixTable
buildSuffixTable Text
pattern = (forall s. ST s SuffixTable) -> SuffixTable
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s SuffixTable) -> SuffixTable)
-> (forall s. ST s SuffixTable) -> SuffixTable
forall a b. (a -> b) -> a -> b
$ do
let patLen :: CodeUnitIndex
patLen = Text -> CodeUnitIndex
lengthUtf16 Text
pattern
MutableTypedByteArray Int s
table <- Int -> ST s (MutableTypedByteArray Int (PrimState (ST s)))
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
Int -> m (MutableTypedByteArray a (PrimState m))
TBA.newTypedByteArray (Int -> ST s (MutableTypedByteArray Int (PrimState (ST s))))
-> Int -> ST s (MutableTypedByteArray Int (PrimState (ST s)))
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex -> Int
codeUnitIndex CodeUnitIndex
patLen
let
init1 :: CodeUnitIndex -> CodeUnitIndex -> ST s ()
init1 CodeUnitIndex
lastPrefixIndex CodeUnitIndex
p
| CodeUnitIndex
p CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnitIndex
0 = do
let
prefixIndex :: CodeUnitIndex
prefixIndex
| Text -> CodeUnitIndex -> Bool
isPrefix Text
pattern (CodeUnitIndex
p CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1) = CodeUnitIndex
p CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1
| Bool
otherwise = CodeUnitIndex
lastPrefixIndex
MutableTypedByteArray Int (PrimState (ST s))
-> Int -> Int -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutableTypedByteArray a (PrimState m) -> Int -> a -> m ()
TBA.writeTypedByteArray MutableTypedByteArray Int s
MutableTypedByteArray Int (PrimState (ST s))
table (CodeUnitIndex -> Int
codeUnitIndex CodeUnitIndex
p) (CodeUnitIndex -> Int
codeUnitIndex (CodeUnitIndex -> Int) -> CodeUnitIndex -> Int
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
prefixIndex CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
p)
CodeUnitIndex -> CodeUnitIndex -> ST s ()
init1 CodeUnitIndex
prefixIndex (CodeUnitIndex
p CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1)
| Bool
otherwise = () -> ST s ()
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
init2 :: CodeUnitIndex -> ST s ()
init2 CodeUnitIndex
p
| CodeUnitIndex
p CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 = do
let
suffixLen :: CodeUnitIndex
suffixLen = Text -> CodeUnitIndex -> CodeUnitIndex
suffixLength Text
pattern CodeUnitIndex
p
Bool -> ST s () -> ST s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
pattern (CodeUnitIndex
p CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
suffixLen) CodeUnit -> CodeUnit -> Bool
forall a. Eq a => a -> a -> Bool
/= Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
pattern (CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
suffixLen)) (ST s () -> ST s ()) -> ST s () -> ST s ()
forall a b. (a -> b) -> a -> b
$
MutableTypedByteArray Int (PrimState (ST s))
-> Int -> Int -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutableTypedByteArray a (PrimState m) -> Int -> a -> m ()
TBA.writeTypedByteArray MutableTypedByteArray Int s
MutableTypedByteArray Int (PrimState (ST s))
table (CodeUnitIndex -> Int
codeUnitIndex (CodeUnitIndex -> Int) -> CodeUnitIndex -> Int
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
suffixLen) (CodeUnitIndex -> Int
codeUnitIndex (CodeUnitIndex -> Int) -> CodeUnitIndex -> Int
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
p CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
suffixLen)
CodeUnitIndex -> ST s ()
init2 (CodeUnitIndex
p CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1)
| Bool
otherwise = () -> ST s ()
forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
CodeUnitIndex -> CodeUnitIndex -> ST s ()
init1 (CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1) (CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1)
CodeUnitIndex -> ST s ()
init2 CodeUnitIndex
0
TypedByteArray Int -> SuffixTable
SuffixTable (TypedByteArray Int -> SuffixTable)
-> ST s (TypedByteArray Int) -> ST s SuffixTable
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> MutableTypedByteArray Int (PrimState (ST s))
-> ST s (TypedByteArray Int)
forall (m :: * -> *) a.
PrimMonad m =>
MutableTypedByteArray a (PrimState m) -> m (TypedByteArray a)
TBA.unsafeFreezeTypedByteArray MutableTypedByteArray Int s
MutableTypedByteArray Int (PrimState (ST s))
table
data BadCharTable = BadCharTable
{ BadCharTable -> TypedByteArray Int
badCharTableAscii :: {-# UNPACK #-} !(TypedByteArray Int)
, BadCharTable -> HashMap CodeUnit CodeUnitIndex
badCharTableNonAscii :: !(HashMap.HashMap CodeUnit CodeUnitIndex)
, BadCharTable -> CodeUnitIndex
badCharTablePatternLen :: CodeUnitIndex
}
deriving stock ((forall x. BadCharTable -> Rep BadCharTable x)
-> (forall x. Rep BadCharTable x -> BadCharTable)
-> Generic BadCharTable
forall x. Rep BadCharTable x -> BadCharTable
forall x. BadCharTable -> Rep BadCharTable x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep BadCharTable x -> BadCharTable
$cfrom :: forall x. BadCharTable -> Rep BadCharTable x
Generic, Int -> BadCharTable -> ShowS
[BadCharTable] -> ShowS
BadCharTable -> String
(Int -> BadCharTable -> ShowS)
-> (BadCharTable -> String)
-> ([BadCharTable] -> ShowS)
-> Show BadCharTable
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [BadCharTable] -> ShowS
$cshowList :: [BadCharTable] -> ShowS
show :: BadCharTable -> String
$cshow :: BadCharTable -> String
showsPrec :: Int -> BadCharTable -> ShowS
$cshowsPrec :: Int -> BadCharTable -> ShowS
Show)
deriving anyclass (BadCharTable -> ()
(BadCharTable -> ()) -> NFData BadCharTable
forall a. (a -> ()) -> NFData a
rnf :: BadCharTable -> ()
$crnf :: BadCharTable -> ()
NFData)
asciiCount :: Int
{-# INLINE asciiCount #-}
asciiCount :: Int
asciiCount = Int
128
badCharLookup :: BadCharTable -> CodeUnit -> CodeUnitIndex
{-# INLINE badCharLookup #-}
badCharLookup :: BadCharTable -> CodeUnit -> CodeUnitIndex
badCharLookup (BadCharTable TypedByteArray Int
asciiTable HashMap CodeUnit CodeUnitIndex
nonAsciis CodeUnitIndex
patLen) CodeUnit
char
| Int
intChar Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
asciiCount = Int -> CodeUnitIndex
CodeUnitIndex (Int -> CodeUnitIndex) -> Int -> CodeUnitIndex
forall a b. (a -> b) -> a -> b
$ TypedByteArray Int -> Int -> Int
forall a. Prim a => TypedByteArray a -> Int -> a
indexTable TypedByteArray Int
asciiTable Int
intChar
| Bool
otherwise = CodeUnitIndex
-> CodeUnit -> HashMap CodeUnit CodeUnitIndex -> CodeUnitIndex
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
HashMap.lookupDefault CodeUnitIndex
patLen CodeUnit
char HashMap CodeUnit CodeUnitIndex
nonAsciis
where
intChar :: Int
intChar = CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
char
isPrefix :: Text -> CodeUnitIndex -> Bool
isPrefix :: Text -> CodeUnitIndex -> Bool
isPrefix Text
pattern CodeUnitIndex
pos = CodeUnitIndex -> Bool
go CodeUnitIndex
0
where
suffixLen :: CodeUnitIndex
suffixLen = Text -> CodeUnitIndex
lengthUtf16 Text
pattern CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
pos
go :: CodeUnitIndex -> Bool
go CodeUnitIndex
i
| CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
suffixLen =
if Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
pattern CodeUnitIndex
i CodeUnit -> CodeUnit -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
pattern (CodeUnitIndex
pos CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
i)
then CodeUnitIndex -> Bool
go (CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1)
else Bool
False
| Bool
otherwise = Bool
True
suffixLength :: Text -> CodeUnitIndex -> CodeUnitIndex
suffixLength :: Text -> CodeUnitIndex -> CodeUnitIndex
suffixLength Text
pattern CodeUnitIndex
pos = CodeUnitIndex -> CodeUnitIndex
go CodeUnitIndex
0
where
patLen :: CodeUnitIndex
patLen = Text -> CodeUnitIndex
lengthUtf16 Text
pattern
go :: CodeUnitIndex -> CodeUnitIndex
go CodeUnitIndex
i
| Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
pattern (CodeUnitIndex
pos CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
i) CodeUnit -> CodeUnit -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
pattern (CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
i) Bool -> Bool -> Bool
&& CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
pos = CodeUnitIndex -> CodeUnitIndex
go (CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1)
| Bool
otherwise = CodeUnitIndex
i
buildBadCharTable :: Text -> BadCharTable
buildBadCharTable :: Text -> BadCharTable
buildBadCharTable Text
pattern = (forall s. ST s BadCharTable) -> BadCharTable
forall a. (forall s. ST s a) -> a
runST ((forall s. ST s BadCharTable) -> BadCharTable)
-> (forall s. ST s BadCharTable) -> BadCharTable
forall a b. (a -> b) -> a -> b
$ do
let patLen :: CodeUnitIndex
patLen = Text -> CodeUnitIndex
lengthUtf16 Text
pattern
MutableTypedByteArray Int s
asciiTable <- Int -> Int -> ST s (MutableTypedByteArray Int (PrimState (ST s)))
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
Int -> a -> m (MutableTypedByteArray a (PrimState m))
TBA.replicate Int
asciiCount (Int -> ST s (MutableTypedByteArray Int (PrimState (ST s))))
-> Int -> ST s (MutableTypedByteArray Int (PrimState (ST s)))
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex -> Int
codeUnitIndex CodeUnitIndex
patLen
let
fillTable :: CodeUnitIndex
-> HashMap CodeUnit CodeUnitIndex
-> ST s (HashMap CodeUnit CodeUnitIndex)
fillTable !CodeUnitIndex
i !HashMap CodeUnit CodeUnitIndex
nonAsciis
| CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 = do
let patChar :: CodeUnit
patChar = Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
pattern CodeUnitIndex
i
if CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
patChar Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
asciiCount
then do
MutableTypedByteArray Int (PrimState (ST s))
-> Int -> Int -> ST s ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutableTypedByteArray a (PrimState m) -> Int -> a -> m ()
TBA.writeTypedByteArray MutableTypedByteArray Int s
MutableTypedByteArray Int (PrimState (ST s))
asciiTable (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
patChar) (CodeUnitIndex -> Int
codeUnitIndex (CodeUnitIndex -> Int) -> CodeUnitIndex -> Int
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
i)
CodeUnitIndex
-> HashMap CodeUnit CodeUnitIndex
-> ST s (HashMap CodeUnit CodeUnitIndex)
fillTable (CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1) HashMap CodeUnit CodeUnitIndex
nonAsciis
else
CodeUnitIndex
-> HashMap CodeUnit CodeUnitIndex
-> ST s (HashMap CodeUnit CodeUnitIndex)
fillTable (CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1) (CodeUnit
-> CodeUnitIndex
-> HashMap CodeUnit CodeUnitIndex
-> HashMap CodeUnit CodeUnitIndex
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
HashMap.insert CodeUnit
patChar (CodeUnitIndex
patLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
1 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
i) HashMap CodeUnit CodeUnitIndex
nonAsciis)
| Bool
otherwise = HashMap CodeUnit CodeUnitIndex
-> ST s (HashMap CodeUnit CodeUnitIndex)
forall (f :: * -> *) a. Applicative f => a -> f a
pure HashMap CodeUnit CodeUnitIndex
nonAsciis
HashMap CodeUnit CodeUnitIndex
nonAsciis <- CodeUnitIndex
-> HashMap CodeUnit CodeUnitIndex
-> ST s (HashMap CodeUnit CodeUnitIndex)
fillTable CodeUnitIndex
0 HashMap CodeUnit CodeUnitIndex
forall k v. HashMap k v
HashMap.empty
TypedByteArray Int
asciiTableFrozen <- MutableTypedByteArray Int (PrimState (ST s))
-> ST s (TypedByteArray Int)
forall (m :: * -> *) a.
PrimMonad m =>
MutableTypedByteArray a (PrimState m) -> m (TypedByteArray a)
TBA.unsafeFreezeTypedByteArray MutableTypedByteArray Int s
MutableTypedByteArray Int (PrimState (ST s))
asciiTable
BadCharTable -> ST s BadCharTable
forall (f :: * -> *) a. Applicative f => a -> f a
pure BadCharTable :: TypedByteArray Int
-> HashMap CodeUnit CodeUnitIndex -> CodeUnitIndex -> BadCharTable
BadCharTable
{ badCharTableAscii :: TypedByteArray Int
badCharTableAscii = TypedByteArray Int
asciiTableFrozen
, badCharTableNonAscii :: HashMap CodeUnit CodeUnitIndex
badCharTableNonAscii = HashMap CodeUnit CodeUnitIndex
nonAsciis
, badCharTablePatternLen :: CodeUnitIndex
badCharTablePatternLen = CodeUnitIndex
patLen
}
indexTable :: Prim a => TypedByteArray a -> Int -> a
{-# INLINE indexTable #-}
indexTable :: TypedByteArray a -> Int -> a
indexTable = TypedByteArray a -> Int -> a
forall a. Prim a => TypedByteArray a -> Int -> a
TBA.unsafeIndex
indexCodePoint :: Text -> CodeUnitIndex -> CodeUnit
{-# INLINE indexCodePoint #-}
indexCodePoint :: Text -> CodeUnitIndex -> CodeUnit
indexCodePoint Text
text CodeUnitIndex
index
| CodeUnitIndex
index CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnitIndex
0 Bool -> Bool -> Bool
|| CodeUnitIndex
index CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
>= Text -> CodeUnitIndex
lengthUtf16 Text
text = String -> CodeUnit
forall a. HasCallStack => String -> a
error (String -> CodeUnit) -> String -> CodeUnit
forall a b. (a -> b) -> a -> b
$ String
"Index out of bounds " String -> ShowS
forall a. [a] -> [a] -> [a]
++ CodeUnitIndex -> String
forall a. Show a => a -> String
show CodeUnitIndex
index
| Bool
otherwise = Text -> CodeUnitIndex -> CodeUnit
unsafeIndexUtf16 Text
text CodeUnitIndex
index