-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2019 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}

-- | Implements sequential string replacements based on the Aho-Corasick algorithm.
module Data.Text.AhoCorasick.Replacer
    ( -- * State machine
      Needle
    , Payload (..)
    , Replacement
    , Replacer (..)
    , replacerCaseSensitivity
    , build
    , compose
    , mapReplacement
    , run
    , runWithLimit
    , setCaseSensitivity
    ) where

import Control.DeepSeq (NFData)
import Data.Hashable (Hashable)
import Data.List (sort)
import Data.Maybe (fromJust)
import GHC.Generics (Generic)

#if defined(HAS_AESON)
import qualified Data.Aeson as AE
#endif

import Data.Text.AhoCorasick.Searcher (Searcher)
import Data.Text.CaseSensitivity (CaseSensitivity (..))
import Data.Text.Utf8 (CodeUnitIndex (..), Text)

import qualified Data.Text as Text
import qualified Data.Text.AhoCorasick.Automaton as Aho
import qualified Data.Text.AhoCorasick.Searcher as Searcher
import qualified Data.Text.Utf8 as Utf8

-- | Descriptive type alias for strings to search for.
type Needle = Text

-- | Descriptive type alias for replacements.
type Replacement = Text

-- | Priority of a needle. Higher integers indicate higher priorities.
-- Replacement order is such that all matches of priority p are replaced before
-- replacing any matches of priority q where p > q.
type Priority = Int

data Payload = Payload
  { Payload -> Priority
needlePriority    :: {-# UNPACK #-} !Priority
  , Payload -> CodeUnitIndex
needleLengthBytes       :: {-# UNPACK #-} !CodeUnitIndex
    -- ^ Number of bytes is used for case sensitive matching
  , Payload -> Priority
needleLengthCodePoints  :: {-# UNPACK #-} !Int
    -- ^ For case insensitive matches, the byte length does not necessarily match the needle byte
    -- length. Due to our simple case folding the number of codepoints _does_ match, so we put that
    -- in the payload. It's less efficient because we have to scan backwards through the text to
    -- obtain the length of a match.

  , Payload -> Replacement
needleReplacement :: !Replacement
  }
#if defined(HAS_AESON)
  deriving (Payload -> Payload -> Bool
(Payload -> Payload -> Bool)
-> (Payload -> Payload -> Bool) -> Eq Payload
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Payload -> Payload -> Bool
== :: Payload -> Payload -> Bool
$c/= :: Payload -> Payload -> Bool
/= :: Payload -> Payload -> Bool
Eq, (forall x. Payload -> Rep Payload x)
-> (forall x. Rep Payload x -> Payload) -> Generic Payload
forall x. Rep Payload x -> Payload
forall x. Payload -> Rep Payload x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. Payload -> Rep Payload x
from :: forall x. Payload -> Rep Payload x
$cto :: forall x. Rep Payload x -> Payload
to :: forall x. Rep Payload x -> Payload
Generic, Eq Payload
Eq Payload =>
(Priority -> Payload -> Priority)
-> (Payload -> Priority) -> Hashable Payload
Priority -> Payload -> Priority
Payload -> Priority
forall a.
Eq a =>
(Priority -> a -> Priority) -> (a -> Priority) -> Hashable a
$chashWithSalt :: Priority -> Payload -> Priority
hashWithSalt :: Priority -> Payload -> Priority
$chash :: Payload -> Priority
hash :: Payload -> Priority
Hashable, Payload -> ()
(Payload -> ()) -> NFData Payload
forall a. (a -> ()) -> NFData a
$crnf :: Payload -> ()
rnf :: Payload -> ()
NFData, Priority -> Payload -> ShowS
[Payload] -> ShowS
Payload -> String
(Priority -> Payload -> ShowS)
-> (Payload -> String) -> ([Payload] -> ShowS) -> Show Payload
forall a.
(Priority -> a -> ShowS)
-> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Priority -> Payload -> ShowS
showsPrec :: Priority -> Payload -> ShowS
$cshow :: Payload -> String
show :: Payload -> String
$cshowList :: [Payload] -> ShowS
showList :: [Payload] -> ShowS
Show, Maybe Payload
Value -> Parser [Payload]
Value -> Parser Payload
(Value -> Parser Payload)
-> (Value -> Parser [Payload]) -> Maybe Payload -> FromJSON Payload
forall a.
(Value -> Parser a)
-> (Value -> Parser [a]) -> Maybe a -> FromJSON a
$cparseJSON :: Value -> Parser Payload
parseJSON :: Value -> Parser Payload
$cparseJSONList :: Value -> Parser [Payload]
parseJSONList :: Value -> Parser [Payload]
$comittedField :: Maybe Payload
omittedField :: Maybe Payload
AE.FromJSON, [Payload] -> Value
[Payload] -> Encoding
Payload -> Bool
Payload -> Value
Payload -> Encoding
(Payload -> Value)
-> (Payload -> Encoding)
-> ([Payload] -> Value)
-> ([Payload] -> Encoding)
-> (Payload -> Bool)
-> ToJSON Payload
forall a.
(a -> Value)
-> (a -> Encoding)
-> ([a] -> Value)
-> ([a] -> Encoding)
-> (a -> Bool)
-> ToJSON a
$ctoJSON :: Payload -> Value
toJSON :: Payload -> Value
$ctoEncoding :: Payload -> Encoding
toEncoding :: Payload -> Encoding
$ctoJSONList :: [Payload] -> Value
toJSONList :: [Payload] -> Value
$ctoEncodingList :: [Payload] -> Encoding
toEncodingList :: [Payload] -> Encoding
$comitField :: Payload -> Bool
omitField :: Payload -> Bool
AE.ToJSON)
#else
  deriving (Eq, Generic, Hashable, NFData, Show)
#endif

-- | A state machine used for efficient replacements with many different needles.
data Replacer = Replacer
  { Replacer -> Searcher Payload
replacerSearcher :: Searcher Payload
  }
  deriving stock (Priority -> Replacer -> ShowS
[Replacer] -> ShowS
Replacer -> String
(Priority -> Replacer -> ShowS)
-> (Replacer -> String) -> ([Replacer] -> ShowS) -> Show Replacer
forall a.
(Priority -> a -> ShowS)
-> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Priority -> Replacer -> ShowS
showsPrec :: Priority -> Replacer -> ShowS
$cshow :: Replacer -> String
show :: Replacer -> String
$cshowList :: [Replacer] -> ShowS
showList :: [Replacer] -> ShowS
Show, Replacer -> Replacer -> Bool
(Replacer -> Replacer -> Bool)
-> (Replacer -> Replacer -> Bool) -> Eq Replacer
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Replacer -> Replacer -> Bool
== :: Replacer -> Replacer -> Bool
$c/= :: Replacer -> Replacer -> Bool
/= :: Replacer -> Replacer -> Bool
Eq, (forall x. Replacer -> Rep Replacer x)
-> (forall x. Rep Replacer x -> Replacer) -> Generic Replacer
forall x. Rep Replacer x -> Replacer
forall x. Replacer -> Rep Replacer x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cfrom :: forall x. Replacer -> Rep Replacer x
from :: forall x. Replacer -> Rep Replacer x
$cto :: forall x. Rep Replacer x -> Replacer
to :: forall x. Rep Replacer x -> Replacer
Generic)
#if defined(HAS_AESON)
  deriving (Eq Replacer
Eq Replacer =>
(Priority -> Replacer -> Priority)
-> (Replacer -> Priority) -> Hashable Replacer
Priority -> Replacer -> Priority
Replacer -> Priority
forall a.
Eq a =>
(Priority -> a -> Priority) -> (a -> Priority) -> Hashable a
$chashWithSalt :: Priority -> Replacer -> Priority
hashWithSalt :: Priority -> Replacer -> Priority
$chash :: Replacer -> Priority
hash :: Replacer -> Priority
Hashable, Replacer -> ()
(Replacer -> ()) -> NFData Replacer
forall a. (a -> ()) -> NFData a
$crnf :: Replacer -> ()
rnf :: Replacer -> ()
NFData, Maybe Replacer
Value -> Parser [Replacer]
Value -> Parser Replacer
(Value -> Parser Replacer)
-> (Value -> Parser [Replacer])
-> Maybe Replacer
-> FromJSON Replacer
forall a.
(Value -> Parser a)
-> (Value -> Parser [a]) -> Maybe a -> FromJSON a
$cparseJSON :: Value -> Parser Replacer
parseJSON :: Value -> Parser Replacer
$cparseJSONList :: Value -> Parser [Replacer]
parseJSONList :: Value -> Parser [Replacer]
$comittedField :: Maybe Replacer
omittedField :: Maybe Replacer
AE.FromJSON, [Replacer] -> Value
[Replacer] -> Encoding
Replacer -> Bool
Replacer -> Value
Replacer -> Encoding
(Replacer -> Value)
-> (Replacer -> Encoding)
-> ([Replacer] -> Value)
-> ([Replacer] -> Encoding)
-> (Replacer -> Bool)
-> ToJSON Replacer
forall a.
(a -> Value)
-> (a -> Encoding)
-> ([a] -> Value)
-> ([a] -> Encoding)
-> (a -> Bool)
-> ToJSON a
$ctoJSON :: Replacer -> Value
toJSON :: Replacer -> Value
$ctoEncoding :: Replacer -> Encoding
toEncoding :: Replacer -> Encoding
$ctoJSONList :: [Replacer] -> Value
toJSONList :: [Replacer] -> Value
$ctoEncodingList :: [Replacer] -> Encoding
toEncodingList :: [Replacer] -> Encoding
$comitField :: Replacer -> Bool
omitField :: Replacer -> Bool
AE.ToJSON)
#else
  deriving (Hashable, NFData)
#endif

-- | Build an Aho-Corasick automaton that can be used for performing fast
-- sequential replaces.
--
-- Case-insensitive matching performs per-letter language-agnostic lower-casing.
-- Therefore, it will work in most cases, but not in languages where lower-casing
-- depends on the context of the character in question.
--
-- We need to revisit this algorithm when we want to implement full Unicode
-- support.
build :: CaseSensitivity -> [(Needle, Replacement)] -> Replacer
build :: CaseSensitivity -> [(Replacement, Replacement)] -> Replacer
build CaseSensitivity
caseSensitivity [(Replacement, Replacement)]
replaces = Searcher Payload -> Replacer
Replacer Searcher Payload
searcher
  where
    searcher :: Searcher Payload
searcher = CaseSensitivity -> [(Replacement, Payload)] -> Searcher Payload
forall v.
Hashable v =>
CaseSensitivity -> [(Replacement, v)] -> Searcher v
Searcher.buildWithValues CaseSensitivity
caseSensitivity ([(Replacement, Payload)] -> Searcher Payload)
-> [(Replacement, Payload)] -> Searcher Payload
forall a b. (a -> b) -> a -> b
$ (Priority -> (Replacement, Replacement) -> (Replacement, Payload))
-> [Priority]
-> [(Replacement, Replacement)]
-> [(Replacement, Payload)]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Priority -> (Replacement, Replacement) -> (Replacement, Payload)
mapNeedle [Priority
0..] [(Replacement, Replacement)]
replaces
    mapNeedle :: Priority -> (Replacement, Replacement) -> (Replacement, Payload)
mapNeedle Priority
i (Replacement
needle, Replacement
replacement) =
      -- Note that we negate i: earlier needles have a higher priority. We
      -- could avoid it and define larger integers to be lower priority, but
      -- that made the terminology in this module very confusing.
      let needle' :: Replacement
needle' = case Searcher Payload -> CaseSensitivity
forall v. Searcher v -> CaseSensitivity
Searcher.caseSensitivity Searcher Payload
searcher of
                      CaseSensitivity
CaseSensitive -> Replacement
needle
                      CaseSensitivity
IgnoreCase -> Replacement -> Replacement
Utf8.lowerUtf8 Replacement
needle
          -- Payload includes byte and code point lengths, so can still be used if we change case
          -- sensitivity later.
          payload :: Payload
payload = Payload
            { needlePriority :: Priority
needlePriority = (-Priority
i)
            , needleLengthBytes :: CodeUnitIndex
needleLengthBytes = Replacement -> CodeUnitIndex
Utf8.lengthUtf8 Replacement
needle
            , needleLengthCodePoints :: Priority
needleLengthCodePoints = Replacement -> Priority
Text.length Replacement
needle
            , needleReplacement :: Replacement
needleReplacement = Replacement
replacement
            }
      in (Replacement
needle', Payload
payload)

-- | Return the composition `replacer2` after `replacer1`, if they have the same
-- case sensitivity. If the case sensitivity differs, Nothing is returned.
compose :: Replacer -> Replacer -> Maybe Replacer
compose :: Replacer -> Replacer -> Maybe Replacer
compose (Replacer Searcher Payload
searcher1) (Replacer Searcher Payload
searcher2)
  | Searcher Payload -> CaseSensitivity
forall v. Searcher v -> CaseSensitivity
Searcher.caseSensitivity Searcher Payload
searcher1 CaseSensitivity -> CaseSensitivity -> Bool
forall a. Eq a => a -> a -> Bool
/= Searcher Payload -> CaseSensitivity
forall v. Searcher v -> CaseSensitivity
Searcher.caseSensitivity Searcher Payload
searcher2 = Maybe Replacer
forall a. Maybe a
Nothing
  | Bool
otherwise =
      let
        -- Replace the priorities of the second machine, so they all come after
        -- the first.
        renumber :: Priority -> (a, Payload) -> (a, Payload)
renumber Priority
i (a
needle, Payload Priority
_ CodeUnitIndex
lenb Priority
lenc Replacement
replacement) = (a
needle, Priority -> CodeUnitIndex -> Priority -> Replacement -> Payload
Payload (-Priority
i) CodeUnitIndex
lenb Priority
lenc Replacement
replacement)
        needles1 :: [(Replacement, Payload)]
needles1 = Searcher Payload -> [(Replacement, Payload)]
forall v. Searcher v -> [(Replacement, v)]
Searcher.needles Searcher Payload
searcher1
        needles2 :: [(Replacement, Payload)]
needles2 = Searcher Payload -> [(Replacement, Payload)]
forall v. Searcher v -> [(Replacement, v)]
Searcher.needles Searcher Payload
searcher2
        cs :: CaseSensitivity
cs = Searcher Payload -> CaseSensitivity
forall v. Searcher v -> CaseSensitivity
Searcher.caseSensitivity Searcher Payload
searcher1
        searcher :: Searcher Payload
searcher = CaseSensitivity -> [(Replacement, Payload)] -> Searcher Payload
forall v.
Hashable v =>
CaseSensitivity -> [(Replacement, v)] -> Searcher v
Searcher.buildWithValues CaseSensitivity
cs ([(Replacement, Payload)] -> Searcher Payload)
-> [(Replacement, Payload)] -> Searcher Payload
forall a b. (a -> b) -> a -> b
$ (Priority -> (Replacement, Payload) -> (Replacement, Payload))
-> [Priority]
-> [(Replacement, Payload)]
-> [(Replacement, Payload)]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Priority -> (Replacement, Payload) -> (Replacement, Payload)
forall {a}. Priority -> (a, Payload) -> (a, Payload)
renumber [Priority
0..] ([(Replacement, Payload)]
needles1 [(Replacement, Payload)]
-> [(Replacement, Payload)] -> [(Replacement, Payload)]
forall a. [a] -> [a] -> [a]
++ [(Replacement, Payload)]
needles2)
      in
        Replacer -> Maybe Replacer
forall a. a -> Maybe a
Just (Replacer -> Maybe Replacer) -> Replacer -> Maybe Replacer
forall a b. (a -> b) -> a -> b
$ Searcher Payload -> Replacer
Replacer Searcher Payload
searcher

-- | Modify the replacement of a replacer. It doesn't modify the needles.
mapReplacement :: (Replacement -> Replacement) -> Replacer -> Replacer
mapReplacement :: (Replacement -> Replacement) -> Replacer -> Replacer
mapReplacement Replacement -> Replacement
f Replacer
replacer = Replacer
replacer{
  replacerSearcher = Searcher.mapSearcher
    (\Payload
p -> Payload
p {needleReplacement = f (needleReplacement p)})
    (replacerSearcher replacer)
}


replacerCaseSensitivity :: Replacer -> CaseSensitivity
replacerCaseSensitivity :: Replacer -> CaseSensitivity
replacerCaseSensitivity (Replacer Searcher Payload
searcher) = Searcher Payload -> CaseSensitivity
forall v. Searcher v -> CaseSensitivity
Searcher.caseSensitivity Searcher Payload
searcher


-- | Updates the case sensitivity of the replacer. Does not change the
-- capitilization of the needles. The caller should be certain that if IgnoreCase
-- is passed, the needles are already lower case.
setCaseSensitivity :: CaseSensitivity -> Replacer -> Replacer
setCaseSensitivity :: CaseSensitivity -> Replacer -> Replacer
setCaseSensitivity CaseSensitivity
case_ (Replacer Searcher Payload
searcher) =
  Searcher Payload -> Replacer
Replacer (CaseSensitivity -> Searcher Payload -> Searcher Payload
forall v. CaseSensitivity -> Searcher v -> Searcher v
Searcher.setCaseSensitivity CaseSensitivity
case_ Searcher Payload
searcher)


-- A match collected while running replacements. It is isomorphic to the Match
-- reported by the automaton, but the data is arranged in a more useful way:
-- as the start index and length of the match, and the replacement.
data Match = Match !CodeUnitIndex !CodeUnitIndex !Text deriving (Match -> Match -> Bool
(Match -> Match -> Bool) -> (Match -> Match -> Bool) -> Eq Match
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Match -> Match -> Bool
== :: Match -> Match -> Bool
$c/= :: Match -> Match -> Bool
/= :: Match -> Match -> Bool
Eq, Eq Match
Eq Match =>
(Match -> Match -> Ordering)
-> (Match -> Match -> Bool)
-> (Match -> Match -> Bool)
-> (Match -> Match -> Bool)
-> (Match -> Match -> Bool)
-> (Match -> Match -> Match)
-> (Match -> Match -> Match)
-> Ord Match
Match -> Match -> Bool
Match -> Match -> Ordering
Match -> Match -> Match
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Match -> Match -> Ordering
compare :: Match -> Match -> Ordering
$c< :: Match -> Match -> Bool
< :: Match -> Match -> Bool
$c<= :: Match -> Match -> Bool
<= :: Match -> Match -> Bool
$c> :: Match -> Match -> Bool
> :: Match -> Match -> Bool
$c>= :: Match -> Match -> Bool
>= :: Match -> Match -> Bool
$cmax :: Match -> Match -> Match
max :: Match -> Match -> Match
$cmin :: Match -> Match -> Match
min :: Match -> Match -> Match
Ord, Priority -> Match -> ShowS
[Match] -> ShowS
Match -> String
(Priority -> Match -> ShowS)
-> (Match -> String) -> ([Match] -> ShowS) -> Show Match
forall a.
(Priority -> a -> ShowS)
-> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Priority -> Match -> ShowS
showsPrec :: Priority -> Match -> ShowS
$cshow :: Match -> String
show :: Match -> String
$cshowList :: [Match] -> ShowS
showList :: [Match] -> ShowS
Show)

-- | Apply replacements of all matches. Assumes that the matches are ordered by
-- match position, and that no matches overlap.
replace :: [Match] -> Text -> Text
replace :: [Match] -> Replacement -> Replacement
replace [Match]
matches Replacement
haystack = [Replacement] -> Replacement
Utf8.concat ([Replacement] -> Replacement) -> [Replacement] -> Replacement
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex -> [Match] -> Replacement -> [Replacement]
go CodeUnitIndex
0 [Match]
matches Replacement
haystack
  where
    -- At every match, cut the string into three pieces, removing the match.
    -- Because a Text is a buffer pointer and (offset, length), cutting does not
    -- involve string copies. Only at the very end we piece together the strings
    -- again, so Text can allocate a buffer of the right length and memcpy the
    -- parts into the new target string.
    -- If `k` is a code unit index into the original text, then `k - offset`
    -- is an index into `remainder`. In other words, `offset` is the index into
    -- the original text where `remainder` starts.
    go :: CodeUnitIndex -> [Match] -> Text -> [Text]
    go :: CodeUnitIndex -> [Match] -> Replacement -> [Replacement]
go !CodeUnitIndex
_offset [] Replacement
remainder = [Replacement
remainder]
    go !CodeUnitIndex
offset ((Match CodeUnitIndex
pos CodeUnitIndex
len Replacement
replacement) : [Match]
ms) Replacement
remainder =
      let
        (Replacement
prefix, Replacement
suffix) = CodeUnitIndex
-> CodeUnitIndex -> Replacement -> (Replacement, Replacement)
Utf8.unsafeCutUtf8 (CodeUnitIndex
pos CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
offset) CodeUnitIndex
len Replacement
remainder
      in
        Replacement
prefix Replacement -> [Replacement] -> [Replacement]
forall a. a -> [a] -> [a]
: Replacement
replacement Replacement -> [Replacement] -> [Replacement]
forall a. a -> [a] -> [a]
: CodeUnitIndex -> [Match] -> Replacement -> [Replacement]
go (CodeUnitIndex
pos CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
len) [Match]
ms Replacement
suffix

-- | Compute the length of the string resulting from applying the replacements.
replacementLength :: [Match] -> Text -> CodeUnitIndex
replacementLength :: [Match] -> Replacement -> CodeUnitIndex
replacementLength [Match]
matches Replacement
initial  = [Match] -> CodeUnitIndex -> CodeUnitIndex
go [Match]
matches (Replacement -> CodeUnitIndex
Utf8.lengthUtf8 Replacement
initial)
  where
    go :: [Match] -> CodeUnitIndex -> CodeUnitIndex
go [] !CodeUnitIndex
acc = CodeUnitIndex
acc
    go (Match CodeUnitIndex
_ CodeUnitIndex
matchLen Replacement
repl : [Match]
rest) !CodeUnitIndex
acc = [Match] -> CodeUnitIndex -> CodeUnitIndex
go [Match]
rest (CodeUnitIndex
acc CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
matchLen CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ Replacement -> CodeUnitIndex
Utf8.lengthUtf8 Replacement
repl)

-- | Given a list of matches sorted on start position, remove matches that start
-- within an earlier match.
removeOverlap :: [Match] -> [Match]
removeOverlap :: [Match] -> [Match]
removeOverlap [Match]
matches = case [Match]
matches of
  [] -> []
  [Match
m] -> [Match
m]
  (m0 :: Match
m0@(Match CodeUnitIndex
pos0 CodeUnitIndex
len0 Replacement
_) : m1 :: Match
m1@(Match CodeUnitIndex
pos1 CodeUnitIndex
_ Replacement
_) : [Match]
ms) ->
    if CodeUnitIndex
pos1 CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnitIndex
pos0 CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
len0
      then Match
m0 Match -> [Match] -> [Match]
forall a. a -> [a] -> [a]
: [Match] -> [Match]
removeOverlap (Match
m1Match -> [Match] -> [Match]
forall a. a -> [a] -> [a]
:[Match]
ms)
      else [Match] -> [Match]
removeOverlap (Match
m0Match -> [Match] -> [Match]
forall a. a -> [a] -> [a]
:[Match]
ms)

run :: Replacer -> Text -> Text
run :: Replacer -> Replacement -> Replacement
run Replacer
replacer = Maybe Replacement -> Replacement
forall a. HasCallStack => Maybe a -> a
fromJust (Maybe Replacement -> Replacement)
-> (Replacement -> Maybe Replacement) -> Replacement -> Replacement
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Replacer -> CodeUnitIndex -> Replacement -> Maybe Replacement
runWithLimit Replacer
replacer CodeUnitIndex
forall a. Bounded a => a
maxBound

{-# NOINLINE runWithLimit #-}
runWithLimit :: Replacer -> CodeUnitIndex -> Text -> Maybe Text
runWithLimit :: Replacer -> CodeUnitIndex -> Replacement -> Maybe Replacement
runWithLimit (Replacer Searcher Payload
searcher) CodeUnitIndex
maxLength = Priority -> Replacement -> Maybe Replacement
go Priority
initialThreshold
  where
    !automaton :: AcMachine Payload
automaton = Searcher Payload -> AcMachine Payload
forall v. Searcher v -> AcMachine v
Searcher.automaton Searcher Payload
searcher

    -- Priorities are 0 or lower, so an initial threshold of 1 keeps all
    -- matches.
    !initialThreshold :: Priority
initialThreshold = Priority
1

    -- Needle priorities go from 0 for the highest priority to (-numNeedles + 1)
    -- for the lowest priority. That means that if we find a match with
    -- minPriority, we don't need to do another pass afterwards, because there
    -- are no remaining needles.
    !minPriority :: Priority
minPriority = Priority
1 Priority -> Priority -> Priority
forall a. Num a => a -> a -> a
- Searcher Payload -> Priority
forall v. Searcher v -> Priority
Searcher.numNeedles Searcher Payload
searcher

    go :: Priority -> Text -> Maybe Text
    go :: Priority -> Replacement -> Maybe Replacement
go !Priority
threshold Replacement
haystack =
      let
        seed :: (Priority, [a])
seed = (Priority
forall a. Bounded a => a
minBound :: Priority, [])
        matchesWithPriority :: (Priority, [Match])
matchesWithPriority = case Searcher Payload -> CaseSensitivity
forall v. Searcher v -> CaseSensitivity
Searcher.caseSensitivity Searcher Payload
searcher of
          CaseSensitivity
CaseSensitive -> (Priority, [Match])
-> ((Priority, [Match])
    -> Match Payload -> Next (Priority, [Match]))
-> AcMachine Payload
-> Replacement
-> (Priority, [Match])
forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Replacement -> a
Aho.runText (Priority, [Match])
forall {a}. (Priority, [a])
seed (Priority
-> Replacement
-> (Priority, [Match])
-> Match Payload
-> Next (Priority, [Match])
prependMatch Priority
threshold Replacement
haystack) AcMachine Payload
automaton Replacement
haystack
          CaseSensitivity
IgnoreCase -> (Priority, [Match])
-> ((Priority, [Match])
    -> Match Payload -> Next (Priority, [Match]))
-> AcMachine Payload
-> Replacement
-> (Priority, [Match])
forall a v.
a -> (a -> Match v -> Next a) -> AcMachine v -> Replacement -> a
Aho.runLower (Priority, [Match])
forall {a}. (Priority, [a])
seed (Priority
-> Replacement
-> (Priority, [Match])
-> Match Payload
-> Next (Priority, [Match])
prependMatch Priority
threshold Replacement
haystack) AcMachine Payload
automaton Replacement
haystack
      in
        case (Priority, [Match])
matchesWithPriority of
          -- No match at the given threshold, there is nothing left to do.
          -- Return the input string unmodified.
          (Priority
_, []) -> Replacement -> Maybe Replacement
forall a. a -> Maybe a
Just Replacement
haystack
          -- We found matches at priority p. Remove overlapping matches, then
          -- apply all replacements. Next, we need to go again, this time
          -- considering only needles with a lower priority than p. As an
          -- optimization (which matters mainly for the single needle case),
          -- if we find a match at the lowest priority, we don't need another
          -- pass. Note that if in `rawMatches` we find only matches of priority
          -- p > minPriority, then we do still need another pass, because the
          -- replacements could create new matches.
          (Priority
p, [Match]
matches)
            | [Match] -> Replacement -> CodeUnitIndex
replacementLength [Match]
matches Replacement
haystack CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
> CodeUnitIndex
maxLength -> Maybe Replacement
forall a. Maybe a
Nothing
            | Priority
p Priority -> Priority -> Bool
forall a. Eq a => a -> a -> Bool
== Priority
minPriority -> Replacement -> Maybe Replacement
forall a. a -> Maybe a
Just (Replacement -> Maybe Replacement)
-> Replacement -> Maybe Replacement
forall a b. (a -> b) -> a -> b
$ [Match] -> Replacement -> Replacement
replace ([Match] -> [Match]
removeOverlap ([Match] -> [Match]) -> [Match] -> [Match]
forall a b. (a -> b) -> a -> b
$ [Match] -> [Match]
forall a. Ord a => [a] -> [a]
sort [Match]
matches) Replacement
haystack
            | Bool
otherwise -> Priority -> Replacement -> Maybe Replacement
go Priority
p (Replacement -> Maybe Replacement)
-> Replacement -> Maybe Replacement
forall a b. (a -> b) -> a -> b
$ [Match] -> Replacement -> Replacement
replace ([Match] -> [Match]
removeOverlap ([Match] -> [Match]) -> [Match] -> [Match]
forall a b. (a -> b) -> a -> b
$ [Match] -> [Match]
forall a. Ord a => [a] -> [a]
sort [Match]
matches) Replacement
haystack

    -- When we iterate through all matches, keep track only of the matches with
    -- the highest priority: those are the ones that we will replace first. If we
    -- find multiple matches with that priority, remember all of them. If we find a
    -- match with lower priority, ignore it, because we already have a more
    -- important match. Also, if the priority is `threshold` or higher, ignore the
    -- match, so we can exclude matches if we already did a round of replacements
    -- for that priority. This way we don't have to build a new automaton after
    -- every round of replacements.
    prependMatch
      :: Priority -> Text -> (Priority, [Match]) -> Aho.Match Payload -> Aho.Next (Priority, [Match])
    {-# INLINE prependMatch #-}
    prependMatch :: Priority
-> Replacement
-> (Priority, [Match])
-> Match Payload
-> Next (Priority, [Match])
prependMatch !Priority
threshold Replacement
haystack (!Priority
pBest, ![Match]
matches) (Aho.Match CodeUnitIndex
pos (Payload Priority
pMatch CodeUnitIndex
lenb Priority
lenc Replacement
replacement))
      | Priority
pMatch Priority -> Priority -> Bool
forall a. Ord a => a -> a -> Bool
< Priority
threshold Bool -> Bool -> Bool
&& Priority
pMatch Priority -> Priority -> Bool
forall a. Ord a => a -> a -> Bool
>  Priority
pBest =
          (Priority, [Match]) -> Next (Priority, [Match])
forall a. a -> Next a
Aho.Step (Priority
pMatch, [Replacement
-> CodeUnitIndex
-> CodeUnitIndex
-> Priority
-> Replacement
-> Match
makeMatch Replacement
haystack CodeUnitIndex
pos CodeUnitIndex
lenb Priority
lenc Replacement
replacement])
      | Priority
pMatch Priority -> Priority -> Bool
forall a. Ord a => a -> a -> Bool
< Priority
threshold Bool -> Bool -> Bool
&& Priority
pMatch Priority -> Priority -> Bool
forall a. Eq a => a -> a -> Bool
== Priority
pBest =
          (Priority, [Match]) -> Next (Priority, [Match])
forall a. a -> Next a
Aho.Step (Priority
pMatch, Replacement
-> CodeUnitIndex
-> CodeUnitIndex
-> Priority
-> Replacement
-> Match
makeMatch Replacement
haystack CodeUnitIndex
pos CodeUnitIndex
lenb Priority
lenc Replacement
replacement Match -> [Match] -> [Match]
forall a. a -> [a] -> [a]
: [Match]
matches)
      | Bool
otherwise = (Priority, [Match]) -> Next (Priority, [Match])
forall a. a -> Next a
Aho.Step (Priority
pBest, [Match]
matches)

    -- Pos is the code unit index past the last code unit of the match, we have
    -- to find the first code unit.
    makeMatch :: Text -> CodeUnitIndex -> CodeUnitIndex -> Int -> Replacement -> Match
    {-# INLINE makeMatch #-}
    makeMatch :: Replacement
-> CodeUnitIndex
-> CodeUnitIndex
-> Priority
-> Replacement
-> Match
makeMatch = case Searcher Payload -> CaseSensitivity
forall v. Searcher v -> CaseSensitivity
Searcher.caseSensitivity Searcher Payload
searcher of
      -- Case sensitive: length is interpreted as number of bytes
      CaseSensitivity
CaseSensitive -> \Replacement
_ CodeUnitIndex
pos CodeUnitIndex
lenb Priority
_ Replacement
replacement ->
        CodeUnitIndex -> CodeUnitIndex -> Replacement -> Match
Match (CodeUnitIndex
pos CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
lenb) CodeUnitIndex
lenb Replacement
replacement
      -- Case insensitive: length is interpreted as number of characters
      CaseSensitivity
IgnoreCase -> \Replacement
haystack CodeUnitIndex
pos CodeUnitIndex
_ Priority
lenc Replacement
replacement ->
        -- We start at the last byte of the match, and look backwards.
        let start :: CodeUnitIndex
start = Replacement -> CodeUnitIndex -> Priority -> CodeUnitIndex
Utf8.skipCodePointsBackwards Replacement
haystack (CodeUnitIndex
posCodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
-CodeUnitIndex
1) (Priority
lencPriority -> Priority -> Priority
forall a. Num a => a -> a -> a
-Priority
1) in
        CodeUnitIndex -> CodeUnitIndex -> Replacement -> Match
Match CodeUnitIndex
start (CodeUnitIndex
pos CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
start) Replacement
replacement