-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2019 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}

module Data.Text.BoyerMoore.Searcher
    ( Searcher
    , automata
    , build
    , buildNeedleIdSearcher
    , buildWithValues
    , containsAll
    , containsAny
    , needles
    , numNeedles
    ) where


import Control.DeepSeq (NFData)
import Data.Bifunctor (first)
import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed)
import GHC.Generics (Generic)

import Data.Text.Utf8 (Text)
import Data.Text.BoyerMoore.Automaton (Automaton)

import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore


-- | A set of needles with associated values, and Boyer-Moore automata to
-- efficiently find those needles.
--
-- INVARIANT: searcherAutomaton = BoyerMoore.buildAutomaton . searcherNeedles
-- To enforce this invariant, the fields are not exposed from this module.
-- There is a separate constructor function.
--
-- The purpose of this wrapper is to have a type that is Hashable and Eq, so we
-- can derive those for the types that embed the searcher, whithout
-- requiring the automaton itself to be Hashable or Eq, which would be both
-- wasteful and tedious. Because the automaton is fully determined by the
-- needles and associated values, it is sufficient to implement Eq and Hashable
-- in terms of the needles only.
--
-- We also use Hashed to cache the hash of the needles.
data Searcher v = Searcher
  { Searcher v -> Hashed [(Text, v)]
searcherNeedles :: Hashed [(Text, v)]
  , Searcher v -> Int
searcherNumNeedles :: Int
  , Searcher v -> [(Automaton, v)]
searcherAutomata :: [(Automaton, v)]
  } deriving ((forall x. Searcher v -> Rep (Searcher v) x)
-> (forall x. Rep (Searcher v) x -> Searcher v)
-> Generic (Searcher v)
forall x. Rep (Searcher v) x -> Searcher v
forall x. Searcher v -> Rep (Searcher v) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall v x. Rep (Searcher v) x -> Searcher v
forall v x. Searcher v -> Rep (Searcher v) x
$cto :: forall v x. Rep (Searcher v) x -> Searcher v
$cfrom :: forall v x. Searcher v -> Rep (Searcher v) x
Generic)

instance Show (Searcher v) where
  show :: Searcher v -> String
show Searcher v
_ = String
"Searcher _ _ _"

instance Hashable v => Hashable (Searcher v) where
  hashWithSalt :: Int -> Searcher v -> Int
hashWithSalt Int
salt Searcher v
searcher = Int -> Hashed [(Text, v)] -> Int
forall a. Hashable a => Int -> a -> Int
hashWithSalt Int
salt (Hashed [(Text, v)] -> Int) -> Hashed [(Text, v)] -> Int
forall a b. (a -> b) -> a -> b
$ Searcher v -> Hashed [(Text, v)]
forall v. Searcher v -> Hashed [(Text, v)]
searcherNeedles Searcher v
searcher
  {-# INLINE hashWithSalt #-}

instance Eq v => Eq (Searcher v) where
  Searcher Hashed [(Text, v)]
xs Int
nx [(Automaton, v)]
_ == :: Searcher v -> Searcher v -> Bool
== Searcher Hashed [(Text, v)]
ys Int
ny [(Automaton, v)]
_ = (Hashed [(Text, v)]
xs, Int
nx) (Hashed [(Text, v)], Int) -> (Hashed [(Text, v)], Int) -> Bool
forall a. Eq a => a -> a -> Bool
== (Hashed [(Text, v)]
ys, Int
ny)
  {-# INLINE (==) #-}

instance NFData v => NFData (Searcher v)

-- | Builds the Searcher for a list of needles without values.
-- This is useful for just checking whether the haystack contains the needles.
build :: [Text] -> Searcher ()
{-# INLINABLE build #-}
build :: [Text] -> Searcher ()
build = [(Text, ())] -> Searcher ()
forall v. Hashable v => [(Text, v)] -> Searcher v
buildWithValues ([(Text, ())] -> Searcher ())
-> ([Text] -> [(Text, ())]) -> [Text] -> Searcher ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([Text] -> [()] -> [(Text, ())]) -> [()] -> [Text] -> [(Text, ())]
forall a b c. (a -> b -> c) -> b -> a -> c
flip [Text] -> [()] -> [(Text, ())]
forall a b. [a] -> [b] -> [(a, b)]
zip (() -> [()]
forall a. a -> [a]
repeat ())

-- | Builds the Searcher for a list of needles.
buildWithValues :: Hashable v => [(Text, v)] -> Searcher v
{-# INLINABLE buildWithValues #-}
buildWithValues :: [(Text, v)] -> Searcher v
buildWithValues [(Text, v)]
ns =
  Hashed [(Text, v)] -> Int -> [(Automaton, v)] -> Searcher v
forall v.
Hashed [(Text, v)] -> Int -> [(Automaton, v)] -> Searcher v
Searcher ([(Text, v)] -> Hashed [(Text, v)]
forall a. Hashable a => a -> Hashed a
hashed [(Text, v)]
ns) ([(Text, v)] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(Text, v)]
ns) ([(Automaton, v)] -> Searcher v) -> [(Automaton, v)] -> Searcher v
forall a b. (a -> b) -> a -> b
$ ((Text, v) -> (Automaton, v)) -> [(Text, v)] -> [(Automaton, v)]
forall a b. (a -> b) -> [a] -> [b]
map ((Text -> Automaton) -> (Text, v) -> (Automaton, v)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first Text -> Automaton
BoyerMoore.buildAutomaton) [(Text, v)]
ns

needles :: Searcher v -> [(Text, v)]
needles :: Searcher v -> [(Text, v)]
needles = Hashed [(Text, v)] -> [(Text, v)]
forall a. Hashed a -> a
unhashed (Hashed [(Text, v)] -> [(Text, v)])
-> (Searcher v -> Hashed [(Text, v)]) -> Searcher v -> [(Text, v)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Searcher v -> Hashed [(Text, v)]
forall v. Searcher v -> Hashed [(Text, v)]
searcherNeedles

automata :: Searcher v -> [(Automaton, v)]
automata :: Searcher v -> [(Automaton, v)]
automata = Searcher v -> [(Automaton, v)]
forall v. Searcher v -> [(Automaton, v)]
searcherAutomata

numNeedles :: Searcher v -> Int
numNeedles :: Searcher v -> Int
numNeedles = Searcher v -> Int
forall v. Searcher v -> Int
searcherNumNeedles

-- | Return whether the haystack contains any of the needles.
-- This function is marked noinline as an inlining boundary. BoyerMoore.runText is
-- marked inline, so this function will be optimized to report only whether
-- there is a match, and not construct a list of matches. We don't want this
-- function be inline, to make sure that the conditions of the caller don't
-- affect how this function is optimized. There is little to gain from
-- additional inlining. The pragma is not an optimization in itself, rather it
-- is a defence against fragile optimizer decisions.
{-# NOINLINE containsAny #-}
containsAny :: Searcher () -> Text -> Bool
containsAny :: Searcher () -> Text -> Bool
containsAny !Searcher ()
searcher !Text
text =
  let
    -- On the first match, return True immediately.
    f :: p -> p -> Next Bool
f p
_acc p
_match = Bool -> Next Bool
forall a. a -> Next a
BoyerMoore.Done Bool
True
  in
    ((Automaton, ()) -> Bool) -> [(Automaton, ())] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (\(Automaton
automaton, ()) -> Bool
-> (Bool -> CodeUnitIndex -> Next Bool)
-> Automaton
-> Text
-> Bool
forall a.
a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
BoyerMoore.runText Bool
False Bool -> CodeUnitIndex -> Next Bool
forall p p. p -> p -> Next Bool
f Automaton
automaton Text
text) (Searcher () -> [(Automaton, ())]
forall v. Searcher v -> [(Automaton, v)]
automata Searcher ()
searcher)
-- | Build a 'Searcher' that returns the needle's index in the needle list when it matches.

buildNeedleIdSearcher :: [Text] -> Searcher Int
buildNeedleIdSearcher :: [Text] -> Searcher Int
buildNeedleIdSearcher ![Text]
ns =
  [(Text, Int)] -> Searcher Int
forall v. Hashable v => [(Text, v)] -> Searcher v
buildWithValues ([(Text, Int)] -> Searcher Int) -> [(Text, Int)] -> Searcher Int
forall a b. (a -> b) -> a -> b
$ [Text] -> [Int] -> [(Text, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Text]
ns [Int
0..]

-- | Like 'containsAny', but checks whether all needles match instead.
-- Use 'buildNeedleIdSearcher' to get an appropriate 'Searcher'.
{-# NOINLINE containsAll #-}
containsAll :: Searcher Int -> Text -> Bool
containsAll :: Searcher Int -> Text -> Bool
containsAll !Searcher Int
searcher !Text
text =
  let
    -- On the first match, return True immediately.
    f :: p -> p -> Next Bool
f p
_acc p
_match = Bool -> Next Bool
forall a. a -> Next a
BoyerMoore.Done Bool
True
  in
    ((Automaton, Int) -> Bool) -> [(Automaton, Int)] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (\(Automaton
automaton, Int
_) -> Bool
-> (Bool -> CodeUnitIndex -> Next Bool)
-> Automaton
-> Text
-> Bool
forall a.
a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
BoyerMoore.runText Bool
False Bool -> CodeUnitIndex -> Next Bool
forall p p. p -> p -> Next Bool
f Automaton
automaton Text
text) (Searcher Int -> [(Automaton, Int)]
forall v. Searcher v -> [(Automaton, v)]
automata Searcher Int
searcher)