-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2019 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}

module Data.Text.BoyerMoore.Searcher
    ( Searcher
    , automata
    , build
    , buildNeedleIdSearcher
    , buildWithValues
    , containsAll
    , containsAny
    , needles
    , numNeedles
    ) where


import Control.DeepSeq (NFData)
import Data.Bifunctor (first)
import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed)
import GHC.Generics (Generic)

import Data.Text.Utf8 (Text)
import Data.Text.BoyerMoore.Automaton (Automaton)

import qualified Data.Text.BoyerMoore.Automaton as BoyerMoore


-- | A set of needles with associated values, and Boyer-Moore automata to
-- efficiently find those needles.
--
-- INVARIANT: searcherAutomaton = BoyerMoore.buildAutomaton . searcherNeedles
-- To enforce this invariant, the fields are not exposed from this module.
-- There is a separate constructor function.
--
-- The purpose of this wrapper is to have a type that is Hashable and Eq, so we
-- can derive those for the types that embed the searcher, whithout
-- requiring the automaton itself to be Hashable or Eq, which would be both
-- wasteful and tedious. Because the automaton is fully determined by the
-- needles and associated values, it is sufficient to implement Eq and Hashable
-- in terms of the needles only.
--
-- We also use Hashed to cache the hash of the needles.
data Searcher v = Searcher
  { forall v. Searcher v -> Hashed [(Text, v)]
searcherNeedles :: Hashed [(Text, v)]
  , forall v. Searcher v -> Int
searcherNumNeedles :: Int
  , forall v. Searcher v -> [(Automaton, v)]
searcherAutomata :: [(Automaton, v)]
  } deriving (forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall v x. Rep (Searcher v) x -> Searcher v
forall v x. Searcher v -> Rep (Searcher v) x
$cto :: forall v x. Rep (Searcher v) x -> Searcher v
$cfrom :: forall v x. Searcher v -> Rep (Searcher v) x
Generic)

instance Show (Searcher v) where
  show :: Searcher v -> String
show Searcher v
_ = String
"Searcher _ _ _"

instance Hashable v => Hashable (Searcher v) where
  hashWithSalt :: Int -> Searcher v -> Int
hashWithSalt Int
salt Searcher v
searcher = forall a. Hashable a => Int -> a -> Int
hashWithSalt Int
salt forall a b. (a -> b) -> a -> b
$ forall v. Searcher v -> Hashed [(Text, v)]
searcherNeedles Searcher v
searcher
  {-# INLINE hashWithSalt #-}

instance Eq v => Eq (Searcher v) where
  Searcher Hashed [(Text, v)]
xs Int
nx [(Automaton, v)]
_ == :: Searcher v -> Searcher v -> Bool
== Searcher Hashed [(Text, v)]
ys Int
ny [(Automaton, v)]
_ = Int
nx forall a. Eq a => a -> a -> Bool
== Int
ny Bool -> Bool -> Bool
&& Hashed [(Text, v)]
xs forall a. Eq a => a -> a -> Bool
== Hashed [(Text, v)]
ys
  {-# INLINE (==) #-}

instance NFData v => NFData (Searcher v)

-- | Builds the Searcher for a list of needles without values.
-- This is useful for just checking whether the haystack contains the needles.
build :: [Text] -> Searcher ()
{-# INLINABLE build #-}
build :: [Text] -> Searcher ()
build = forall v. Hashable v => [(Text, v)] -> Searcher v
buildWithValues forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b c. (a -> b -> c) -> b -> a -> c
flip forall a b. [a] -> [b] -> [(a, b)]
zip (forall a. a -> [a]
repeat ())

-- | Builds the Searcher for a list of needles.
buildWithValues :: Hashable v => [(Text, v)] -> Searcher v
{-# INLINABLE buildWithValues #-}
buildWithValues :: forall v. Hashable v => [(Text, v)] -> Searcher v
buildWithValues [(Text, v)]
ns =
  forall v.
Hashed [(Text, v)] -> Int -> [(Automaton, v)] -> Searcher v
Searcher (forall a. Hashable a => a -> Hashed a
hashed [(Text, v)]
ns) (forall (t :: * -> *) a. Foldable t => t a -> Int
length [(Text, v)]
ns) forall a b. (a -> b) -> a -> b
$ forall a b. (a -> b) -> [a] -> [b]
map (forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first Text -> Automaton
BoyerMoore.buildAutomaton) [(Text, v)]
ns

needles :: Searcher v -> [(Text, v)]
needles :: forall v. Searcher v -> [(Text, v)]
needles = forall a. Hashed a -> a
unhashed forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall v. Searcher v -> Hashed [(Text, v)]
searcherNeedles

automata :: Searcher v -> [(Automaton, v)]
automata :: forall v. Searcher v -> [(Automaton, v)]
automata = forall v. Searcher v -> [(Automaton, v)]
searcherAutomata

numNeedles :: Searcher v -> Int
numNeedles :: forall v. Searcher v -> Int
numNeedles = forall v. Searcher v -> Int
searcherNumNeedles

-- | Return whether the haystack contains any of the needles.
-- This function is marked noinline as an inlining boundary. BoyerMoore.runText is
-- marked inline, so this function will be optimized to report only whether
-- there is a match, and not construct a list of matches. We don't want this
-- function be inline, to make sure that the conditions of the caller don't
-- affect how this function is optimized. There is little to gain from
-- additional inlining. The pragma is not an optimization in itself, rather it
-- is a defence against fragile optimizer decisions.
{-# NOINLINE containsAny #-}
containsAny :: Searcher () -> Text -> Bool
containsAny :: Searcher () -> Text -> Bool
containsAny !Searcher ()
searcher !Text
text =
  let
    -- On the first match, return True immediately.
    f :: p -> p -> Next Bool
f p
_acc p
_match = forall a. a -> Next a
BoyerMoore.Done Bool
True
  in
    forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (\(Automaton
automaton, ()) -> forall a.
a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
BoyerMoore.runText Bool
False forall {p} {p}. p -> p -> Next Bool
f Automaton
automaton Text
text) (forall v. Searcher v -> [(Automaton, v)]
automata Searcher ()
searcher)
-- | Build a 'Searcher' that returns the needle's index in the needle list when it matches.

buildNeedleIdSearcher :: [Text] -> Searcher Int
buildNeedleIdSearcher :: [Text] -> Searcher Int
buildNeedleIdSearcher ![Text]
ns =
  forall v. Hashable v => [(Text, v)] -> Searcher v
buildWithValues forall a b. (a -> b) -> a -> b
$ forall a b. [a] -> [b] -> [(a, b)]
zip [Text]
ns [Int
0..]

-- | Like 'containsAny', but checks whether all needles match instead.
-- Use 'buildNeedleIdSearcher' to get an appropriate 'Searcher'.
{-# NOINLINE containsAll #-}
containsAll :: Searcher Int -> Text -> Bool
containsAll :: Searcher Int -> Text -> Bool
containsAll !Searcher Int
searcher !Text
text =
  let
    -- On the first match, return True immediately.
    f :: p -> p -> Next Bool
f p
_acc p
_match = forall a. a -> Next a
BoyerMoore.Done Bool
True
  in
    forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (\(Automaton
automaton, Int
_) -> forall a.
a -> (a -> CodeUnitIndex -> Next a) -> Automaton -> Text -> a
BoyerMoore.runText Bool
False forall {p} {p}. p -> p -> Next Bool
f Automaton
automaton Text
text) (forall v. Searcher v -> [(Automaton, v)]
automata Searcher Int
searcher)