-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2019 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}

module Data.Text.AhoCorasick.Searcher
  ( Searcher
  , build
  , buildWithValues
  , needles
  , numNeedles
  , automaton
  , caseSensitivity
  , containsAny
  , setSearcherCaseSensitivity
  )
  where

import Control.DeepSeq (NFData)
import Data.Hashable (Hashable (hashWithSalt), Hashed, hashed, unhashed)
import Data.Semigroup (Semigroup, (<>))
import Data.Text (Text)
import GHC.Generics (Generic)

#if defined(HAS_AESON)
import Data.Aeson ((.:), (.=))
import qualified Data.Aeson as AE
#endif

import Data.Text.AhoCorasick.Automaton (CaseSensitivity (..))

import qualified Data.Text.AhoCorasick.Automaton as Aho
import qualified Data.Text.Utf16 as Utf16

-- | A set of needles with associated values, and an Aho-Corasick automaton to
-- efficiently find those needles.
--
-- INVARIANT: searcherAutomaton = Aho.build . searcherNeedles
-- To enforce this invariant, the fields are not exposed from this module.
-- There is a separate constructor function.
--
-- The purpose of this wrapper is to have a type that is Hashable and Eq, so we
-- can derive those for types that embed the searcher, whithout requiring the
-- automaton itself to be Hashable or Eq, which would be both wasteful and
-- tedious. Because the automaton is fully determined by the needles and
-- associated values, it is sufficient to implement Eq and Hashable in terms of
-- the needles only.
--
-- We also use Hashed to cache the hash of the needles.
data Searcher v = Searcher
  { searcherCaseSensitive :: CaseSensitivity
  , searcherNeedles :: Hashed [(Text, v)]
  , searcherNumNeedles :: Int
  , searcherAutomaton :: Aho.AcMachine v
  } deriving (Generic)

#if defined(HAS_AESON)
instance AE.ToJSON v => AE.ToJSON (Searcher v) where
  toJSON s = AE.object
    [ "needles" .= needles s
    , "caseSensitivity" .= caseSensitivity s
    ]

instance (Hashable v, AE.FromJSON v) => AE.FromJSON (Searcher v) where
  parseJSON = AE.withObject "Searcher" $ \o -> buildWithValues <$> o .: "caseSensitivity" <*> o .: "needles"
#endif

instance Show (Searcher v) where
  show _ = "Searcher _ _ _"

instance Hashable v => Hashable (Searcher v) where
  hashWithSalt salt searcher = hashWithSalt salt $ searcherNeedles searcher
  {-# INLINE hashWithSalt #-}

instance Eq v => Eq (Searcher v) where
  -- Since we store the length of the needle list anyway,
  -- we can use it to early out if there is a length mismatch.
  Searcher cx xs nx _ == Searcher cy ys ny _ = (nx, xs, cx) == (ny, ys, cy)
  {-# INLINE (==) #-}

instance NFData v => NFData (Searcher v)

-- NOTE: Although we could implement Semigroup for every v by just concatenating
-- needle lists, we don't, because this might lead to unexpected results. For
-- example, if v is (Int, a) where the Int is a priority, combining two
-- searchers might want to discard priorities, concatenate the needle lists, and
-- reassign priorities, rather than concatenating the needle lists as-is and
-- possibly having duplicate priorities in the resulting searcher.
instance Semigroup (Searcher ()) where
  x <> y
    | caseSensitivity x == caseSensitivity y
      = buildWithValues (searcherCaseSensitive x) (needles x <> needles y)
    | otherwise = error "Combining searchers of different case sensitivity"
  {-# INLINE (<>) #-}

-- | Builds the Searcher for a list of needles
-- The caller is responsible that the needles are lower case in case the IgnoreCase
-- is used for case sensitivity
build :: CaseSensitivity -> [Text] -> Searcher ()
build case_ = buildWithValues case_ . fmap (\x -> (x, ()))

-- | The caller is responsible that the needles are lower case in case the IgnoreCase
-- is used for case sensitivity
buildWithValues :: Hashable v => CaseSensitivity -> [(Text, v)] -> Searcher v
{-# INLINABLE buildWithValues #-}
buildWithValues case_ ns =
  let
    unpack (text, value) = (Utf16.unpackUtf16 text, value)
  in
    Searcher case_ (hashed ns) (length ns) $ Aho.build $ fmap unpack ns

needles :: Searcher v -> [(Text, v)]
needles = unhashed . searcherNeedles

numNeedles :: Searcher v -> Int
numNeedles = searcherNumNeedles

automaton :: Searcher v -> Aho.AcMachine v
automaton = searcherAutomaton

caseSensitivity :: Searcher v -> CaseSensitivity
caseSensitivity = searcherCaseSensitive

-- | Updates the case sensitivity of the searcher. Does not change the
-- capitilization of the needles. The caller should be certain that if IgnoreCase
-- is passed, the needles are already lower case.
setSearcherCaseSensitivity :: CaseSensitivity -> Searcher v -> Searcher v
setSearcherCaseSensitivity case_ searcher = searcher{
    searcherCaseSensitive = case_
  }

-- | Return whether the haystack contains any of the needles.
-- Case sensitivity depends on the properties of the searcher
-- This function is marked noinline as an inlining boundary. Aho.runText is
-- marked inline, so this function will be optimized to report only whether
-- there is a match, and not construct a list of matches. We don't want this
-- function be inline, to make sure that the conditions of the caller don't
-- affect how this function is optimized. There is little to gain from
-- additional inlining. The pragma is not an optimization in itself, rather it
-- is a defence against fragile optimizer decisions.
{-# NOINLINE containsAny #-}
containsAny :: Searcher () -> Text -> Bool
containsAny !searcher !text =
  let
    -- On the first match, return True immediately.
    f _acc _match = Aho.Done True
  in case caseSensitivity searcher of
    CaseSensitive  -> Aho.runText False f (automaton searcher) text
    IgnoreCase      -> Aho.runLower False f (automaton searcher) text