{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE OverloadedStrings #-}

module Data.Text.BoyerMooreCISpec
  ( spec
  ) where


import Control.Monad (forM_)
import Test.Hspec (Spec, describe, it, shouldBe, parallel)
import Test.Hspec.QuickCheck (modifyMaxSuccess, prop)
import Data.Text (Text)
import Test.QuickCheck (Arbitrary (arbitrary, shrink), forAllShrink)
import Test.QuickCheck.Instances ()

import qualified Data.Char as Char
import qualified Data.Text as Text
import qualified Data.Text.Utf8 as Utf8
import qualified Test.QuickCheck as QuickCheck

import Data.Text.CaseSensitivity (CaseSensitivity (..))
import Data.Text.TestInstances (arbitraryAlphabet, arbitraryFragment, arbitraryNeedleHaystack,
                                arbitraryNeedlesHaystack)

import qualified Data.Text.AhoCorasick.Replacer as AhoReplacer
import qualified Data.Text.BoyerMooreCI.Automaton as BoyerMooreCI
import qualified Data.Text.BoyerMooreCI.Searcher as Searcher
import qualified Data.Text.BoyerMooreCI.Replacer as Replacer


spec :: Spec
spec = parallel $ modifyMaxSuccess (const 200) $ do

  describe "automaton" $ do

    it "works for some basic examples" $ do
      matchPositions "a" "abca" `shouldBe` [(0,0), (3,3)]
      matchPositions "a" "ABCA" `shouldBe` [(0,0), (3,3)]
      matchPositions "abc" "abca" `shouldBe` [(0,2)]
      matchPositions "abc" "ABCA" `shouldBe` [(0,2)]
      matchPositions "bc" "abca" `shouldBe` [(1,2)]
      matchPositions "bc" "ABCA" `shouldBe` [(1,2)]

    it "does not yield overlapping matches" $ do
      matchPositions "aba" "abababa" `shouldBe` [(0,2), (4,6)]
      matchPositions "aba" "ABaBaBA" `shouldBe` [(0,2), (4,6)]

    it "does not work with uppercase needles" $ do
      matchPositions "A" "aaaa" `shouldBe` []
      matchPositions "A" "AAAA" `shouldBe` []

    it "works with cyrillic characters" $ do
      -- Cyrillic characters are all two bytes.
      -- The match positions are byte indices (not char indices).
      matchPositions "п" "ипсум" `shouldBe` [(2,3)]
      matchPositions "п" "ИПСУМ" `shouldBe` [(2,3)]

      matchPositions "лорем" "Лорем" `shouldBe` [(0,9)]
      matchTexts "лорем" "Лорем" `shouldBe` ["Лорем"]

      matchPositions "лорем" "ЛОРЕМ" `shouldBe` [(0,9)]
      matchTexts "лорем" "ЛОРЕМ" `shouldBe` ["ЛОРЕМ"]

      -- This is an interesting case for badCharLookup, because the mismatch
      -- happens at "с" which is the first character in the needle.
      matchTexts "сит" "итсит" `shouldBe` ["сит"]
      matchTexts "сит" "ИТСИТ" `shouldBe` ["СИТ"]

    it "works with mixed byte lengths" $ do
      -- Space is 1 byte
      matchTexts "сит" "Лор сит амет" `shouldBe` ["сит"]
      matchTexts "сит" "Лорем ипсум долор сит амет" `shouldBe` ["сит"]
      matchTexts "сит" "ЛОРЕМ ИПСУМ ДОЛОР СИТ АМЕТ" `shouldBe` ["СИТ"]

      matchTexts "💩b" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["💩b"]
      matchTexts "𝄞" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["𝄞","𝄞","𝄞","𝄞"]
      matchTexts "a" "ЛОРЕМab𝄞💩𝄞ДОЛab💩baåÅÅ𝄞𝄞ßẞ" `shouldBe` ["a","a","a"]

    it "works with ⱥ and ⱦ" $ do
      -- The letters ⱥ and ⱦ are 3 UTF8 bytes, but have unlowerings Ⱥ and Ⱦ of 2 bytes
      matchPositions "ⱥⱦⱥⱦⱥⱦ" "ⱥⱦⱥⱦⱥⱦ" `shouldBe` [(0, 17)]
      matchTexts "ⱥⱦⱥⱦⱥⱦ" "ⱥⱦⱥⱦⱥⱦ" `shouldBe` ["ⱥⱦⱥⱦⱥⱦ"]
      matchPositions "ⱥⱦⱥⱦⱥⱦ" "ȺȾȺȾȺȾ" `shouldBe` [(0, 11)]
      matchTexts "ⱥⱦⱥⱦⱥⱦ" "ȺȾȺȾȺȾ" `shouldBe` ["ȺȾȺȾȺȾ"]

      matchPositions "ⱥⱦⱥⱦⱥⱦ" "ȺⱦⱥȾⱥȾ" `shouldBe` [(0, 14)]
      matchTexts "ⱥⱦⱥⱦⱥⱦ" "ȺⱦⱥȾⱥȾ" `shouldBe` ["ȺⱦⱥȾⱥȾ"]

    describe "with a needle equal to the haystack" $ do

      it "reports a single match for a repeated character" $
        forM_ [1..128] $ \n ->
          let needle = Text.replicate n "a" in
          matchPositions needle needle `shouldBe` [(0, Utf8.lengthUtf8 needle-1)]

      prop "reports a single match for any arbitrary text fragment" $
        QuickCheck.forAll (arbitraryAlphabet >>= arbitraryFragment) $ \text ->
          let needle = Utf8.lowerUtf8 text in
          matchPositions needle text `shouldBe` [(0, Utf8.lengthUtf8 text-1)]

    describe "with sliced text (using nonzero internal offset)" $ do

      it "still reports offset relative to the text start" $
        -- The match position should be relative to the start of the text "a".
        -- Even if this text is represented as a slice of "bbba" internally.
        matchPositions "a" (Text.dropWhile (== 'b') "bbba") `shouldBe` [(0, 0)]

    it "matches ß and ẞ" $ do
      matchTexts "groß" "Großfräsmaschinenöffnungstür" `shouldBe` ["Groß"]
      matchTexts "groß" "GROẞFRÄSMASCHINENÖFFNUNGSTÜR" `shouldBe` ["GROẞ"]
      matchTexts "öffnung" "Großfräsmaschinenöffnungstür" `shouldBe` ["öffnung"]
      matchTexts "öffnung" "GROẞFRÄSMASCHINENÖFFNUNGSTÜR" `shouldBe` ["ÖFFNUNG"]


  describe "minimumSkipForCodePoint" $
    it "should match the reference implementation" $ do
      forM_ [minBound..maxBound] $ \c ->
        BoyerMooreCI.minimumSkipForCodePoint c `shouldBe` refMinimumSkipForCodePoint c


  describe "Searcher" $ do
    describe "containsAny" $ do

      -- For the edge case where a needle is the empty string,
      -- 'Text.isInfixOf' and 'Searcher.containsAny' are different:
      --
      -- @
      -- Text.isInfixOf "" "abc" == True /= False == Searcher.containsAny (Searcher.build [""]) "abc"
      -- @
      --
      -- However, at this point we probably shouldn't break this property.
      prop "is equivalent to disjunction of Text.isInfixOf calls*" $ do
        QuickCheck.forAllShrink arbitraryNeedlesHaystack shrink $ \(needles, haystack) -> do
          let
            lneedles = map Utf8.lowerUtf8 needles  -- needles must be lowercase
            searcher = Searcher.build lneedles
            test needle =
              not (Text.null needle) && needle `Text.isInfixOf` (Utf8.lowerUtf8 haystack)
          Searcher.containsAny searcher haystack `shouldBe` any test lneedles

    describe "containsAll" $ do
      prop "is equivalent to conjunction of Text.isInfixOf calls*" $ do
        QuickCheck.forAllShrink arbitraryNeedlesHaystack shrink $ \(needles, haystack) -> do
          let
            lneedles = map Utf8.lowerUtf8 needles  -- needles must be lowercase
            searcher = Searcher.buildNeedleIdSearcher lneedles
            test needle =
              not (Text.null needle) && needle `Text.isInfixOf` (Utf8.lowerUtf8 haystack)
          Searcher.containsAll searcher haystack `shouldBe` all test lneedles

  describe "Replacer" $ do
    describe "replaceSingleLimited" $ do

      prop "is equivalent to Aho-Corasick replacer with a single needle" $
        forAllShrink arbitraryNeedleHaystack shrink $ \(needle, haystack) ->
        forAllShrink arbitrary shrink $ \replacement ->
        let
          lneedle = Utf8.lowerUtf8 needle
          expected =
            AhoReplacer.run (AhoReplacer.build IgnoreCase [(lneedle, replacement)]) haystack
          auto = BoyerMooreCI.buildAutomaton lneedle
          actual = Replacer.replaceSingleLimited auto replacement haystack maxBound
        in
          actual `shouldBe` Just expected

-- Reference implementation for BoyerMooreCI.minimumSkipForCodePoint
refMinimumSkipForCodePoint :: Utf8.CodePoint -> Utf8.CodeUnitIndex
refMinimumSkipForCodePoint cp =
  let codePointLength = length . Utf8.unicode2utf8 . Char.ord in
  case Utf8.unlowerCodePoint cp of
    [] ->
      -- Input is upper case, so this is undefined behaviour but we match what the real
      -- implementation does:
      Utf8.CodeUnitIndex $ codePointLength cp
    ucs -> Utf8.CodeUnitIndex $ minimum $ map codePointLength ucs


-- | Return indices of the first and last byte of every match
matchPositions :: Text -> Text -> [(Utf8.CodeUnitIndex, Utf8.CodeUnitIndex)]
matchPositions needle =
  let
    !automaton = BoyerMooreCI.buildAutomaton needle
    prependMatch matches matchStart matchEnd =
      BoyerMooreCI.Step ((matchStart, matchEnd) : matches)
  in
    \haystack -> reverse $ BoyerMooreCI.runText [] prependMatch automaton haystack

positionText :: Text -> (Utf8.CodeUnitIndex, Utf8.CodeUnitIndex) -> Text
positionText haystack (firstByte, lastByte) =
  let len = lastByte - firstByte + 1  -- length is 1 if firstByte==lastByte
  in Utf8.unsafeSliceUtf8 firstByte len haystack

matchTexts :: Text -> Text -> [Text]
matchTexts needle haystack =
  map (positionText haystack) $ matchPositions needle haystack