{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BinaryLiterals #-}
module Regex.KDE.Match
 ( matchRegex
 ) where

import qualified Data.ByteString as B
import Data.ByteString (ByteString)
import qualified Data.ByteString.UTF8 as U
import qualified Data.Set as Set
import Data.Set (Set)
import Regex.KDE.Regex
import qualified Data.IntMap.Strict as M
#if !MIN_VERSION_base(4,11,0)
import Data.Semigroup ((<>))
#endif

-- Note that all matches are from the beginning of the string.
-- The ^ anchor is implicit at the beginning of the regex.

data Match =
   Match { Match -> ByteString
matchBytes    :: !ByteString
         , Match -> Int
matchOffset   :: !Int
         , Match -> IntMap (Int, Int)
matchCaptures :: !(M.IntMap (Int, Int))
                                  -- starting offset, length in bytes
         } deriving (Int -> Match -> ShowS
[Match] -> ShowS
Match -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Match] -> ShowS
$cshowList :: [Match] -> ShowS
show :: Match -> String
$cshow :: Match -> String
showsPrec :: Int -> Match -> ShowS
$cshowsPrec :: Int -> Match -> ShowS
Show, Match -> Match -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Match -> Match -> Bool
$c/= :: Match -> Match -> Bool
== :: Match -> Match -> Bool
$c== :: Match -> Match -> Bool
Eq)

-- preferred matches are <=
instance Ord Match where
  Match
m1 <= :: Match -> Match -> Bool
<= Match
m2
    | Match -> Int
matchOffset Match
m1 forall a. Ord a => a -> a -> Bool
> Match -> Int
matchOffset Match
m2 = Bool
True
    | Match -> Int
matchOffset Match
m1 forall a. Ord a => a -> a -> Bool
< Match -> Int
matchOffset Match
m2 = Bool
False
    | Bool
otherwise = Match -> IntMap (Int, Int)
matchCaptures Match
m1 forall a. Ord a => a -> a -> Bool
>= Match -> IntMap (Int, Int)
matchCaptures Match
m2

mapMatching :: (Match -> Match) -> Set Match -> Set Match
mapMatching :: (Match -> Match) -> Set Match -> Set Match
mapMatching Match -> Match
f = forall a. (a -> Bool) -> Set a -> Set a
Set.filter ((forall a. Ord a => a -> a -> Bool
>= Int
0) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Match -> Int
matchOffset) forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map Match -> Match
f

-- we take the n best matches to avoid pathological slowdown
sizeLimit :: Int
sizeLimit :: Int
sizeLimit = Int
2000

-- prune matches if it gets out of hand
prune :: Set Match -> Set Match
prune :: Set Match -> Set Match
prune Set Match
ms = if forall a. Set a -> Int
Set.size Set Match
ms forall a. Ord a => a -> a -> Bool
> Int
sizeLimit
              then forall a. Int -> Set a -> Set a
Set.take Int
sizeLimit Set Match
ms
              else Set Match
ms

-- first argument is a map of capturing groups, needed for Subroutine.
exec :: M.IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec :: IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
_ Direction
_ Regex
MatchNull = forall a. a -> a
id
exec IntMap Regex
cgs Direction
dir (Lazy Regex
re) = -- note: the action is below under Concat
  IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir (Regex -> Regex -> Regex
MatchConcat (Regex -> Regex
Lazy Regex
re) Regex
MatchNull)
exec IntMap Regex
cgs Direction
dir (Possessive Regex
re) =
  forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr
    (\Match
elt Set Match
s -> case forall a. Set a -> Maybe a
Set.lookupMin (IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
re (forall a. a -> Set a
Set.singleton Match
elt)) of
                 Maybe Match
Nothing -> Set Match
s
                 Just Match
m  -> forall a. Ord a => a -> Set a -> Set a
Set.insert Match
m Set Match
s)
    forall a. Monoid a => a
mempty
exec IntMap Regex
cgs Direction
dir (MatchDynamic Int
n) = -- if this hasn't been replaced, match literal
  IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir ((Char -> Bool) -> Regex
MatchChar (forall a. Eq a => a -> a -> Bool
== Char
'%') forall a. Semigroup a => a -> a -> a
<>
            forall a. Monoid a => [a] -> a
mconcat (forall a b. (a -> b) -> [a] -> [b]
map (\Char
c -> (Char -> Bool) -> Regex
MatchChar (forall a. Eq a => a -> a -> Bool
== Char
c)) (forall a. Show a => a -> String
show Int
n)))
exec IntMap Regex
_ Direction
_ Regex
AssertEnd = forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> Match -> Int
matchOffset Match
m forall a. Eq a => a -> a -> Bool
== ByteString -> Int
B.length (Match -> ByteString
matchBytes Match
m))
exec IntMap Regex
_ Direction
_ Regex
AssertBeginning = forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> Match -> Int
matchOffset Match
m forall a. Eq a => a -> a -> Bool
== Int
0)
exec IntMap Regex
cgs Direction
_ (AssertPositive Direction
dir Regex
regex) =
  forall (f :: * -> *) a. (Foldable f, Ord a) => f (Set a) -> Set a
Set.unions forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map
    (\Match
m -> forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map (\Match
m' -> -- we keep captures but not matches
                            Match
m'{ matchBytes :: ByteString
matchBytes = Match -> ByteString
matchBytes Match
m,
                               matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m })
           forall a b. (a -> b) -> a -> b
$ IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
regex (forall a. a -> Set a
Set.singleton Match
m))
exec IntMap Regex
cgs Direction
_ (AssertNegative Direction
dir Regex
regex) =
  forall a. (a -> Bool) -> Set a -> Set a
Set.filter (\Match
m -> forall (t :: * -> *) a. Foldable t => t a -> Bool
null (IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
regex (forall a. a -> Set a
Set.singleton Match
m)))
exec IntMap Regex
_ Direction
_ Regex
AssertWordBoundary = forall a. (a -> Bool) -> Set a -> Set a
Set.filter Match -> Bool
atWordBoundary
exec IntMap Regex
_ Direction
Forward Regex
MatchAnyChar = (Match -> Match) -> Set Match -> Set Match
mapMatching forall a b. (a -> b) -> a -> b
$ \Match
m ->
  case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m)) of
    Maybe (Char, Int)
Nothing -> Match
m{ matchOffset :: Int
matchOffset = - Int
1}
    Just (Char
_,Int
n) -> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m forall a. Num a => a -> a -> a
+ Int
n }
exec IntMap Regex
_ Direction
Backward Regex
MatchAnyChar = (Match -> Match) -> Set Match -> Set Match
mapMatching forall a b. (a -> b) -> a -> b
$ \Match
m ->
  case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
    Maybe Int
Nothing  -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
    Just Int
off -> Match
m{ matchOffset :: Int
matchOffset = Int
off }
exec IntMap Regex
_ Direction
Forward (MatchChar Char -> Bool
f) = (Match -> Match) -> Set Match -> Set Match
mapMatching forall a b. (a -> b) -> a -> b
$ \Match
m ->
  case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m)) of
    Just (Char
c,Int
n) | Char -> Bool
f Char
c -> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m forall a. Num a => a -> a -> a
+ Int
n }
    Maybe (Char, Int)
_ -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec IntMap Regex
_ Direction
Backward (MatchChar Char -> Bool
f) = (Match -> Match) -> Set Match -> Set Match
mapMatching forall a b. (a -> b) -> a -> b
$ \Match
m ->
  case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
    Maybe Int
Nothing  -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
    Just Int
off ->
      case ByteString -> Maybe (Char, Int)
U.decode (Int -> ByteString -> ByteString
B.drop Int
off (Match -> ByteString
matchBytes Match
m)) of
        Just (Char
c,Int
_) | Char -> Bool
f Char
c -> Match
m{ matchOffset :: Int
matchOffset = Int
off }
        Maybe (Char, Int)
_                -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec IntMap Regex
cgs Direction
dir (MatchConcat (MatchConcat Regex
r1 Regex
r2) Regex
r3) =
  IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir (Regex -> Regex -> Regex
MatchConcat Regex
r1 (Regex -> Regex -> Regex
MatchConcat Regex
r2 Regex
r3))
exec IntMap Regex
cgs Direction
Forward (MatchConcat (Lazy Regex
r1) Regex
r2) =
  forall a b. (a -> b -> a) -> a -> Set b -> a
Set.foldl forall a. Ord a => Set a -> Set a -> Set a
Set.union forall a. Monoid a => a
mempty forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map
    (\Match
m ->
      let ms1 :: Set Match
ms1 = IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Forward Regex
r1 (forall a. a -> Set a
Set.singleton Match
m)
       in if forall a. Set a -> Bool
Set.null Set Match
ms1
             then Set Match
ms1
             else Set Match -> Set Match
go Set Match
ms1)
 where
  go :: Set Match -> Set Match
go Set Match
ms = case forall a. Set a -> Maybe a
Set.lookupMax Set Match
ms of   -- find shortest match
            Maybe Match
Nothing -> forall a. Set a
Set.empty
            Just Match
m' ->
              let s' :: Set Match
s' = IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Forward Regex
r2 (forall a. a -> Set a
Set.singleton Match
m')
               in if forall a. Set a -> Bool
Set.null Set Match
s'
                     then Set Match -> Set Match
go (forall a. Ord a => a -> Set a -> Set a
Set.delete Match
m' Set Match
ms)
                     else Set Match
s'
exec IntMap Regex
cgs Direction
Forward (MatchConcat Regex
r1 Regex
r2) = -- TODO longest match first
  \Set Match
ms ->
    let ms1 :: Set Match
ms1 = IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Forward Regex
r1 Set Match
ms
     in if forall a. Set a -> Bool
Set.null Set Match
ms1
           then Set Match
ms1
           else IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Forward Regex
r2 (Set Match -> Set Match
prune Set Match
ms1)
exec IntMap Regex
cgs Direction
Backward (MatchConcat Regex
r1 Regex
r2) =
  IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Backward Regex
r1 forall b c a. (b -> c) -> (a -> b) -> a -> c
. IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
Backward Regex
r2
exec IntMap Regex
cgs Direction
dir (MatchAlt Regex
r1 Regex
r2) = \Set Match
ms -> IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
r1 Set Match
ms forall a. Semigroup a => a -> a -> a
<> IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
r2 Set Match
ms
exec IntMap Regex
cgs Direction
dir (MatchSome Regex
re) = Set Match -> Set Match
go
 where
  go :: Set Match -> Set Match
go Set Match
ms = case IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
re Set Match
ms of
            Set Match
ms' | forall a. Set a -> Bool
Set.null Set Match
ms' -> forall a. Set a
Set.empty
                | Set Match
ms' forall a. Eq a => a -> a -> Bool
== Set Match
ms    -> Set Match
ms
                | Bool
otherwise    -> let ms'' :: Set Match
ms'' = Set Match -> Set Match
prune Set Match
ms'
                                   in Set Match
ms'' forall a. Semigroup a => a -> a -> a
<> Set Match -> Set Match
go Set Match
ms''
exec IntMap Regex
cgs Direction
dir (MatchCapture Int
i Regex
re) =
  forall a b. (a -> b -> b) -> b -> Set a -> b
Set.foldr forall a. Ord a => Set a -> Set a -> Set a
Set.union forall a. Set a
Set.empty forall b c a. (b -> c) -> (a -> b) -> a -> c
.
   forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map (\Match
m ->
     forall b a. Ord b => (a -> b) -> Set a -> Set b
Set.map (Match -> Match -> Match
captureDifference Match
m) (IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
re (forall a. a -> Set a
Set.singleton Match
m)))
 where
    captureDifference :: Match -> Match -> Match
captureDifference Match
m Match
m' =
      let len :: Int
len = Match -> Int
matchOffset Match
m' forall a. Num a => a -> a -> a
- Match -> Int
matchOffset Match
m
      in  Match
m'{ matchCaptures :: IntMap (Int, Int)
matchCaptures = forall a. Int -> a -> IntMap a -> IntMap a
M.insert Int
i (Match -> Int
matchOffset Match
m, Int
len)
                                  (Match -> IntMap (Int, Int)
matchCaptures Match
m') }
exec IntMap Regex
_ Direction
dir (MatchCaptured Int
n) = (Match -> Match) -> Set Match -> Set Match
mapMatching Match -> Match
matchCaptured
 where
   matchCaptured :: Match -> Match
matchCaptured Match
m =
     case forall a. Int -> IntMap a -> Maybe a
M.lookup Int
n (Match -> IntMap (Int, Int)
matchCaptures Match
m) of
       Just (Int
offset, Int
len) ->
              let capture :: ByteString
capture = Int -> ByteString -> ByteString
B.take Int
len forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
B.drop Int
offset forall a b. (a -> b) -> a -> b
$ Match -> ByteString
matchBytes Match
m
              in  case Direction
dir of
                     Direction
Forward | ByteString -> ByteString -> Bool
B.isPrefixOf ByteString
capture
                                 (Int -> ByteString -> ByteString
B.drop (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m))
                        -> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m forall a. Num a => a -> a -> a
+ ByteString -> Int
B.length ByteString
capture }
                     Direction
Backward | ByteString -> ByteString -> Bool
B.isSuffixOf ByteString
capture
                                 (Int -> ByteString -> ByteString
B.take (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m))
                        -> Match
m{ matchOffset :: Int
matchOffset = Match -> Int
matchOffset Match
m forall a. Num a => a -> a -> a
- ByteString -> Int
B.length ByteString
capture }
                     Direction
_  -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
       Maybe (Int, Int)
Nothing -> Match
m{ matchOffset :: Int
matchOffset = -Int
1 }
exec IntMap Regex
cgs Direction
dir (Subroutine Int
i) =
  case forall a. Int -> IntMap a -> Maybe a
M.lookup Int
i IntMap Regex
cgs of
    Maybe Regex
Nothing -> forall a. a -> a
id  -- ignore references to nonexistent groups
    Just Regex
re' -> IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
cgs Direction
dir Regex
re'

atWordBoundary :: Match -> Bool
atWordBoundary :: Match -> Bool
atWordBoundary Match
m =
  case Match -> Int
matchOffset Match
m of
    Int
0 -> Bool
True
    Int
n | Int
n forall a. Eq a => a -> a -> Bool
== ByteString -> Int
B.length (Match -> ByteString
matchBytes Match
m) -> Bool
True
      | Bool
otherwise ->
           case ByteString -> Int -> Maybe Int
lastCharOffset (Match -> ByteString
matchBytes Match
m) (Match -> Int
matchOffset Match
m) of
             Maybe Int
Nothing  -> Bool
True
             Just Int
off ->
               case ByteString -> String
U.toString (Int -> ByteString -> ByteString
B.drop (Int
off forall a. Num a => a -> a -> a
- Int
1) (Match -> ByteString
matchBytes Match
m)) of
                 (Char
prev:Char
cur:Char
next:String
_) ->
                   (Char -> Bool
isWordChar Char
cur forall a. Eq a => a -> a -> Bool
/= Char -> Bool
isWordChar Char
next) Bool -> Bool -> Bool
||
                   (Char -> Bool
isWordChar Char
cur forall a. Eq a => a -> a -> Bool
/= Char -> Bool
isWordChar Char
prev)
                 String
_ -> Bool
True

lastCharOffset :: ByteString -> Int -> Maybe Int
lastCharOffset :: ByteString -> Int -> Maybe Int
lastCharOffset ByteString
_ Int
0 = forall a. Maybe a
Nothing
lastCharOffset ByteString
_ Int
1 = forall a. Maybe a
Nothing
lastCharOffset ByteString
bs Int
n =
  case HasCallStack => ByteString -> Int -> Word8
B.index ByteString
bs (Int
n forall a. Num a => a -> a -> a
- Int
2) of
    Word8
w | Word8
w forall a. Ord a => a -> a -> Bool
<  Word8
0b10000000 -> forall a. a -> Maybe a
Just (Int
n forall a. Num a => a -> a -> a
- Int
1)
      | Word8
w forall a. Ord a => a -> a -> Bool
>= Word8
0b11000000 -> forall a. a -> Maybe a
Just (Int
n forall a. Num a => a -> a -> a
- Int
1)
      | Bool
otherwise -> ByteString -> Int -> Maybe Int
lastCharOffset ByteString
bs (Int
n forall a. Num a => a -> a -> a
- Int
1)

-- | Match a Regex against a (presumed UTF-8 encoded) ByteString,
-- returning the matched text and a map of (offset, size)
-- pairs for captures.  Note that all matches are from the
-- beginning of the string (a @^@ anchor is implicit).  Note
-- also that to avoid pathological performance in certain cases,
-- the matcher is limited to considering 2000 possible matches
-- at a time; when that threshold is reached, it discards
-- smaller matches.  Hence certain regexes may incorrectly fail to
-- match: e.g. @a*a{3000}$@ on a string of 3000 @a@s.
matchRegex :: Regex
           -> ByteString
           -> Maybe (ByteString, M.IntMap (Int, Int))
matchRegex :: Regex -> ByteString -> Maybe (ByteString, IntMap (Int, Int))
matchRegex Regex
re ByteString
bs =
  let capturingGroups :: IntMap Regex
capturingGroups = Regex -> IntMap Regex
extractCapturingGroups Regex
re
  in  Match -> (ByteString, IntMap (Int, Int))
toResult forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall a. Set a -> Maybe a
Set.lookupMin
               (IntMap Regex -> Direction -> Regex -> Set Match -> Set Match
exec IntMap Regex
capturingGroups Direction
Forward Regex
re
                  (forall a. a -> Set a
Set.singleton (ByteString -> Int -> IntMap (Int, Int) -> Match
Match ByteString
bs Int
0 forall a. IntMap a
M.empty)))
 where
   toResult :: Match -> (ByteString, IntMap (Int, Int))
toResult Match
m = (Int -> ByteString -> ByteString
B.take (Match -> Int
matchOffset Match
m) (Match -> ByteString
matchBytes Match
m), (Match -> IntMap (Int, Int)
matchCaptures Match
m))

extractCapturingGroups :: Regex -> M.IntMap Regex
extractCapturingGroups :: Regex -> IntMap Regex
extractCapturingGroups Regex
regex = forall a. Int -> a -> IntMap a
M.singleton Int
0 Regex
regex forall a. Semigroup a => a -> a -> a
<>
  case Regex
regex of
    MatchSome Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
    MatchAlt Regex
re1 Regex
re2 ->
      Regex -> IntMap Regex
extractCapturingGroups Regex
re1 forall a. Semigroup a => a -> a -> a
<> Regex -> IntMap Regex
extractCapturingGroups Regex
re2
    MatchConcat Regex
re1 Regex
re2 ->
      Regex -> IntMap Regex
extractCapturingGroups Regex
re1 forall a. Semigroup a => a -> a -> a
<> Regex -> IntMap Regex
extractCapturingGroups Regex
re2
    MatchCapture Int
i Regex
re -> forall a. Int -> a -> IntMap a
M.singleton Int
i Regex
re
    AssertPositive Direction
_ Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
    AssertNegative Direction
_ Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
    Possessive Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
    Lazy Regex
re -> Regex -> IntMap Regex
extractCapturingGroups Regex
re
    Regex
_ -> forall a. Monoid a => a
mempty