{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE BinaryLiterals #-} module Regex.KDE.Match ( matchRegex ) where import qualified Data.ByteString as B import Data.ByteString (ByteString) import qualified Data.ByteString.UTF8 as U import qualified Data.Set as Set import Data.Set (Set) import Regex.KDE.Regex import qualified Data.IntMap.Strict as M #if !MIN_VERSION_base(4,11,0) import Data.Semigroup ((<>)) #endif -- Note that all matches are from the beginning of the string. -- The ^ anchor is implicit at the beginning of the regex. data Match = Match { matchBytes :: !ByteString , matchOffset :: !Int , matchCaptures :: !(M.IntMap (Int, Int)) -- starting offset, length in bytes } deriving (Show, Eq) -- preferred matches are <= instance Ord Match where m1 <= m2 | matchOffset m1 > matchOffset m2 = True | matchOffset m1 < matchOffset m2 = False | otherwise = matchCaptures m1 >= matchCaptures m2 mapMatching :: (Match -> Match) -> Set Match -> Set Match mapMatching f = Set.filter ((>= 0) . matchOffset) . Set.map f -- we take the n best matches to avoid pathological slowdown sizeLimit :: Int sizeLimit = 2000 -- prune matches if it gets out of hand prune :: Set Match -> Set Match prune ms = if Set.size ms > sizeLimit then Set.take sizeLimit ms else ms -- first argument is the "top-level" regex, needed for Recurse. exec :: Regex -> Direction -> Regex -> Set Match -> Set Match exec _ _ MatchNull = id exec top dir (Lazy re) = -- note: the action is below under Concat exec top dir (MatchConcat (Lazy re) MatchNull) exec top dir (Possessive re) = foldr (\elt s -> case Set.lookupMin (exec top dir re (Set.singleton elt)) of Nothing -> s Just m -> Set.insert m s) mempty exec top dir (MatchDynamic n) = -- if this hasn't been replaced, match literal exec top dir (MatchChar (== '%') <> mconcat (map (\c -> MatchChar (== c)) (show n))) exec _ _ AssertEnd = Set.filter (\m -> matchOffset m == B.length (matchBytes m)) exec _ _ AssertBeginning = Set.filter (\m -> matchOffset m == 0) exec top _ (AssertPositive dir regex) = Set.filter (\m -> not (null (exec top dir regex (Set.singleton m)))) exec top _ (AssertNegative dir regex) = Set.filter (\m -> null (exec top dir regex (Set.singleton m))) exec _ _ AssertWordBoundary = Set.filter atWordBoundary exec _ Forward MatchAnyChar = mapMatching $ \m -> case U.decode (B.drop (matchOffset m) (matchBytes m)) of Nothing -> m{ matchOffset = - 1} Just (_,n) -> m{ matchOffset = matchOffset m + n } exec _ Backward MatchAnyChar = mapMatching $ \m -> case lastCharOffset (matchBytes m) (matchOffset m) of Nothing -> m{ matchOffset = -1 } Just off -> m{ matchOffset = off } exec _ Forward (MatchChar f) = mapMatching $ \m -> case U.decode (B.drop (matchOffset m) (matchBytes m)) of Just (c,n) | f c -> m{ matchOffset = matchOffset m + n } _ -> m{ matchOffset = -1 } exec _ Backward (MatchChar f) = mapMatching $ \m -> case lastCharOffset (matchBytes m) (matchOffset m) of Nothing -> m{ matchOffset = -1 } Just off -> case U.decode (B.drop off (matchBytes m)) of Just (c,_) | f c -> m{ matchOffset = off } _ -> m{ matchOffset = -1 } exec top dir (MatchConcat (MatchConcat r1 r2) r3) = exec top dir (MatchConcat r1 (MatchConcat r2 r3)) exec top Forward (MatchConcat (Lazy r1) r2) = Set.foldl Set.union mempty . Set.map (\m -> let ms1 = exec top Forward r1 (Set.singleton m) in if Set.null ms1 then ms1 else go ms1) where go ms = case Set.lookupMax ms of -- find shortest match Nothing -> Set.empty Just m' -> let s' = exec top Forward r2 (Set.singleton m') in if Set.null s' then go (Set.delete m' ms) else s' exec top Forward (MatchConcat r1 r2) = -- TODO longest match first \ms -> let ms1 = exec top Forward r1 ms in if Set.null ms1 then ms1 else exec top Forward r2 (prune ms1) exec top Backward (MatchConcat r1 r2) = exec top Backward r1 . exec top Backward r2 exec top dir (MatchAlt r1 r2) = \ms -> exec top dir r1 ms <> exec top dir r2 ms exec top dir (MatchSome re) = go where go ms = case exec top dir re ms of ms' | Set.null ms' -> Set.empty | ms' == ms -> ms | otherwise -> let ms'' = prune ms' in ms'' <> go ms'' exec top dir (MatchCapture i re) = Set.foldr Set.union Set.empty . Set.map (\m -> Set.map (captureDifference m) (exec top dir re (Set.singleton m))) where captureDifference m m' = let len = matchOffset m' - matchOffset m in m'{ matchCaptures = M.insert i (matchOffset m, len) (matchCaptures m') } exec _ dir (MatchCaptured n) = mapMatching matchCaptured where matchCaptured m = case M.lookup n (matchCaptures m) of Just (offset, len) -> let capture = B.take len $ B.drop offset $ matchBytes m in case dir of Forward | B.isPrefixOf capture (B.drop (matchOffset m) (matchBytes m)) -> m{ matchOffset = matchOffset m + B.length capture } Backward | B.isSuffixOf capture (B.take (matchOffset m) (matchBytes m)) -> m{ matchOffset = matchOffset m - B.length capture } _ -> m{ matchOffset = -1 } Nothing -> m{ matchOffset = -1 } exec top dir Recurse = \ms -> if Set.null ms then ms else exec top dir top ms atWordBoundary :: Match -> Bool atWordBoundary m = case matchOffset m of 0 -> True n | n == B.length (matchBytes m) -> True | otherwise -> case lastCharOffset (matchBytes m) (matchOffset m) of Nothing -> True Just off -> case U.toString (B.drop (off - 1) (matchBytes m)) of (prev:cur:next:_) -> (isWordChar cur /= isWordChar next) || (isWordChar cur /= isWordChar prev) _ -> True lastCharOffset :: ByteString -> Int -> Maybe Int lastCharOffset _ 0 = Nothing lastCharOffset _ 1 = Nothing lastCharOffset bs n = case B.index bs (n - 2) of w | w < 0b10000000 -> Just (n - 1) | w >= 0b11000000 -> Just (n - 1) | otherwise -> lastCharOffset bs (n - 1) -- | Match a Regex against a (presumed UTF-8 encoded) ByteString, -- returning the matched text and a map of (offset, size) -- pairs for captures. Note that all matches are from the -- beginning of the string (a @^@ anchor is implicit). Note -- also that to avoid pathological performance in certain cases, -- the matcher is limited to considering 2000 possible matches -- at a time; when that threshold is reached, it discards -- smaller matches. Hence certain regexes may incorrectly fail to -- match: e.g. @a*a{3000}$@ on a string of 3000 @a@s. matchRegex :: Regex -> ByteString -> Maybe (ByteString, M.IntMap (Int, Int)) matchRegex re bs = toResult <$> Set.lookupMin (exec re Forward re (Set.singleton (Match bs 0 M.empty))) where toResult m = (B.take (matchOffset m) (matchBytes m), (matchCaptures m))