-- -- Copyright (c) 2013-2023 Nicola Bonelli -- -- This program is free software; you can redistribute it and/or modify -- it under the terms of the GNU General Public License as published by -- the Free Software Foundation; either version 2 of the License, or -- (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; if not, write to the Free Software -- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -- module CGrep.Parser.Atom (Atom(..), Atoms, mkAtomFromToken, combineAtoms, filterTokensWithAtoms, wildCardMap, wildCardMatch, wildCardsMatch) where import qualified Data.Map as M import CGrep.Common ( trim, trim8 ) import CGrep.Distance ( (~==) ) import CGrep.Parser.Char ( isDigit ) import Data.List ( isSuffixOf, findIndices, isInfixOf, isPrefixOf, subsequences ) import Options ( Options(edit_dist, word_match, prefix_match, suffix_match) ) import Util ( spanGroup, rmQuote8 ) import qualified CGrep.Parser.Token as T import qualified Data.ByteString.Char8 as C import qualified CGrep.Parser.Chunk as T data Atom = Any | Keyword | Number | Oct | Hex | String | Literal | Identifier C.ByteString | Raw T.Token deriving stock (Eq, Ord, Show) type Atoms = [Atom] wildCardMap :: M.Map C.ByteString Atom wildCardMap = M.fromList [ ("ANY", Any ), ("KEY", Keyword ), ("OCT", Oct ), ("HEX", Hex ), ("NUM", Number ), ("STR", String ), ("LIT", String ) ] mkAtomFromToken :: T.Token -> Atom mkAtomFromToken t | T.isTokenIdentifier t = case () of _ | Just wc <- M.lookup str wildCardMap -> wc | isAtomIdentifier str -> Identifier str | otherwise -> Raw $ T.mkTokenIdentifier (rmAtomEscape str) (T.tOffset t) where str = T.tToken t | otherwise = Raw t combineAtoms :: [Atoms] -> [Atoms] combineAtoms (m1:r@(m2:m3:ms)) | [Raw b] <- m2, T.tToken b == "OR" = combineAtoms $ (m1<>m3):ms | otherwise = m1 : combineAtoms r combineAtoms [m1,m2] = [m1,m2] combineAtoms [m1] = [m1] combineAtoms [] = [] {-# INLINE filterTokensWithAtoms #-} filterTokensWithAtoms :: Options -> [Atoms] -> [T.Token] -> [T.Token] filterTokensWithAtoms opt ws ts = go opt (spanOptionalCards ws) ts where go :: Options -> [[Atoms]] -> [T.Token] -> [T.Token] go _ [] _ = [] go opt (g:gs) ts = {-# SCC "atom_find_total" #-} concatMap (take grpLen . (`drop` ts)) ({-# SCC "atom_find_indices" #-} findIndices (wildCardsCompare opt g) grp) <> {-# SCC atom_find_req #-} go opt gs ts where grp = {-# SCC "atomSpanGroup" #-} spanGroup grpLen ts grpLen = length g spanOptionalCards :: [Atoms] -> [[Atoms]] spanOptionalCards wc = map (`filterCardIndices` wc') idx where wc' = zip [0..] wc idx = subsequences $ findIndices (\case [Identifier (C.uncons -> Just ('$', _))] -> True _ -> False) wc filterCardIndices :: [Int] -> [(Int, Atoms)] -> [Atoms] filterCardIndices ns ps = map snd $ filter (\(n, _) -> n `notElem` ns) ps {-# INLINE filterCardIndices #-} wildCardsCompare :: Options -> [Atoms] -> [T.Token] -> Bool wildCardsCompare opt l r = wildCardsCompareAll ts && wildCardsCheckOccurrences ts where ts = wildCardsGroupCompare opt l r {-# INLINE wildCardsCompare #-} isAtomIdentifier :: C.ByteString -> Bool isAtomIdentifier s = if | Just (x, C.uncons -> Just (y, xs)) <- C.uncons s -> wprefix x && isDigit y | Just (x, "") <- C.uncons s -> wprefix x | otherwise -> error "isAtomIdentifier" where wprefix x = x == '$' || x == '_' rmAtomEscape :: C.ByteString -> C.ByteString rmAtomEscape (C.uncons -> Just ('$',xs)) = xs rmAtomEscape (C.uncons -> Just ('_',xs)) = xs rmAtomEscape xs = xs {-# INLINE rmAtomEscape #-} wildCardsCompareAll :: [(Bool, (Atoms, [C.ByteString]))] -> Bool wildCardsCompareAll = all fst {-# INLINE wildCardsCompareAll #-} {-# SCC wildCardsCompareAll #-} -- Note: pattern $ and _ match any token, whereas $1 $2 (_1 _2 etc.) match tokens -- that must compare equal in the respective occurrences wildCardsCheckOccurrences :: [(Bool, (Atoms, [C.ByteString]))] -> Bool wildCardsCheckOccurrences ts = M.foldr (\xs r -> r && all (== head xs) xs) True m where m = M.mapWithKey (\k xs -> case k of [Identifier "_0"] -> xs [Identifier "_1"] -> xs [Identifier "_2"] -> xs [Identifier "_3"] -> xs [Identifier "_4"] -> xs [Identifier "_5"] -> xs [Identifier "_6"] -> xs [Identifier "_7"] -> xs [Identifier "_8"] -> xs [Identifier "_9"] -> xs [Identifier "$0"] -> xs [Identifier "$1"] -> xs [Identifier "$2"] -> xs [Identifier "$3"] -> xs [Identifier "$4"] -> xs [Identifier "$5"] -> xs [Identifier "$6"] -> xs [Identifier "$7"] -> xs [Identifier "$8"] -> xs [Identifier "$9"] -> xs _ -> [] ) $ M.fromListWith (<>) (map snd ts) {-# INLINE wildCardsCheckOccurrences #-} {-# SCC wildCardsCheckOccurrences #-} wildCardsGroupCompare :: Options -> [Atoms] -> [T.Token] -> [(Bool, (Atoms, [C.ByteString]))] wildCardsGroupCompare opt ls rs | length rs >= length ls = zipWith (tokensZip opt) ls rs | otherwise = [ (False, ([Any], [])) ] {-# INLINE wildCardsGroupCompare #-} {-# SCC wildCardsGroupCompare #-} tokensZip :: Options -> Atoms -> T.Token -> (Bool, (Atoms, [C.ByteString])) tokensZip opt l r | wildCardsMatch opt l r = (True, (l, [T.tToken r])) | otherwise = (False, ([Any],[] )) {-# INLINE tokensZip #-} {-# SCC tokensZip #-} wildCardsMatch :: Options -> Atoms -> T.Token -> Bool wildCardsMatch opt m t = any (\w -> wildCardMatch opt w t) m {-# INLINE wildCardsMatch #-} {-# SCC wildCardsMatch #-} {-# SCC wildCardMatch #-} wildCardMatch :: Options -> Atom -> T.Token -> Bool wildCardMatch opt (Raw l) r | T.isTokenIdentifier l && T.isTokenIdentifier r = {-# SCC wildcard_raw_0 #-} if | word_match opt -> T.tToken l == T.tToken r | prefix_match opt -> T.tToken l `C.isPrefixOf` T.tToken r | suffix_match opt -> T.tToken l `C.isSuffixOf` T.tToken r | edit_dist opt -> (C.unpack . T.tToken) l ~== C.unpack (T.tToken r) | otherwise -> T.tToken l `C.isInfixOf` T.tToken r | T.isTokenString l && T.isTokenString r = {-# SCC wildcard_raw_1 #-} if | word_match opt -> ls == rs | prefix_match opt -> ls `C.isPrefixOf` rs | suffix_match opt -> ls `C.isSuffixOf` rs | edit_dist opt -> C.unpack ls ~== C.unpack rs | otherwise -> ls `C.isInfixOf` rs | otherwise = {-# SCC wildcard_raw_2 #-} l `T.eqToken` r where ls = rmQuote8 $ trim8 (T.tToken l) rs = rmQuote8 $ trim8 (T.tToken r) wildCardMatch _ Any _ = {-# SCC wildcard_any #-} True wildCardMatch _ (Identifier _) t = {-# SCC wildcard_identifier #-} T.isTokenIdentifier t wildCardMatch _ Keyword t = {-# SCC wildcard_keyword #-} T.isTokenKeyword t wildCardMatch _ String t = {-# SCC wildcard_string #-} T.isTokenString t wildCardMatch _ Literal t = {-# SCC wildcard_lit #-} T.isTokenString t wildCardMatch _ Number t = {-# SCC wildcard_number #-} T.isTokenNumber t wildCardMatch _ Oct t = {-# SCC wildcard_octal #-} T.isTokenNumber t && case C.uncons (T.tToken t) of Just ('0', C.uncons -> Just (d, _)) -> isDigit d; _ -> False wildCardMatch _ Hex t = {-# SCC wildcard_hex #-} T.isTokenNumber t && case C.uncons (T.tToken t) of Just ('0', C.uncons -> Just ('x',_)) -> True; _ -> False