-- | This "Text.Regex.TDFA.Pattern" module provides the 'Pattern' data -- type and its subtypes. This 'Pattern' type is used to represent -- the parsed form of a Regular Expression. module Text.Regex.TDFA.Pattern (Pattern(..) ,PatternSet(..) ,PatternSetCharacterClass(..) ,PatternSetCollatingElement(..) ,PatternSetEquivalenceClass(..) ,GroupIndex ,DoPa(..) ,showPattern -- ** Internal use ,starTrans -- ** Internal use, Operations to support debugging under ghci ,starTrans',simplify',dfsPattern ) where {- By Chris Kuklewicz, 2007. BSD License, see the LICENSE file. -} import Data.List(intersperse,partition) import qualified Data.Set as Set(toAscList,toList) import Data.Set(Set) -- XXX EnumSet import Text.Regex.TDFA.Common(DoPa(..),GroupIndex,common_error) err :: String -> a err = common_error "Text.Regex.TDFA.Pattern" -- | Pattern is the type returned by the regular expression parser. -- This is consumed by the CorePattern module and the tender leaves -- are nibbled by the TNFA module. data Pattern = PEmpty | PGroup (Maybe GroupIndex) Pattern -- Nothing to indicate non-matching PGroup (Nothing never used!) | POr [Pattern] -- flattened by starTrans | PConcat [Pattern] -- flattened by starTrans | PQuest Pattern -- eliminated by starTrans | PPlus Pattern -- eliminated by starTrans | PStar Bool Pattern -- True means mayFirstBeNull is True | PBound Int (Maybe Int) Pattern -- eliminated by starTrans -- The rest of these need an index of where in the regex string it is from | PCarat {getDoPa::DoPa} | PDollar {getDoPa::DoPa} -- The following test and accept a single character | PDot {getDoPa::DoPa} -- Any character (newline?) at all | PAny {getDoPa::DoPa,getPatternSet::PatternSet} -- Square bracketed things | PAnyNot {getDoPa::DoPa,getPatternSet::PatternSet} -- Inverted square bracketed things | PEscape {getDoPa::DoPa,getPatternChar::Char} -- Backslashed Character | PChar {getDoPa::DoPa,getPatternChar::Char} -- Specific Character -- The following are semantic tags created in starTrans, not the parser | PNonCapture Pattern -- introduced by starTrans | PNonEmpty Pattern -- introduced by starTrans deriving (Eq,Show) -- | I have not been checking, but this should have the property that -- parsing the resulting string should result in an identical Pattern. -- This is not true if starTrans has created PNonCapture and PNonEmpty -- values or a (PStar False). The contents of a "[ ]" grouping are -- always shown in a sorted canonical order. showPattern :: Pattern -> String showPattern pIn = case pIn of PEmpty -> "()" PGroup _ p -> paren (showPattern p) POr ps -> concat $ intersperse "|" (map showPattern ps) PConcat ps -> concatMap showPattern ps PQuest p -> (showPattern p)++"?" PPlus p -> (showPattern p)++"+" -- If PStar has mayFirstBeNull False then reparsing will forget this flag PStar _ p -> (showPattern p)++"*" PBound i (Just j) p | i==j -> showPattern p ++ ('{':show i)++"}" PBound i mj p -> showPattern p ++ ('{':show i) ++ maybe ",}" (\j -> ',':show j++"}") mj -- PCarat _ -> "^" PDollar _ -> "$" PDot _ -> "." PAny _ ps -> ('[':show ps)++"]" PAnyNot _ ps -> ('[':'^':show ps)++"]" PEscape _ c -> '\\':c:[] PChar _ c -> [c] -- The following were not directly from the parser, and will not be parsed in properly PNonCapture p -> showPattern p PNonEmpty p -> showPattern p where {- groupRange x n (y:ys) = if (fromEnum y)-(fromEnum x) == n then groupRange x (succ n) ys else (if n <=3 then take n [x..] else x:'-':(toEnum (pred n+fromEnum x)):[]) ++ groupRange y 1 ys groupRange x n [] = if n <=3 then take n [x..] else x:'-':(toEnum (pred n+fromEnum x)):[] -} paren s = ('(':s)++")" data PatternSet = PatternSet (Maybe (Set Char)) (Maybe (Set PatternSetCharacterClass)) (Maybe (Set PatternSetCollatingElement)) (Maybe (Set PatternSetEquivalenceClass)) deriving (Eq) instance Show PatternSet where showsPrec i (PatternSet s scc sce sec) = let (special,normal) = maybe ("","") ((partition (`elem` "]-")) . Set.toAscList) s charSpec = (if ']' `elem` special then (']':) else id) (byRange normal) scc' = maybe "" ((concatMap show) . Set.toList) scc sce' = maybe "" ((concatMap show) . Set.toList) sce sec' = maybe "" ((concatMap show) . Set.toList) sec in shows charSpec . showsPrec i scc' . showsPrec i sce' . showsPrec i sec' . if '-' `elem` special then showChar '-' else id where byRange xAll@(x:xs) | length xAll <=3 = xAll | otherwise = groupRange x 1 xs byRange _ = undefined groupRange x n (y:ys) = if (fromEnum y)-(fromEnum x) == n then groupRange x (succ n) ys else (if n <=3 then take n [x..] else x:'-':(toEnum (pred n+fromEnum x)):[]) ++ groupRange y 1 ys groupRange x n [] = if n <=3 then take n [x..] else x:'-':(toEnum (pred n+fromEnum x)):[] newtype PatternSetCharacterClass = PatternSetCharacterClass {unSCC::String} deriving (Eq,Ord) newtype PatternSetCollatingElement = PatternSetCollatingElement {unSCE::String} deriving (Eq,Ord) newtype PatternSetEquivalenceClass = PatternSetEquivalenceClass {unSEC::String} deriving (Eq,Ord) instance Show PatternSetCharacterClass where showsPrec _ p = showChar '[' . showChar ':' . shows (unSCC p) . showChar ':' . showChar ']' instance Show PatternSetCollatingElement where showsPrec _ p = showChar '[' . showChar '.' . shows (unSCE p) . showChar '.' . showChar ']' instance Show PatternSetEquivalenceClass where showsPrec _ p = showChar '[' . showChar '=' . shows (unSEC p) . showChar '=' . showChar ']' -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- == -- | Do the transformation and simplification in a single traversal. -- This removes the PPlus, PQuest, and PBound values, changing to POr -- and PEmpty and PStar True/False. For some PBound values it adds -- PNonEmpty and PNonCapture semantic marker. It also simplifies to -- flatten out nested POr and PConcat instances and eliminate some -- uneeded PEmpty values. starTrans :: Pattern -> Pattern starTrans = dfsPattern (simplify' . starTrans') -- | Apply a Pattern transfomation function depth first dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function -> Pattern -- ^ The Pattern to transform -> Pattern -- ^ The transformed Pattern dfsPattern f = dfs where unary c = f . c . dfs dfs pattern = case pattern of POr ps -> f (POr (map dfs ps)) PConcat ps -> f (PConcat (map dfs ps)) PGroup i p -> unary (PGroup i) p PQuest p -> unary PQuest p PPlus p -> unary PPlus p PStar i p -> unary (PStar i) p PBound i mi p -> unary (PBound i mi) p _ -> f pattern {- Replace by PNonCapture unCapture = dfsPattern unCapture' where unCapture' (PGroup (Just _) p) = PGroup Nothing p unCapture' x = x -} reGroup p@(PConcat xs) | 2 <= length xs = PGroup Nothing p reGroup p@(POr xs) | 2 <= length xs = PGroup Nothing p reGroup p = p starTrans' :: Pattern -> Pattern starTrans' pIn = case pIn of -- We know that "p" has been simplified in each of these cases: PQuest p -> POr [p,PEmpty] {- The PStar should not capture 0 characters on its first iteration, so set its mayFirstBeNull flag to False -} PPlus p | canOnlyMatchNull p -> p | otherwise -> asGroup $ PConcat [reGroup p,PStar False p] {- "An ERE matching a single character repeated by an '*' , '?' , or an interval expression shall not match a null expression unless this is the only match for the repetition or it is necessary to satisfy the exact or minimum number of occurrences for the interval expression." -} {- p? is p|PEmpty which prefers even a 0-character match for p p{0,1} is p? is POr [p,PEmpty] p{0,2} is (pp?)? NOT p?p? p{0,3} is (p(pp?)?)? p{1,2} is like pp{0,1} is like pp? but see below p{2,5} is ppp{0,3} is pp(p(pp?)?)? But this is not always right. Because if the second use of p in p?p? matches 0 characters then the perhaps non 0 character match of the first p is overwritten. We need a new operation "p!" that means "p?" unless "p" match 0 characters, in which case skip p as if it failed in "p?". Thus when p cannot accept 0 characters p! and p? are equivalent. And when p can only match 0 characters p! is PEmpty. So for simplicity, only use ! when p can match 0 characters but not only 0 characters. Call this (PNonEmpty p) in the Pattern type. p! is PNonEmpty p is POr [PEmpty,p] IS THIS TRUE? Use QuickCheck? Note that if p cannot match 0 characters then p! is p? and vice versa The p{0,1} is still always p? and POr [p,PEmpty] Now p{0,2} means p?p! or (pp!)? and p{0,3} means (p(pp!)!)? or p?p!p! Equivalently p?p! and p?p!p! And p{2,2} is p'p and p{3,3} is p'p'p and p{4} is p'p'p'p The p{1,2} is pp! and p{1,3} is pp!p! or p(pp!)! And p{2,4} means p'pp!p! and p{3,6} is p'p'pp!p!p! or p'p'p(p(pp!)!)! But this second form still has a problem: the (pp!)! can have the first p match 0 and the second p match non-zero. This showed up for (.|$){1,3} since ($.!)! should not be a valid path but altered the qt_win commands. Thus only p'p'pp!p!p! has the right semantics. For completeness: if p can only match only 0 characters then the cases are p{0,0} is (), p{0,_} = p?, p{_,_} is p if p can match 0 or non-zero characters then cases are p{0,0} is (), p{0,1} is (p)?, p{0,2} is (pp!)?, p{0,3} is (pp!p!)? p{1,1} is p, p{1,2} is pp!, p{1,3} is pp!p!, p{1,4} is pp!p!p! p{2,2} is p'p, p{2,3} is p'pp!, p{2,4} is p'pp!p! or p'p(pp!)! p{2,5} is p'pp!p!p! or p'p(p(pp!)!)! p{3,3} is p'p'p, p{3,4} is p'p'pp!, p{3,5} is p'p'pp!p!, p{3,6} is p'p'pp!p!p! if p can only match 1 or more characters then cases are p{0,0} is () p{0,1} is p?, p{0,2} is (pp?)?, p{0,3} is (p(pp?)?)?, p{0,4} is (pp{0,3})? p{1,1} is p, p{1,j} is pp{0,pred j} p{2,2} is p'p, p{2,3} is p'pp?, p{2,4} is p'p(pp?)?, p{2,5} = p'p{1,4} = p'(pp{0,3}) p{3,3} is p'p'p, p{3,4} is p'p'pp?, p{3,5} is p'p'p(pp?)?, p{3,6} is And by this logic, the PStar False is really p*! So p{0,} is p* and p{1,} is pp*! and p{2,} is p'pp*! and p{3,} is p'p'pp*! The (nonEmpty' p) below is the only way PNonEmpty is introduced into the Pattern. It is always preceded by p inside a PConcat list. The p involved never simplifies to PEmpty. Thus it is impossible to have PNonEmpty directly nested, i.e. (PNonEmpty (PNonEmpty _)) never occurs even after simplifications. The (nonCapture' p) below is the only way PNonCapture is introduced into the Pattern. It is always followed by p inside a PConcat list. -} -- Easy cases PBound i _ _ | i<0 -> PEmpty -- impossibly malformed PBound i (Just j) _ | i>j -> PEmpty -- impossibly malformed PBound _ (Just 0) _ -> PEmpty -- Medium cases PBound 0 Nothing p | canOnlyMatchNull p -> quest p | otherwise -> PStar True p PBound 0 (Just 1) p -> quest p -- Hard cases PBound i Nothing p | canOnlyMatchNull p -> p | otherwise -> asGroup . PConcat $ apply (nc'p:) (pred i) [reGroup p,PStar False p] where nc'p = nonCapture' p PBound 0 (Just j) p | canOnlyMatchNull p -> quest p -- The first operation is quest NOT nonEmpty. This can be tested with -- "a\nb" "((^)?|b){0,3}" and "a\nb" "((^)|b){0,3}" | otherwise -> quest . (concat' p) $ apply (nonEmpty' . (concat' p)) (j-2) (nonEmpty' p) {- 0.99.6 remove | cannotMatchNull p -> apply (quest' . (concat' p)) (pred j) (quest' p) | otherwise -> POr [ simplify' (PConcat (p : replicate (pred j) (nonEmpty' p))) , PEmpty ] -} {- 0.99.6 add, 0.99.7 remove PBound i (Just j) p | canOnlyMatchNull p -> p | i == j -> PConcat $ apply (p':) (pred i) [p] | otherwise -> PConcat $ apply (p':) (pred i) [p,apply (nonEmpty' . (concat' p)) (j-i-1) (nonEmpty' p) ] where p' = nonCapture' p -} {- 0.99.7 add -} PBound i (Just j) p | canOnlyMatchNull p -> p | i == j -> asGroup . PConcat $ apply (nc'p:) (pred i) [reGroup p] | otherwise -> asGroup . PConcat $ apply (nc'p:) (pred i) [reGroup p,apply (nonEmpty' . (concat' p)) (j-i-1) (ne'p) ] where nc'p = nonCapture' p ne'p = nonEmpty' p {- 0.99.6 | cannotMatchNull p -> PConcat $ apply (p':) (pred i) $ (p:) $ [apply (quest' . (concat' p)) (pred (j-i)) (quest' p)] | otherwise -> PConcat $ (replicate (pred i) p') ++ p : (replicate (j-i) (nonEmpty' p)) -} PStar mayFirstBeNull p | canOnlyMatchNull p -> if mayFirstBeNull then quest p else PEmpty | otherwise -> pass -- Left intact PEmpty -> pass PGroup {} -> pass POr {} -> pass PConcat {} -> pass PCarat {} -> pass PDollar {} -> pass PDot {} -> pass PAny {} -> pass PAnyNot {} -> pass PEscape {} -> pass PChar {} -> pass PNonCapture {} -> pass PNonEmpty {} -> pass -- TODO : remove PNonEmpty from program where quest = (\ p -> POr [p,PEmpty]) -- require p to have been simplified -- quest' = (\ p -> simplify' $ POr [p,PEmpty]) -- require p to have been simplified concat' a b = simplify' $ PConcat [reGroup a,reGroup b] -- require a and b to have been simplified nonEmpty' = (\ p -> simplify' $ POr [PEmpty,p]) -- 2009-01-19 : this was PNonEmpty nonCapture' = PNonCapture apply f n x = foldr ($) x (replicate n f) -- function f applied n times to x : f^n(x) asGroup p = PGroup Nothing (simplify' p) pass = pIn -- | Function to transform a pattern into an equivalent, but less -- redundant form. Nested 'POr' and 'PConcat' are flattened. PEmpty -- is propagated. simplify' :: Pattern -> Pattern simplify' x@(POr _) = let ps' = case span notPEmpty (flatten x) of (notEmpty,[]) -> notEmpty (notEmpty,_:rest) -> notEmpty ++ (PEmpty:filter notPEmpty rest) -- keep 1st PEmpty only in case ps' of [] -> PEmpty [p] -> p _ -> POr ps' simplify' x@(PConcat _) = let ps' = filter notPEmpty (flatten x) in case ps' of [] -> PEmpty [p] -> p _ -> PConcat ps' -- PConcat ps' simplify' (PStar _ PEmpty) = PEmpty simplify' (PNonCapture PEmpty) = PEmpty -- 2009, perhaps useful --simplify' (PNonEmpty PEmpty) = err "simplify' (PNonEmpty PEmpty) = should be Impossible!" -- 2009 simplify' other = other -- | Function to flatten nested POr or nested PConcat applicataions. flatten :: Pattern -> [Pattern] flatten (POr ps) = (concatMap (\x -> case x of POr ps' -> ps' p -> [p]) ps) flatten (PConcat ps) = (concatMap (\x -> case x of PConcat ps' -> ps' p -> [p]) ps) flatten _ = err "flatten can only be applied to POr or PConcat" notPEmpty :: Pattern -> Bool notPEmpty PEmpty = False notPEmpty _ = True -- | Determines if pIn will fail or accept [] and never accept any -- characters. Treat PCarat and PDollar as True. canOnlyMatchNull :: Pattern -> Bool canOnlyMatchNull pIn = case pIn of PEmpty -> True PGroup _ p -> canOnlyMatchNull p POr ps -> all canOnlyMatchNull ps PConcat ps -> all canOnlyMatchNull ps PQuest p -> canOnlyMatchNull p PPlus p -> canOnlyMatchNull p PStar _ p -> canOnlyMatchNull p PBound _ (Just 0) _ -> True PBound _ _ p -> canOnlyMatchNull p PCarat _ -> True PDollar _ -> True PNonCapture p -> canOnlyMatchNull p -- PNonEmpty p -> canOnlyMatchNull p -- like PQuest _ ->False {- -- | If 'cannotMatchNull' returns 'True' then it is known that the -- 'Pattern' will never accept an empty string. If 'cannotMatchNull' -- returns 'False' then it is possible but not definite that the -- 'Pattern' could accept an empty string. cannotMatchNull :: Pattern -> Bool cannotMatchNull pIn = case pIn of PEmpty -> False PGroup _ p -> cannotMatchNull p POr [] -> False POr ps -> all cannotMatchNull ps PConcat [] -> False PConcat ps -> any cannotMatchNull ps PQuest _ -> False PPlus p -> cannotMatchNull p PStar {} -> False PBound 0 _ _ -> False PBound _ _ p -> cannotMatchNull p PCarat _ -> False PDollar _ -> False PNonCapture p -> cannotMatchNull p -- PNonEmpty _ -> False -- like PQuest _ -> True -}