{-# OPTIONS_GHC -fglasgow-exts -fno-warn-unused-binds #-} module Text.Regex.DFA.Transitions(noLoop,topNoLoop,starTrans,simplify ,canMatchNull,cannotMatchNull,stateAn) where import Text.Regex.DFA.Pattern import Control.Monad.Writer import Control.Monad.State import Data.Maybe import Data.List -- -- -- Transformations on Pattern -- die fn = error ("Text.Regex.DFA.Transitions "++fn++" failed"++show pIn) -- | Change a pattern so it will not loop in the DFA construction. -- Previously have used starTrans on the Pattern. The problems comes -- from PStar applied to a pattern which may match 0 characters. This -- *will* return a safe pattern, which can match null if and only if -- the input pattern could. noLoop :: Pattern -> Pattern noLoop = simplify . topNoLoop . simplify . starTrans -- | topNoLoop descends, looking for PStar, for which it calls -- breakLoop. This *will* return a safe pattern, which can match null -- "if" and "only if" the input pattern could. topNoLoop :: Pattern -> Pattern topNoLoop pIn = case pIn of POr ps -> POr (map topNoLoop ps) -- depend on "iff" PConcat ps -> PConcat (map topNoLoop ps) -- depend on "iff" PStar p -> if cannotMatchNull p then PStar (topNoLoop p) -- depend on "only if" to be safe else case breakLoop p of Nothing -> topNoLoop p -- depend on "if", remove PStar Just q -> PStar q -- depend on breakLoop to be safe _ -> pIn -- these cannot hide PStar, and so are safe -- | This is called for patterns that can match null, and which must -- be changed so they cannot match null. This may fail and return -- Nothing, or may succeed and return (Just Pattern) which *will* be -- both safe and unable to match null. breakLoop :: Pattern -> Maybe Pattern breakLoop pIn = case pIn of -- We know at least one of ps canMatchNull. It must be fixed or -- removed. POr ps -> let act p = if cannotMatchNull p then Just (topNoLoop p) -- depend on "only if" else breakLoop p -- try to fix this branch ps' = mapMaybe act $ ps -- destroy unfixable branches -- todo move unfixable branches to parallel position. in case ps' of [] -> Nothing [p] -> Just p _ -> Just (POr ps') -- We know every one of ps canMatchNull. This will likely -- replicate the PConcat into many slightly different branches of -- a POr. This is because we only need to fix one of the items in -- order to make a good branch. PConcat [] -> Nothing -- safe to be destroyed PConcat ps -> let act [] = [[]] act (q:qs) = let mr = breakLoop q rs = map topNoLoop qs q' = topNoLoop q qs' = act qs in case mr of Nothing -> map (q':) qs' Just r' -> (map (q':) qs') ++ [r':rs] -- The first item can match null, but later ones cannot -- if *any* nullable piece can be fixed, then this procedure works in case tail . map PConcat . act $ ps of [] -> Nothing -- all failed. probably not ok if had been simplify'd -- todo: return topNoLoop pIn to be moved to parallel position [pC] -> Just pC -- one piece was fixed. pCs -> Just (POr pCs) PStar p -> if cannotMatchNull p then let p' = topNoLoop p in Just (PConcat [p',PStar p']) -- depend "only if" else case breakLoop p of Nothing -> Nothing -- FAILED Just q -> Just (PConcat [q,PStar q]) PEmpty -> Nothing PCarat {} -> Nothing PDollar {} -> Nothing _ -> Just pIn -- These cannot match null and so are safe, and cannot hide PStar -- | starTrans replaces PQuest,PPlus,PBound,PGroup with combinations of -- PEmpty,POr,PConcat,PStar. Malformed limits on the PBound will be -- replaced with PEmpty rather than calling error. This will also -- simplify the resulting Pattern as it works. starTrans :: Pattern -> Pattern starTrans = dfsPattern starTrans' starTrans' :: Pattern -> Pattern starTrans' pIn = case pIn of -- Eliminated PGroup _ p -> p PQuest p -> quest' p PPlus p -> PConcat [p,PStar p] PBound i _ _ | i<0 -> PEmpty PBound i (Just j) _ | i>j -> PEmpty PBound i Nothing p -> PConcat $ apply (p:) i [PStar p] PBound 0 (Just 0) _ -> PEmpty PBound 0 (Just 1) p -> quest' p PBound 0 (Just j) p -> apply (quest' . (concat' p)) (pred j) (quest' p) PBound i (Just j) p | i == j -> PConcat (replicate i p) | otherwise -> PConcat $ apply (p:) i [starTrans' $ PBound 0 (Just (j-i)) p] -- Left intact PEmpty -> pIn PStar {} -> pIn POr {} -> pIn PConcat {} -> pIn PCarat {} -> pIn PDollar {} -> pIn PDot {} -> pIn PAny {} -> pIn PAnyNot {} -> pIn PEscape {} -> pIn PChar {} -> pIn where quest' = (\p -> POr [p,PEmpty]) concat' a b = PConcat [a,b] apply f n x = foldr ($) x (replicate n f) simplify :: Pattern -> Pattern simplify = dfsPattern simplify' -- | Apply a Pattern transfomation function depth first dfsPattern :: (Pattern -> Pattern) -- ^ The transformation function -> Pattern -- ^ The Pattern to transform -> Pattern -- ^ The transformed Pattern dfsPattern f = dfs where unary c = f . c . dfs dfs pattern = case pattern of POr ps -> f (POr (map dfs ps)) PConcat ps -> f (PConcat (map dfs ps)) PGroup i p -> unary (PGroup i) p PQuest p -> unary PQuest p PPlus p -> unary PPlus p PStar p -> unary PStar p PBound i mi p -> unary (PBound i mi) p _ -> f pattern -- | Function to transform a pattern into an equivalent, but less -- redundant form. Nested 'POr' and 'PConcat' are flattened. simplify' :: Pattern -> Pattern simplify' x@(POr _) = let ps' = case span notPEmpty (flatten x) of (notEmpty,[]) -> notEmpty (notEmpty,_:rest) -> notEmpty ++ (PEmpty:filter notPEmpty rest) -- keep 1st PEmpty only in case ps' of [] -> PEmpty [p] -> p _ -> POr ps' simplify' x@(PConcat _) = let ps' = filter notPEmpty (flatten x) in case ps' of [] -> PEmpty [p] -> p _ -> PConcat ps' simplify' (PBound _ (Just 0) _) = PEmpty simplify' (PStar PEmpty) = PEmpty simplify' other = other -- | Function to flatten nested POr or nested PConcat applicataions. -- Other patterns are returned unchanged flatten :: Pattern -> [Pattern] flatten (POr ps) = (concatMap (\x -> case x of POr ps' -> ps' p -> [p]) ps) flatten (PConcat ps) = (concatMap (\x -> case x of PConcat ps' -> ps' p -> [p]) ps) flatten _ = error "flatten can only be applied to POr or PConcat" notPEmpty :: Pattern -> Bool notPEmpty PEmpty = False notPEmpty _ = True -- -- Analyze Pattern {- -- | This provides an unordered list of the PatternIndex values that -- have back references in the pattern. This does not mean the -- pattern will have these captured substrings, just that the pattern -- referes to these indices. backReferences :: Pattern -> [PatternIndex] backReferences = foldPattern f [] where f (PBack x) xs = (x:xs) f _ xs = xs -} foldPattern :: (Pattern -> a -> a) -> a -> Pattern -> a foldPattern f = foldP where foldP a pIn = let unary p = f pIn (f p a) in case pIn of POr ps -> f pIn (foldr f a ps) PConcat ps -> f pIn (foldr f a ps) PGroup _ p -> unary p PQuest p -> unary p PPlus p -> unary p PStar p -> unary p PBound _ _ p -> unary p _ -> f pIn a -- | Determines if pIn will always accept [] and never accept any characters -- Treat PCarat and PDollar as False, since they do not always accept [] alwaysOnlyMatchNull :: Pattern -> Bool alwaysOnlyMatchNull pIn = case pIn of PEmpty -> True PGroup _ p -> alwaysOnlyMatchNull p POr [] -> True POr ps -> all alwaysOnlyMatchNull ps PConcat [] -> True PConcat ps -> all alwaysOnlyMatchNull ps PQuest p -> alwaysOnlyMatchNull p PPlus p -> alwaysOnlyMatchNull p PStar p -> alwaysOnlyMatchNull p PBound _ (Just 0) _ -> True PBound _ _ p -> alwaysOnlyMatchNull p PCarat _ -> False PDollar _ -> False _ ->False canMatchNull,cannotMatchNull :: Pattern -> Bool canMatchNull = not . cannotMatchNull -- | If 'cannotMatchNull' returns 'True' then it is known that the -- 'Pattern' will never accept an empty string. If 'cannotMatchNull' -- returns 'False' then it is possible but not definite that the -- 'Pattern' could accept an empty string. cannotMatchNull pIn = case pIn of PEmpty -> False PGroup _ p -> cannotMatchNull p POr [] -> False POr ps -> all cannotMatchNull ps PConcat [] -> False PConcat ps -> any cannotMatchNull ps PQuest _ -> False PPlus p -> cannotMatchNull p PStar _ -> False PBound 0 _ _ -> False PBound _ _ p -> cannotMatchNull p PCarat _ -> False PDollar _ -> False _ -> True -- | Determines if pIn is always anchored at the front with PCarat hasFrontCarat,hasBackDollar::Pattern -> Bool hasFrontCarat pIn = case pIn of PCarat _ -> True POr [] -> False POr ps -> all hasFrontCarat ps PConcat [] -> False PConcat ps -> case dropWhile alwaysOnlyMatchNull ps of [] -> False (p:_) -> hasFrontCarat p _ -> False -- | Determines if pIn is always anchored at the back with PDollar hasBackDollar pIn = case pIn of PDollar _ -> True POr [] -> False POr ps -> all hasBackDollar ps PConcat [] -> False PConcat ps -> case dropWhile alwaysOnlyMatchNull (reverse ps) of [] -> False (p:_) -> hasBackDollar p _ -> False -- | I did this overly cleverly. It descends the Pattern depth first -- and computes what is nullable along with creating id numbers for -- the DFA states. This works, but amusingly required mdo. The main -- output is funnel out via the tell to the WriterT and is the NFA -- state before, the DoPa index of the transition character, stateAn :: Pattern -> [(Int,DoPa,Int)] stateAn pIn = evalState (execWriterT (descend1 0 pIn)) 0 uniq :: WriterT [(Int,DoPa,Int)] (State Int) Int uniq = do s <- get let s' = succ s put $! s' return s' descend1 :: Int -> Pattern -> WriterT [(Int,DoPa,Int)] (State Int) (Bool,Int) descend1 a pIn = case pIn of PEmpty -> return (True,a) PGroup _ p -> descend1 a p POr ps -> mdo foo <- mapM (descend2 a b) ps let n = any fst foo b <- if n then return a else uniq return (n,b) PConcat [] -> return (True,a) PConcat (p:ps) -> mdo foo <- descend2 a b p let n = fst foo b <- if n then return a else uniq rest <- descend1 b (PConcat ps) return (n && fst rest,snd rest) PStar p -> do descend2 a a p return (True,a) PCarat _ -> return (True,a) PDollar _ -> return (True,a) PDot d -> one d PAny d _ -> one d PAnyNot d _ -> one d PEscape d _ -> one d PChar d _ -> one d _ -> undefined where one d = do b <- uniq tell [(a,d,b)] return (False,b) descend2 :: Int -> Int -> Pattern -> WriterT [(Int,DoPa,Int)] (State Int) (Bool,Int) descend2 a b pIn = case pIn of PEmpty -> return (True,b) PGroup _ p -> descend2 a b p POr ps -> mdo foo <- mapM (descend2 a b) ps let n = any fst foo return (n,b) PConcat [] -> return (True,b) PConcat (p:ps) -> mdo foo <- descend2 a c1 p let n1 = fst foo c1 <- if n1 then return a else uniq rest <- descend2 c1 b (PConcat ps) let n2 = fst rest return (n1 && n2,snd rest) {- PConcat (p:ps) -> mdo foo <- descend2 a c p let n1 = fst foo rest <- descend2 c b (PConcat ps) let n2 = fst rest c <- if n1 then return a else if n2 then return b else uniq return (n1 && n2,snd rest) -} PStar p -> do descend2 a a p return (True,b) PCarat _ -> return (True,b) PDollar _ -> return (True,b) PDot d -> one d PAny d _ -> one d PAnyNot d _ -> one d PEscape d _ -> one d PChar d _ -> one d _ -> undefined where one d = do tell [(a,d,b)] return (False,b)