> {-# LANGUAGE BangPatterns #-} > -- | This module defines the data type of internal regular expression pattern, > -- | as well as the partial derivative operations for regular expression patterns. > module Text.Regex.Deriv.IntPattern > ( Pat(..) > , strip > , Binder > , toBinder > , listifyBinder > , Key(..) > ) > where > import Data.List > import qualified Data.IntMap as IM > import Text.Regex.Deriv.Common (Range(..), range, minRange, maxRange, Letter, PosEpsilon(..), IsEpsilon(..), IsPhi(..), GFlag(..), IsGreedy(..), Simplifiable(..) ) > import Text.Regex.Deriv.RE > import Text.Regex.Deriv.Dictionary (Key(..), primeL, primeR) > import Text.Regex.Deriv.Pretty > -- | regular expression patterns > data Pat = PVar Int [Range] Pat -- ^ variable pattern > | PE [RE] -- ^ pattern without binder > | PPair Pat Pat -- ^ pair pattern > | PChoice [Pat] GFlag -- ^ choice pattern > | PStar Pat GFlag -- ^ star pattern > | PPlus Pat Pat -- ^ plus pattern, it is used internally to indicate that it is unrolled from a PStar > | PEmpty Pat -- ^ empty pattern, it is used intermally to indicate that mkEmpty function has been applied. > {-| The Eq instance for Pat data type > NOTE: We ignore the 'consumed word' when comparing patterns > (ie we only compare the pattern structure). > Essential for later comparisons among patterns. -} > instance Eq Pat where > (==) (PVar x1 _ p1) (PVar x2 _ p2) = (x1 == x2) && (p1 == p2) > (==) (PPair p1 p2) (PPair p1' p2') = (p1 == p1') && (p2 == p2') > (==) (PChoice ps1 g1) (PChoice ps2 g2) = (g1 == g2) && (ps1 == ps2) -- more efficient, because choices are constructed in left-nested > (==) (PPlus p1 p2) (PPlus p1' p2') = (p1 == p1') && (p2 == p2') > (==) (PStar p1 g1) (PStar p2 g2) = (g1 == g2) && (p1 == p2) > (==) (PE rs1) (PE rs2) = rs1 == rs2 > (==) _ _ = False > > instance Pretty a => Pretty [a] where > pretty [] = "{}" > pretty a@(x:xs) = "{" ++ prettyAll ++ "}" > where prettyAll = foldl' (\a i -> a++","++(pretty i)) (pretty x) xs > instance Pretty Pat where > pretty (PVar x1 _ p1) = "(" ++ show x1 ++ ":" ++ pretty p1 ++ ")" > pretty (PPair p1 p2) = "<" ++ pretty p1 ++ "," ++ pretty p2 ++ ">" > pretty (PChoice ps g) = "(" ++ pretty ps ++ ")" ++ (show g) > pretty (PE rs) = "|" ++ show rs ++ "|" > pretty (PPlus p1 p2 ) = "(" ++ pretty p1 ++ "," ++ pretty p2 ++ ")" > pretty (PStar p g) = (pretty p) ++ "*" ++ (show g) > pretty (PEmpty p) = "[" ++ pretty p ++ "]" > instance Show Pat where > show pat = pretty pat > instance Key Pat where > hash (PVar x1 _ p1) = let y1 = head (hash x1) > y2 = head (hash p1) > in y1 `seq` y2 `seq` [ 1 + y1 * primeL + y2 * primeR ] > hash (PPair p1 p2) = let x1 = head (hash p1) > x2 = head (hash p2) > in x1 `seq` x2 `seq` [ 2 + x1 * primeL + x2 * primeR ] > hash (PChoice (p1:p2:_) Greedy) = let x1 = head (hash p1) > x2 = head (hash p2) > in x1 `seq` x2 `seq` [ 4 + x1 * primeL + x2 * primeR ] > hash (PChoice (p1:p2:_) NotGreedy) = let x1 = head (hash p1) > x2 = head (hash p2) > in x1 `seq` x2 `seq` [ 5 + x1 * primeL + x2 * primeR ] > hash (PChoice (p1:_) _) = let x1 = head (hash p1) > > in x1 `seq` [ 5 + x1 * primeL ] > hash (PChoice [] _) = [5] > hash (PPlus p1 p2) = let x1 = head (hash p1) > x2 = head (hash p2) > in x1 `seq` x2 `seq` [ 6 + x1 * primeL + x2 * primeR ] > hash (PStar p Greedy) = let x = head (hash p) > in x `seq` [ 7 + x * primeL ] > hash (PStar p NotGreedy) = let x = head (hash p) > in x `seq` [ 8 + x * primeL ] > hash (PE r) = let x = head (hash r) > in x `seq` [ 9 + x * primeL ] > hash (PEmpty p) = let x = head (hash p) > in x `seq` [ 3 + x * primeL ] > hash p = error ("hash is applied to an unacceptable pattern " ++ (show p)) > -- | function 'strip' strips away the bindings from a pattern > strip :: Pat -> RE > strip (PVar _ w p) = strip p > strip (PE rs) = resToRE rs > strip (PStar p g) = Star (strip p) g > strip (PPair p1 p2) = Seq (strip p1) (strip p2) > strip (PPlus p1 p2) = Seq (strip p1) (strip p2) > strip (PChoice ps g) = Choice (map strip ps) g > strip (PEmpty p) = strip p > -- | function 'mkEmpPat' makes an empty pattern > mkEmpPat :: Pat -> Pat > mkEmpPat (PVar x w p) = PVar x w (mkEmpPat p) > mkEmpPat (PE rs) > | any posEpsilon rs = PE [Empty] > | otherwise = PE [Phi] > mkEmpPat (PStar p g) = PE [Empty] -- problematic?! we are losing binding (x,()) from ( x : a*) ~> PE <> > mkEmpPat (PPlus p1 p2) = mkEmpPat p1 -- since p2 must be pstar we drop it. If we mkEmpPat p2, we need to deal with pdPat (PPlus (x :<>) (PE <>)) l > mkEmpPat (PPair p1 p2) = PPair (mkEmpPat p1) (mkEmpPat p2) > mkEmpPat (PChoice ps g) = PChoice (map mkEmpPat ps) g > -- | function 'getBindingsFrom' transfer bindings from p2 to p1 > getBindingsFrom :: Pat -- ^ the source of the > -> Pat -> Pat > getBindingsFrom p1 p2 = let b = toBinder p2 > in assign p1 b > where assign :: Pat -> Binder -> Pat > assign (PVar x w p) b = > case IM.lookup x b of > Nothing -> let p' = assign p b in PVar x w p' > Just rs -> let > p' = assign p b > in PVar x (w ++ rs) p' > assign (PE r) _ = PE r > assign (PPlus p1 p2) b = PPlus (assign p1 b) p2 -- we don't need to care about p2 since it is a p* > assign (PPair p1 p2) b = PPair (assign p1 b) (assign p2 b) > assign (PChoice ps g) b = PChoice (map (\p -> assign p b) ps) g > -- | Function 'isGreedy' checks whether a pattern is greedy > instance IsGreedy Pat where > isGreedy (PVar _ _ p) = isGreedy p > isGreedy (PE rs) = any isGreedy rs > isGreedy (PPair p1 p2) = isGreedy p1 || isGreedy p2 > isGreedy (PChoice ps Greedy) = True > isGreedy (PChoice ps NotGreedy) = False -- isGreedy p1 || isGreedy p2 > isGreedy (PEmpty p) = False > isGreedy (PStar p Greedy) = True > isGreedy (PStar p NotGreedy) = False > isGreedy (PPlus p p') = isGreedy p || isGreedy p' > -- | The 'Binder' type denotes a set of (pattern var * range) pairs > -- type Binder = [(Int, [Range])] > type Binder = IM.IntMap [Range] > -- | check whether a pattern has binder > hasBinder :: Pat -> Bool > hasBinder (PVar _ _ _) = True > hasBinder (PPair p1 p2) = (hasBinder p1) || (hasBinder p2) > hasBinder (PPlus p1 p2) = hasBinder p1 > hasBinder (PStar p1 g) = hasBinder p1 > hasBinder (PE rs) = False > hasBinder (PChoice ps g) = any hasBinder ps > hasBinder (PEmpty p) = hasBinder p > -- | Function 'toBinder' turns a pattern into a binder > toBinder :: Pat -> Binder > toBinder p = IM.fromList (toBinderList p) > toBinderList :: Pat -> [(Int, [Range])] > toBinderList (PVar i rs p) = [(i, rs)] ++ (toBinderList p) > toBinderList (PPair p1 p2) = (toBinderList p1) ++ (toBinderList p2) > toBinderList (PPlus p1 p2) = (toBinderList p1) > toBinderList (PStar p1 g) = (toBinderList p1) > toBinderList (PE rs) = [] > toBinderList (PChoice ps g) = concatMap toBinderList ps > toBinderList (PEmpty p) = toBinderList p > listifyBinder :: Binder -> [(Int, [Range])] > listifyBinder b = sortBy (\ x y -> compare (fst x) (fst y)) (IM.toList b) > > {-| Function 'updateBinderByIndex' updates a binder given an index to a pattern var > ASSUMPTION: the var index in the pattern is linear. e.g. no ( 0 :: R1, (1 :: R2, 2 :a: R3)) > -} > updateBinderByIndex :: Int > -> Int > -> Binder > -> Binder > updateBinderByIndex i !pos binder = -- binder {- > IM.update (\ r -> case r of -- we always initialize to [], we don't need to handle the key miss case > { (rs_@((Range b e):rs)) -> > let !e' = e + 1 > in case e' of > _ | pos == e' -> Just ((range b e'):rs) > | pos > e' -> Just ((range pos pos):rs_) > | otherwise -> error "impossible, the current letter position is smaller than the last recorded letter" > ; [] -> Just [(range pos pos)] > } ) i binder -- -} > {- > updateBinderByIndex i pos binder = > case IM.lookup i binder of > { Nothing -> IM.insert i [(pos, pos)] binder > ; Just ranges -> > case ranges of > { [] -> IM.update (\_ -> Just [(pos,pos)]) i binder > ; ((b,e):rs) > | pos == e + 1 -> IM.update (\_ -> Just ((b,e+1):rs)) i binder > | pos > e + 1 -> IM.update (\_ -> Just ((pos,pos):(b,e):rs)) i binder > | otherwise -> error "impossible, the current letter position is smaller than the last recorded letter" > } > } > -} > {- > {-# INLINE updateBinderByIndex #-} > updateBinderByIndex :: Int -- ^ the indext of the pattern variable > -> Int -- ^ the letter position > -> Binder -> Binder > updateBinderByIndex i lpos binder = > updateBinderByIndexSub i lpos binder > > {-# INLINE updateBinderByIndexSub #-} > updateBinderByIndexSub :: Int -> Int -> Binder -> Binder > updateBinderByIndexSub idx pos [] = [] > updateBinderByIndexSub idx pos (x@(idx',(b,e):rs):xs) > -- | pos `seq` idx `seq` idx' `seq` xs `seq` False = undefined > | idx == idx' = if pos == (e + 1) > then (idx', (b, e+ 1):rs):xs > else if pos > (e + 1) > then (idx', (pos,pos):(b, e):rs):xs > else error "impossible, the current letter position is smaller than the last recorded letter" > | otherwise = -- idx `seq` pos `seq` xs `seq` > x:(updateBinderByIndexSub idx pos xs) > updateBinderByIndexSub idx pos (x@(idx',[]):xs) > -- | pos `seq` idx `seq` idx' `seq` xs `seq` False = undefined > | idx == idx' = ((idx', [(pos, pos)]):xs) > | otherwise = -- idx `seq` pos `seq` xs `seq` > x:(updateBinderByIndexSub idx pos xs) > -} > {- > {-| Function 'pdPat0' is the 'abstracted' form of the 'pdPat' function > It computes a set of pairs. Each pair consists a 'shape' of the partial derivative, and > an update function which defines the change of the pattern bindings from the 'source' pattern to > the resulting partial derivative. This is used in the compilation of the regular expression pattern -} > pdPat0 :: Pat -- ^ the source pattern > -> Letter -- ^ the letter to be "consumed" > -> [(Pat, Int -> Binder -> Binder)] > pdPat0 (PVar x w p) (l,idx) > | hasBinder p = > let pfs = pdPat0 p (l,idx) > in g `seq` pfs `seq` [ (PVar x [] pd, (\i -> (g i) . (f i) )) | (pd,f) <- pfs ] > | otherwise = -- p is not nested > let pds = partDeriv (strip p) l > in g `seq` pds `seq` if null pds then [] > else -- not PCRE [ (PVar x [] (PE (resToRE pds)), g) ] > [ (PVar x [] (PE pd), g) | pd <- pds ] > where g = updateBinderByIndex x > {- > | IM.null (toBinder p) = -- p is not nested > let pds = partDeriv (strip p) l > in g `seq` pds `seq` if null pds then [] > else [ (PVar x [] (PE (resToRE pds)), g) ] > | otherwise = > let pfs = pdPat0 p (l,idx) > in g `seq` pfs `seq` [ (PVar x [] pd, (\i -> (g i) . (f i) )) | (pd,f) <- pfs ] > where g = updateBinderByIndex x > -} > pdPat0 (PE r) (l,idx) = > let pds = partDeriv r l > in pds `seq` if null pds then [] > else [ (PE (resToRE pds), ( \_ -> id ) ) ] > pdPat0 (PStar p g) l = let pfs = pdPat0 p l > in pfs `seq` [ (PPair p' (PStar p g), f) | (p', f) <- pfs ] > pdPat0 (PPair p1 p2) l = > if (posEpsilon (strip p1)) > then if isGreedy p1 > then nub2 ([ (PPair p1' p2, f) | (p1' , f) <- pdPat0 p1 l ] ++ (pdPat0 p2 l)) > else nub2 ((pdPat0 p2 l) ++ [ (PPair p1' p2, f) | (p1' , f) <- pdPat0 p1 l ]) > else [ (PPair p1' p2, f) | (p1',f) <- pdPat0 p1 l ] > pdPat0 (PChoice ps g) l = > nub2 (concatMap (\p -> pdPat0 p l) ps) -- nub doesn't seem to be essential > nub2 :: Eq a => [(a,b)] -> [(a,b)] > nub2 = nubBy (\(p1,f1) (p2, f2) -> p1 == p2) > {-| Function 'pdPat0Sim' applies simplification to the results of 'pdPat0' -} > pdPat0Sim :: Pat -- ^ the source pattern > -> Letter -- ^ the letter to be "consumed" > -> [(Pat, Int -> Binder -> Binder)] > pdPat0Sim p l = > let pfs = pdPat0 p l > pfs' = pfs `seq` map (\(p,f) -> (simplify p, f)) pfs > in nub2 pfs' > -} > -- | mainly interested in simplifying epsilon, p --> p > -- could be made more optimal, e.g. (epsilon, epsilon) --> epsilon > instance Simplifiable Pat where > -- simplify :: Pat -> Pat > simplify (PVar i rs p) = PVar i rs (simplify p) > simplify (PPair p1 p2) = > let p1' = simplify p1 > p2' = simplify p2 > in if isEpsilon p1' > then p2' > else if isEpsilon p2' > then p1' > else PPair p1' p2' > simplify (PChoice ps g) = > let ps' = filter (not . isPhi) (map simplify ps) > in PChoice ps' g > simplify (PStar p g) = PStar (simplify p) g > simplify (PPlus p1 p2) = PPlus (simplify p1) (simplify p2) > simplify (PE r) = PE (map simplify r) > instance IsEpsilon Pat where > isEpsilon (PVar _ _ p) = isEpsilon p > isEpsilon (PE rs) = all isEpsilon rs > isEpsilon (PPair p1 p2) = (isEpsilon p1) && (isEpsilon p2) > isEpsilon (PChoice ps _) = all isEpsilon ps > isEpsilon (PStar p _) = isEpsilon p > isEpsilon (PPlus p1 p2) = isEpsilon p1 && isEpsilon p2 > isEpsilon (PEmpty _) = True > instance IsPhi Pat where > isPhi (PVar _ _ p) = isPhi p > isPhi (PE rs) = all isPhi rs > isPhi (PPair p1 p2) = (isPhi p1) || (isPhi p2) > isPhi (PChoice ps _) = all isPhi ps > isPhi (PStar p _) = False > isPhi (PPlus p1 p2) = isPhi p1 || isPhi p2 > isPhi (PEmpty _) = False