>
A bytestring implementation of reg exp pattern matching using partial derivative / derivative
The POSIX matching policy is implemented by following the 'structure' of the reg-exp.
The pattern is follow annotated.
We do not break part the sub-pattern of the original reg, they are always grouped under the same var pattern.
>
> module Text.Regex.Deriv.ByteString.Posix
> ( Regex
> , CompOption(..)
> , ExecOption(..)
> , defaultCompOpt
> , defaultExecOpt
> , compile
> , execute
> , regexec
> ) where
> import System.IO.Unsafe
> import Data.List
> import Data.Char (ord)
> import GHC.Int
> import GHC.Arr
> import qualified Data.IntMap as IM
> import qualified Data.ByteString.Char8 as S
> import qualified Data.Map as M
> import Control.Monad
> import Text.Regex.Base(RegexOptions(..),RegexLike(..),MatchArray)
> import Text.Regex.Deriv.RE
> import Text.Regex.Deriv.Common (IsPhi(..), IsEpsilon(..))
> import Text.Regex.Deriv.Pretty (Pretty(..))
> import Text.Regex.Deriv.Common (Range(..), Letter, PosEpsilon(..), my_hash, my_lookup, GFlag(..), IsGreedy(..), preBinder, subBinder, mainBinder)
> import Text.Regex.Deriv.IntPattern (Pat(..), toBinder, Binder(..), strip, listifyBinder, Key(..))
> import Text.Regex.Deriv.Parse
> import qualified Text.Regex.Deriv.Dictionary as D (Dictionary(..), Key(..), insertNotOverwrite, lookupAll, empty, isIn, nub, member, lookup, insert)
> logger io = unsafePerformIO io
> type SRange = (Int,[Range])
> type CarryForward = IM.IntMap [Range]
> emptyCF = IM.empty
> combineCF :: CarryForward -> CarryForward -> CarryForward
> combineCF cf1 cf2 = IM.foldWithKey (\k r cf -> cf `seq` k `seq` r `seq` updateIfExist k r cf) cf2 cf1
>
> updateIfExist :: IM.Key -> [Range] -> CarryForward -> CarryForward
> updateIfExist k !r !cf =
> case IM.lookup k cf of
> { Just !r' -> IM.update (\_ -> Just (combineRange r r')) k cf
>
> ; Nothing -> IM.insert k r cf }
>
>
>
>
> combineRangeAcc :: [Range] -> [Range] -> [Range] -> [Range]
> combineRangeAcc acc [] rs2 = let rs2' = reverse rs2 in rs2' `seq` rs2' ++ acc
> combineRangeAcc acc rs1 [] = let rs1' = reverse rs1 in rs1' `seq` rs1' ++ acc
> combineRangeAcc acc ((r1@(Range b1 e1)):rs1) ((r2@(Range b2 e2)):rs2)
> | b1 == b2 && e1 >= e2 = let acc' = (Range b1 e1):acc
> in acc' `seq` combineRangeAcc acc' rs1 rs2
> | b1 == b2 && e2 >= e1 = let acc' = (Range b2 e2):acc
> in acc' `seq` combineRangeAcc acc' rs1 rs2
> | b1 == e2+1 = let acc' = (Range b2 e1):acc
> in acc' `seq` combineRangeAcc acc' rs1 rs2
> | b2 == e1+1 = let acc' = (Range b1 e2):acc
> in acc' `seq` (combineRangeAcc acc' rs1 rs2)
> | b1 > e2+1 = let acc' = (Range b2 e2):acc
> in acc' `seq` (combineRangeAcc acc' (r1:rs1) rs2)
> | b2 > e1+1 = let acc' = (Range b1 e1):acc
> in acc' `seq` (combineRangeAcc acc' rs1 (r2:rs2))
> | otherwise = error $ "unhandle combineRangeAcc:" ++ show (r1:rs1) ++ " vs " ++ show (r2:rs2)
> }
> combineRange :: [Range] -> [Range] -> [Range]
> combineRange [] rs2 = rs2
> combineRange rs1 [] = rs1
> combineRange ((r1@(Range b1 e1)):rs1) ((r2@(Range b2 e2)):rs2)
> | b1 == b2 && e1 >= e2 =
>
>
> [r1]
> | b1 == b2 && e2 >= e1 =
>
>
> [r2]
> | b1 == e2+1 =
>
>
> [r1]
> | b2 == e1+1 =
>
>
> [r2]
> | b1 > e2+1 =
>
>
> [r1]
> | b2 > e1+1 =
>
>
> [r2]
> | b1 >= b2 && e1 <= e2 = [r2]
> | b2 >= b1 && e2 <= e1 = [r1]
> | otherwise = error $ "unhandle combineRange:" ++ show (r1:rs1) ++ " vs " ++ show (r2:rs2)
>
> combineCFs :: [CarryForward] -> CarryForward
> combineCFs cfs = foldl' (\cf1 cf2 -> cf1 `combineCF` cf2) emptyCF cfs
> insertCF :: SRange -> CarryForward -> CarryForward
> insertCF (x,r) cf = IM.insert x r cf
> data SBinder = SChoice [SBinder] CarryForward
> | SPair SBinder SBinder CarryForward
> | SVar SRange SBinder CarryForward
> | SStar CarryForward
> | SRE CarryForward
> deriving Show
> toSBinder :: Pat -> SBinder
> toSBinder (PVar x w p) = SVar (x,[]) (toSBinder p) emptyCF
> toSBinder (PE rs) = SRE emptyCF
> toSBinder (PStar p g) = SStar emptyCF
> toSBinder (PPair p1 p2) = SPair (toSBinder p1) (toSBinder p2) emptyCF
> toSBinder (PChoice ps g) = SChoice (map toSBinder ps) emptyCF
The invariance:
The shapes of the input/output Pat and SBinder should be identical.
> dPat0 :: Pat -> Char -> [(Pat, Int -> SBinder -> SBinder)]
> dPat0 y@(PVar x w p) l =
> do { (!p',!f) <- dPat0 p l
> ; let f' !i !sb = case sb of
> { SVar (!v,!r) !sb' !cf -> let sb'' = f i sb'
> r' = updateRange i r
> r'' = r' `seq` r'
> in sb'' `seq` r'' `seq` SVar (v, r') sb'' cf
>
> }
> ; (!p'',!f'') <- simpFix (PVar x w p')
> ; if (p'' == (PVar x w p'))
> then return (PVar x w p', f')
> else return (p'', (\i sb ->
> let sb' = sb `seq` i `seq` f' i sb
> in sb' `seq` (f'' i sb')))
> }
> dPat0 (PE rs) l =
> let pds' = (concatMap (\r -> partDeriv r l) rs)
> pds = pds' `seq` nub pds'
> in pds `seq`
> if null pds then mzero
> else return (PE pds, (\_ !sb -> sb) )
> dPat0 (PStar p g) l =
> do { (!p', !f) <- dPat0 p l
> ; let emp = toSBinder p
> ; emp `seq`
> return (PPair p' (PStar p g), (\i sb -> i `seq` sb `seq`
> case sb of { SStar !cf -> let sb' = f i emp
> in sb' `seq`
> SPair sb' sb cf} ) )
> }
> dPat0 (PPair !p1 !p2) l
> | (posEpsilon (strip p1)) =
> let pf1 = dPat0 p1 l
> pf2 = dPat0 p2 l
> in case (pf1, pf2) of
> { ([], []) -> mzero
> ; ([], [(!p2',!f2')]) ->
> let rm = extract p1
> f !i !sb = case sb of
> { SPair !sb1 !sb2 !cf ->
> let sb1' = rm sb1
> cf' = sb1' `seq` combineCF sb1' cf
> sb2' = f2' i sb2
> in cf' `seq` sb2' `seq` carryForward cf' sb2' }
> in do { (!p2'',!f2'') <- simpFix p2'
> ; if p2'' == p2'
> then return (p2', f)
> else return (p2'', \i sb ->
> let sb' = i `seq` sb `seq` f i sb
> in sb' `seq` (f2'' i sb'))
> }
> ; ([(!p1',!f1')], []) ->
> let f !i !sb = case sb of
> { SPair !sb1 !sb2 !cf ->
> let sb1' = f1' i sb1
> in sb1' `seq`
> SPair sb1' sb2 cf }
> in do { (!p1'',!f1'') <- simpFix (PPair p1' p2)
> ; if (p1'' == (PPair p1' p2))
> then return (PPair p1' p2, f)
> else return (p1'', \i sb ->
> let sb' = i `seq` sb `seq` f i sb
> in sb' `seq` (f1'' i sb'))
> }
> ; _ | isGreedy p1 -> do
> { (!p1',!f1) <- dPat0 p1 l
> ; (!p2',!f2) <- dPat0 p2 l
> ; let rm = extract p1
> f !i !sb = case sb of
> { SPair !sb1 !sb2 !cf ->
> let sb1' = rm sb1
> sb1'' = f1 i sb1
> cf' = sb1' `seq` sb1' `combineCF` cf
> sb2' = f2 i sb2
> sb2'' = sb2' `seq` cf' `seq` carryForward cf' sb2'
> in sb1'' `seq` cf `seq` sb2 `seq` sb2'' `seq` SChoice [ SPair sb1'' sb2 cf, sb2'' ] emptyCF }
> ; (!p',!f') <- simpFix (PChoice [PPair p1' p2, p2'] Greedy)
> ; if (p' == (PChoice [PPair p1' p2, p2'] Greedy))
> then return (PChoice [PPair p1' p2, p2'] Greedy, f)
> else return (p', \i sb ->
> let sb' = i `seq` sb `seq` f i sb
> in sb' `seq` (f' i sb'))
> }
> | otherwise -> do
> { (!p1',!f1) <- dPat0 p1 l
> ; (!p2',!f2) <- dPat0 p2 l
> ; let rm = extract p1
> f !i !sb = case sb of
> { SPair !sb1 !sb2 !cf ->
> let sb1' = rm sb1
> sb2' = f2 i sb2
> cf1' = sb1' `seq` cf `seq`
> sb1' `combineCF` cf
> sb1'' = f1 i sb1
> sb2'' = cf1' `seq` sb2' `seq` carryForward cf1' sb2'
> in sb2'' `seq` sb1'' `seq` sb2 `seq`
> SChoice [sb2'', SPair sb1'' sb2 cf ] emptyCF }
> ; (!p',!f') <- simpFix (PChoice [p2' , PPair p1' p2] Greedy)
> ; if (p' == (PChoice [p2' , PPair p1' p2] Greedy))
> then return (PChoice [p2' , PPair p1' p2] Greedy, f)
> else return (p', \i sb -> let sb' = i `seq` sb `seq` f i sb
> in sb' `seq` (f' i sb'))
> }
> }
> | otherwise =
> do { (!p1',!f1) <- dPat0 p1 l
> ; let f !i !sb = case sb of { SPair !sb1 !sb2 !cf ->
> let sb1' = f1 i sb1
> in sb1' `seq` sb2 `seq` SPair sb1' sb2 cf }
> ; (!p',!f') <- simpFix (PPair p1' p2)
> ; if (p' == (PPair p1' p2))
> then return (PPair p1' p2, f)
> else return (p', \i sb ->
> let sb' = i `seq` sb `seq` f i sb
> in sb' `seq` (f' i sb'))
> }
> dPat0 (PChoice [] g) l = mzero
> dPat0 y@(PChoice [!p] g) l = do
> { (!p',!f') <- dPat0 p l
> ; let f !i !sb =
> case sb of { SChoice [!sb'] !cf -> let sb'' = (f' i sb') in sb'' `seq` carryForward cf sb''
> ; senv -> error $ "invariance is broken: " ++ pretty y ++ " vs " ++ show senv
> }
> ; (!p'',!f'') <- simpFix p'
> ; if (p'' == p')
> then return (p', f)
> else return (p'', \i sb ->
> let sb' = i `seq` sb `seq` f i sb
> in sb' `seq` (f'' i sb'))
> }
> dPat0 (PChoice !ps g) l =
> let pfs = map (\p -> p `seq` dPat0 p l) ps
> nubPF :: [[(Pat, Int -> SBinder -> SBinder)]] -> [(Pat, Int -> SBinder -> SBinder)]
> nubPF pfs = nub2Choice pfs M.empty
> in do
> { (!p,!f) <- pfs `seq` nubPF pfs
> ; (!p',!f') <- simpFix p
> ; if (p' == p)
> then return (p, f)
> else return (p', \i sb ->
> let sb' = i `seq` sb `seq` f i sb
> in sb' `seq` (f' i sb'))
> }
Turns a list of pattern x coercion pairs into a pchoice and a func, duplicate patterns are removed.
The first arg is a list of list of pair, because of the list monad generated by dPat0, each non-empty sub list is a singleton list.
The resulting func accept a SChoice pattern (cf to the input list of pattern).
-----------------------------------
{}, d |-nub PChoice {}, \i -> id
pfs .... todo
--------------------------------------------
{[]}\cup pfs , d |-nub PChoice {}, \i -> id
> nub2Choice :: [[(Pat, Int -> SBinder -> SBinder)]] -> M.Map Pat (Int -> SBinder -> SBinder) -> [(Pat, Int -> SBinder -> SBinder)]
> nub2Choice [] pDict = return (PChoice [] Greedy, (\i !sb -> sb ))
> nub2Choice ([]:pfs) pDict = do
> { (PChoice !ps !g, !f'') <- nub2Choice pfs pDict
> ; let f' !i !sb = case sb of
> { SChoice (s:ss) !cf -> ss `seq`
> f'' i $! (SChoice ss cf)
> ; _ -> error "nub2Choice coercion is applied to a non SChoice"
> }
> ; return (PChoice ps g, f')
> }
> nub2Choice ([(!p,!f)]:pfs) !pDict
> | isPhi (strip p) || p `M.member` pDict = do
> { (PChoice !ps !g, !f'') <- nub2Choice pfs pDict
> ; let f' !i !sb = case sb of
> { SChoice (s:ss) !cf -> ss `seq`
> f'' i $! (SChoice ss cf)
> ; _ -> error "nub2Choice coercion is applied to a non SChoice"
> }
> ; return (PChoice ps g, f')
> }
> | otherwise =
> case p of
> { PChoice !ps' !g' -> do
> { let fs' :: [Int -> SBinder -> SBinder]
> fs' = repeat (\i !sb -> sb )
> pfs' = zip ps' fs'
> pfs'' = pfs' `seq` map (\x -> [x]) pfs'
> pfs''' = pfs'' `seq` pfs'' ++ pfs
>
> ; (!p', !f'') <- pfs''' `seq` nub2Choice pfs''' pDict
> ; let f' !i !sb = case sb of
> { (SChoice (s:ss) !cf) -> s `seq` ss `seq`
> case (f i s) of
> { SChoice !ss'' !cf' -> let ss''' = map (\x -> x `seq` carryForward cf' x) ss''
> ss'''' = ss''' `seq` (ss''' ++ ss)
> in ss'''' `seq` f'' i $ (SChoice ss'''' cf)
> ; _ -> error "nub2Choice coercion is applied to a non SChoice" }
> ; _ -> error "nub2Choice coercion is applied to a non SChoice" }
> ; return (p', f')
> }
> ; _ ->
> do
> { let pDict' = M.insert p f pDict
> ; (PChoice !ps !g, !f'') <- pfs `seq` pDict' `seq` nub2Choice pfs pDict'
> ; let f' !i !sb = case sb of
> { SChoice (s:ss) !cf -> s `seq` ss `seq`
> let (SChoice !ss' !cf') = f'' i $ (SChoice ss cf)
> s' = f i s
> ss'' = s' `seq` ss' `seq` (s':ss')
> in ss'' `seq` cf' `seq` SChoice ss'' cf'
> ; _ -> error "nub2Choice coercion is applied to a non SChoice"
> }
> ; let ps' = (p:ps)
> ; ps' `seq` return (PChoice ps' g, f')
> }
> }
simplification
>
> simpFix :: Pat -> [(Pat, Int -> SBinder -> SBinder)]
> simpFix p = simp p
> simpFix' p f =
> case simp p of
> { [] -> []
> ; [(!p',!f')] ->
> if p' == p
> then [(p,f)]
> else simpFix' p' (\i sb -> let sb' = i `seq` sb `seq` f i sb
> in sb' `seq` (f' i sb))
> }
invariance: input / outoput of Int -> SBinder -> SBinder agree with simp's Pat input/ output
> simp :: Pat -> [(Pat, Int -> SBinder -> SBinder)]
> simp (PVar !x w !p) = do
> { (!p',!f') <- simp p
> ; case p' of
> { _ | p == p' -> return (PVar x w p,\_ !sb -> sb)
> | isPhi (strip p') -> mzero
> | otherwise -> let f i sb = i `seq` sb `seq` case sb of
> { SVar !vr !sb' !cf -> let sb'' = f' i sb'
> in sb'' `seq` SVar vr sb'' cf
> }
> in return (PVar x w p', f)
> }
> }
> simp y@(PPair !p1 !p2) = do
> { (!p1',!f1') <- simp p1
> ; (!p2',!f2') <- simp p2
> ; case (p1',p2') of
> { _ | isPhi p1' || isPhi p2' -> mzero
> | isEpsilon p1' ->
> let !rm = extract p1
> f !i !sb = case sb of
> { SPair !sb1 !sb2 !cf -> let cf' = rm sb1
> cf'' = cf' `seq` cf' `combineCF` cf
> sb2' = sb2 `seq` f2' i sb2
> in cf'' `seq` sb2' `seq` carryForward cf'' sb2' }
> in return (p2',f)
> | isEpsilon p2' ->
> let !rm = extract p2
> f !i !sb = case sb of
> { SPair !sb1 !sb2 !cf -> let cf' = rm sb2
> cf'' = cf' `seq` (cf' `combineCF` cf)
> sb1' = f1' i sb1
> in cf'' `seq` sb1' `seq` carryForward cf'' sb1' }
> in return (p1',f)
> | otherwise ->
> let f !i !sb = case sb of
> { SPair !sb1 !sb2 !cf -> let sb1' = (f1' i sb1)
> sb2' = (f2' i sb2)
> in sb1' `seq` sb2' `seq` SPair sb1' sb2' cf
> ; senv -> error $ "invariance broken: " ++ pretty y ++ " vs " ++ show senv }
> in return (PPair p1' p2', f)
> }
> }
> simp (PChoice [] g) = mzero
> simp (PChoice [!p] !g) = do
> { (!p',!f') <- simp p
> ; if isPhi p'
> then mzero
> else
> let f !i !sb =
> case sb of { SChoice [!sb'] !cf -> let sb'' = f' i sb'
> in sb'' `seq` carryForward cf sb'' }
> in return (p',f)
> }
> simp (PChoice !ps !g) =
> let pfs = map simp ps
> nubPF :: [[(Pat, Int -> SBinder -> SBinder)]] -> [(Pat, Int -> SBinder -> SBinder)]
> nubPF pfs = nub2Choice pfs M.empty
> in pfs `seq` nubPF pfs
> simp p = return (p,\_ !sb -> sb)
> carryForward :: CarryForward -> SBinder -> SBinder
> carryForward sr (SVar (v, r) sb' cf) = let cf' = combineCF cf sr
> in cf' `seq` SVar (v, r) sb' cf'
> carryForward sr (SRE cf) = let cf' = combineCF cf sr in cf' `seq` SRE cf'
> carryForward sr (SStar cf) = let cf' = combineCF cf sr in cf' `seq` SStar cf'
> carryForward sr (SPair sb1 sb2 cf) = let cf' = combineCF cf sr in cf' `seq` SPair sb1 sb2 cf'
> carryForward sr (SChoice sbs cf) = let cf' = combineCF cf sr in cf' `seq` SChoice sbs cf'
> carryForward sr sb2 = error ("trying to carry forward into a non-annotated pattern binder " ++ (show sb2))
> instance Ord Pat where
> compare (PVar x1 _ p1) (PVar x2 _ p2)
> | x1 == x2 = compare p1 p2
> | otherwise = compare x1 x2
> compare (PE r1) (PE r2) = compare r1 r2
> compare (PStar p1 _) (PStar p2 _) = compare p1 p2
> compare (PPair p1 p2) (PPair p3 p4) = let r = compare p1 p3 in case r of
> { EQ -> compare p2 p4
> ; _ -> r }
> compare (PChoice ps1 _) (PChoice ps2 _) =
> compare ps1 ps2
> compare p1 p2 = compare (assignInt p1) (assignInt p2)
> where assignInt (PVar _ _ _) = 0
> assignInt (PE _) = 1
> assignInt (PStar _ _) = 2
> assignInt (PPair _ _) = 3
> assignInt (PChoice _ _) = 4
extract a carry forward from the sbinder
> extract :: Pat -> SBinder -> CarryForward
> extract (PVar !x w !p) (SVar (_,!b) !sb !cf)
> | posEpsilon (strip p) = let cf' = extract p sb
> cf'' = cf' `seq` insertCF (x,b) cf'
> in cf'' `seq` (cf'' `combineCF` cf)
> | otherwise = IM.empty
> extract (PE rs) (SRE !cf) = cf
> extract (PStar p g) (SStar !cf) = cf
> extract (PPair !p1 !p2) (SPair !sb1 !sb2 !cf) = let cf1 = (extract p1 sb1)
> cf2 = (extract p2 sb2)
> cf' = cf1 `seq` cf2 `seq` (combineCF cf1 cf2)
> in cf' `seq` (cf' `combineCF` cf)
> extract (PChoice !ps !g) (SChoice !sbs !cf) = let psbs = zip ps sbs
> cf' = psbs `seq` (combineCFs $! map (\(!p,!sb) -> extract p sb) psbs)
> in cf' `seq` (cf' `combineCF` cf)
> extract p sb = error $ "Error: trying to extract" ++ (show sb) ++ " from " ++ (show p)
> updateRange :: Int -> [Range] -> [Range]
> updateRange !pos (rs_@((Range !b !e):rs)) =
> let e' = e + 1
> in e' `seq` case e' of
> _ | pos == e' -> let r = Range b e' in r `seq` rs `seq` (r:rs)
> | pos > e' ->
>
>
> let r = Range pos pos in r `seq` [r]
> | otherwise -> error "impossible, the current letter position is smaller than the last recorded letter"
> updateRange !pos [] = let r = Range pos pos in r `seq` [r]
> matchInner :: [(Pat, SBinder)] -> [(Char,Int)] -> [(Pat, SBinder)]
> matchInner pb [] = pb
> matchInner pb (l:ls) =
> do { (p,sb) <- pb
> ; (p',f) <- dPat0 p (fst l)
> ; matchInner [(p', f (snd l) sb)] ls
> }
> type Env = [SRange]
> match :: Pat -> [Char] -> [Env]
> match p w = do { (p',sb) <- matchInner [(p, toSBinder p)] (zip w [1..])
> ; sbinderToEnv p' sb }
> posixMatch :: Pat -> [Char] -> Maybe Env
> posixMatch p w = case match p w of
> { (e:_) -> Just e ; _ -> Nothing }
> match2 :: (Pat,FollowBy,IM.IntMap()) -> [Char] -> [MatchArray]
> match2 (p,fb,posixBinder) w =
> map (\env -> sbinderToMatchArray (length w) fb posixBinder (IM.fromList env)) (match p w)
get all envs from the sbinder
> sbinderToEnv :: Pat -> SBinder -> [Env]
> sbinderToEnv p sb =
> let cfs = sbinderToEnv' p sb
> in map IM.toList cfs
> sbinderToEnv' :: Pat -> SBinder -> [CarryForward]
> sbinderToEnv' _ (SChoice [] _) = []
> sbinderToEnv' (PChoice (p:ps) g) (SChoice (sb:sbs) cf)
> | posEpsilon (strip p) =
> do { cf' <- sbinderToEnv' p sb
> ; cf `seq` cf' `seq` return (combineCF cf cf') }
> | otherwise = sbinderToEnv' (PChoice ps g) (SChoice sbs cf)
> sbinderToEnv' (PPair p1 p2) (SPair sb1 sb2 cf) =
> do { cf1 <- sbinderToEnv' p1 sb1
> ; cf2 <- sbinderToEnv' p2 sb2
> ; cf1 `seq` cf2 `seq` cf `seq` return (combineCFs [cf1,cf2,cf]) }
> sbinderToEnv' (PVar x _ p) (SVar sr sb cf)
> | posEpsilon (strip p) = do { cf' <- sbinderToEnv' p sb
> ; let cf'' = cf' `seq` sr `seq` insertCF sr cf'
> ; cf `seq` cf'' `seq` return (cf `combineCF` cf'') }
> | otherwise = []
> sbinderToEnv' (PStar _ _) (SStar cf) = return cf
> sbinderToEnv' (PE _) (SRE cf) = return cf
> sbinderToEnv' p sb = error $ (pretty p) ++ " and " ++ (show sb)
>
> sbinderToEnv' :: Pat -> SBinder -> [Env]
> sbinderToEnv' _ (SChoice [] _) = []
> sbinderToEnv' (PChoice (p:ps) g) (SChoice (sb:sbs) cf)
> | posEpsilon (strip p) =
> do { env <- sbinderToEnv' p sb
> ; let env' = IM.toList cf
> ; env `seq` env' `seq` return (env ++ env') }
> | otherwise = sbinderToEnv' (PChoice ps g) (SChoice sbs cf)
> sbinderToEnv' (PPair p1 p2) (SPair sb1 sb2 cf) =
> do { e1 <- sbinderToEnv' p1 sb1
> ; e2 <- sbinderToEnv' p2 sb2
> ; let e3 = (IM.toList cf)
> ; e1 `seq` e2 `seq` e3 `seq` return (e1 ++ e2 ++ e3 ) }
> sbinderToEnv' (PVar x _ p) (SVar sr sb cf)
> | posEpsilon (strip p) = do { env <- sbinderToEnv' p sb
> ; let env' = env `seq` sr `seq` sr:env
> env'' = (IM.toList cf)
> ; env' `seq` env'' `seq` return (env' ++ env'') }
> | otherwise = []
> sbinderToEnv' (PStar _ _) (SStar cf) = let env = IM.toList cf in env `seq` [env]
> sbinderToEnv' (PE _) (SRE cf) = let env = IM.toList cf in env `seq` [env]
> sbinderToEnv' p sb = error $ (pretty p) ++ " and " ++ (show sb)
> sortEnvByVar :: Env -> Env
> sortEnvByVar env = let im = sortEnvByVar' env IM.empty
> in map (\(i,rs) -> (i, nub (sort rs) )) (IM.toList im)
> sortEnvByVar' :: Env -> IM.IntMap [Range] -> IM.IntMap [Range]
> sortEnvByVar' [] im = im
> sortEnvByVar' ((i,rs):srgs) im =
> case IM.lookup i im of
> { Just _ -> let im' = IM.update (\rs' -> Just $ rs ++ rs') i im
> in sortEnvByVar' srgs im'
> ; Nothing -> sortEnvByVar' srgs (IM.insert i rs im) }
> }
> type DfaTable = IM.IntMap (Int, Int -> SBinder -> SBinder, SBinder -> [Env])
> compilePat :: Pat -> (DfaTable,SBinder, SBinder -> [Env], [Int])
> compilePat p = let (t, sb, toEnv, finals) = buildDfaTable p
> in (t, sb, toEnv, finals)
> buildDfaTable :: Pat -> (DfaTable, SBinder, SBinder -> [Env], [Int])
> buildDfaTable p =
> let sig = sigmaRE (strip p)
>
>
> init_dict = M.insert p 0 M.empty
> (delta, mapping) = builder sig [] init_dict 0 [p]
>
>
> delta' = delta
> table = IM.fromList (map (\ (s,c,d,f,sb2env) -> (my_hash s c, (d,f,sb2env))) delta')
>
> finals = []
> in (table, toSBinder p, sbinderToEnv p, finals)
testing
> testp =
> let (Right (pp,posixBnd)) = parsePatPosix "^(((A|AB)(BAA|A))(AC|C))$"
> in pp
> testp2 =
> let (Right (pp,posixBnd)) = parsePatPosix "^(((A|AB)(BAA|A))(AC|C))$"
> fb = followBy pp
> in (pp,fb,posixBnd)
let sig = sigmaRE (strip testp)
let init_dict = M.insert testp (0::Int) M.empty
let (delta, mapping) = builder sig [] init_dict (0::Int) [testp]
let (allStates, delta, mapping) = builder sig [] [] init_dict (0::Int) [testp]
mapM_ (\p -> putStrLn (show p)) (sort allStates)
>
>
> builder :: [Char]
> -> [ (Int,Char,Int,Int -> SBinder -> SBinder, SBinder -> [Env] ) ]
> -> M.Map Pat Int
> -> Int
> -> [Pat]
> -> ([ (Int,Char,Int,Int -> SBinder -> SBinder, SBinder -> [Env] ) ], M.Map Pat Int)
> builder sig acc_delta dict max_id curr_pats
> | null curr_pats = (acc_delta, dict)
> | otherwise =
> let
>
> new_delta = [ p `seq` p' `seq` l `seq` f' `seq` g `seq` (p,l,p',f',g) | p <- curr_pats,
> l <- sig, (p',f') <- dPat0 p l, let g = sbinderToEnv p' ]
> new_pats = D.nub [ p' | (p,l,p',f',g) <- new_delta, not (p' `M.member` dict) ]
> (dict',max_id') = foldl' (\(d,id) p -> (M.insert p (id+1) d, id + 1)) (dict,max_id) new_pats
> acc_delta_next = acc_delta ++ (map (\(p,l,p',f,g) -> (getId dict' p, l, getId dict' p', f, g)) new_delta)
> in builder sig acc_delta_next dict' max_id' new_pats
> where getId :: M.Map Pat Int -> Pat -> Int
> getId m p = case M.lookup p m of
> { Just i -> i
> ; Nothing -> error "getId failed: this should not happen" }
> type Word = S.ByteString
> execDfa :: Int -> DfaTable -> Word -> [(Int, SBinder, SBinder -> [Env])] -> [(Int,SBinder, SBinder -> [Env])]
> execDfa cnt dfaTable w' [] = []
> execDfa cnt dfaTable w' currDfaStateSBinders =
> case S.uncons w' of
> Nothing -> currDfaStateSBinders
> Just (l,w) ->
> let ((i,sb,_):_) = currDfaStateSBinders
> k = my_hash i l
> in case IM.lookup k dfaTable of
> { Nothing -> []
> ; Just (j, f, sb2env) ->
> let sb' = sb `seq` f cnt sb
> nextDfaStateSBinders = j `seq` sb' `seq` sb2env `seq`
> [(j, sb',sb2env)]
> cnt' = cnt + 1
> in nextDfaStateSBinders `seq` cnt' `seq` w `seq`
> execDfa cnt' dfaTable w nextDfaStateSBinders
> }
x0 :: (x1 :: ( x2 :: (ab|a), x3 :: (baa|a)), x4 :: (ac|c))
> execPatMatch :: (DfaTable, SBinder, SBinder -> [Env], [Int], FollowBy, IM.IntMap ()) -> Word -> Maybe Env
> execPatMatch (dt, init_sbinder, init_sb2env, finals, _, posixBinder) w =
> let r = dt `seq` execDfa 0 dt w [(0, init_sbinder, init_sb2env)]
> in case r of
> { [] -> Nothing
> ; ((i,sb,sb2env):_) -> case (sb2env sb) of
> { [] -> Nothing
> ; (e:_) -> let e' = filter (\(x,_) -> x `IM.notMember` posixBinder) e
> in Just e'
> } }
> p4 = PVar 0 [] (PPair (PVar 1 [] ((PPair p_x p_y))) p_z)
> where p_x = PVar 2 [] (PE [(Choice [(L 'A'),(Seq (L 'A') (L 'B'))] Greedy)])
> p_y = PVar 3 [] (PE [(Choice [(Seq (L 'B') (Seq (L 'A') (L 'A'))), (L 'A')] Greedy)])
> p_z = PVar 4 [] (PE [(Choice [(Seq (L 'A') (L 'C')), (L 'C')] Greedy)])
x0 :: ( x1 :: ( x2 :: (x3:: a | x4 :: ab) | x5 :: b)* )
> p3 = PVar 0 [] (PStar ( PVar 1 [] ( PChoice [(PVar 2 [] (PChoice [p3,p4] Greedy)), p5] Greedy)) Greedy)
> where p3 = PVar 3 [] (PE [(L 'A')])
> p4 = PVar 4 [] (PE [(Seq (L 'A') (L 'B'))])
> p5 = PVar 5 [] (PE [(L 'B')])
>
>
>
> type Regex = (DfaTable, SBinder, SBinder -> [Env], [Int], FollowBy, IM.IntMap ())
-- todo: use the CompOption and ExecOption
> compile :: CompOption
> -> ExecOption
> -> S.ByteString
> -> Either String Regex
> compile compOpt execOpt bs =
> case parsePatPosix (S.unpack bs) of
> Left err -> Left ("parseRegex for Text.Regex.Deriv.ByteString failed:"++show err)
> Right (pat,posixBnd) ->
> Right (patToRegex pat posixBnd compOpt execOpt)
> patToRegex p posixBnd compOpt execOpt =
> let (t, sb, toEnv, finals) = compilePat p
> fb = followBy p
> in t `seq` sb `seq` toEnv `seq` finals `seq` fb `seq` posixBnd `seq` (t, sb, toEnv, finals, fb, posixBnd)
> execute :: Regex
> -> S.ByteString
> -> Either String (Maybe Env)
> execute r bs = Right (execPatMatch r bs)
> regexec :: Regex
> -> S.ByteString
> -> Either String (Maybe (S.ByteString, S.ByteString, S.ByteString, [S.ByteString]))
> regexec r bs =
> case execPatMatch r bs of
> Nothing -> Right Nothing
> Just env ->
> let pre = case lookup preBinder env of { Just e -> rg_collect_many bs e ; Nothing -> S.empty }
> post = case lookup subBinder env of { Just e -> rg_collect_many bs e ; Nothing -> S.empty }
> full_len = S.length bs
> pre_len = S.length pre
> post_len = S.length post
> main_len = full_len pre_len post_len
> main_and_post = S.drop pre_len bs
> main = main_and_post `seq` main_len `seq` S.take main_len main_and_post
> matched = map ((rg_collect_many bs) . snd) (filter (\(v,w) -> v > mainBinder && v < subBinder ) env)
> in
> Right (Just (pre,main,post,matched))
> rg_collect :: S.ByteString -> Range -> S.ByteString
> rg_collect w (Range i j) = S.take (j' i' + 1) (S.drop i' w)
> where i' = fromIntegral i
> j' = fromIntegral j
> rg_collect_many w rs = foldl' S.append S.empty $ map (rg_collect w) rs
>
>
> data CompOption = CompOption {
> caseSensitive :: Bool
> , multiline :: Bool
>
> , rightAssoc :: Bool
> , newSyntax :: Bool
> , lastStarGreedy :: Bool
>
>
> } deriving (Read,Show)
> data ExecOption = ExecOption {
> captureGroups :: Bool
> } deriving (Read,Show)
> instance RegexOptions Regex CompOption ExecOption where
> blankCompOpt = CompOption { caseSensitive = True
> , multiline = False
> , rightAssoc = True
> , newSyntax = False
> , lastStarGreedy = False
> }
> blankExecOpt = ExecOption { captureGroups = True }
> defaultCompOpt = CompOption { caseSensitive = True
> , multiline = True
> , rightAssoc = True
> , newSyntax = True
> , lastStarGreedy = False
> }
> defaultExecOpt = ExecOption { captureGroups = True }
> setExecOpts e r = undefined
> getExecOpts r = undefined
> instance RegexLike Regex S.ByteString where
>
> matchAll = execPatMatchArray
>
> matchOnce = posixPatMatchArray
>
>
>
>
>
> instance RegexLike Regex String where
>
> matchAll r s = execPatMatchArray r (S.pack s)
>
> matchOnce r s = posixPatMatchArray r (S.pack s)
>
>
>
>
> execPatMatchArray :: (DfaTable, SBinder, SBinder -> [Env], [Int], FollowBy, IM.IntMap ()) -> Word -> [MatchArray]
> execPatMatchArray (dt, init_sbinder, init_sb2env, finals, fb, posixBinder) w =
> let r = execDfa 0 dt w [(0, init_sbinder, init_sb2env)]
> in case r of
> { [] -> []
> ; ((i,sb,sb2env):_) -> map (\ env -> sbinderToMatchArray (S.length w) fb posixBinder (IM.fromList env)) (sb2env sb)
> }
> updateEmptyBinder b fb =
> let
> up b (x,y) = case IM.lookup x b of
> { Just (_:_) ->
> b
> ; Just [] ->
> case IM.lookup y b of
> { Just r@(_:_) -> let i = snd (last r)
> in IM.update (\_ -> Just [(i,i)]) x b
> ; _ -> b }
> ; Nothing -> b }
> in foldl' up b fb
> sbinderToMatchArray l fb posixBnd b =
> let
> subPatB = filter (\(x,_) -> x > mainBinder && x < subBinder && x `IM.notMember` posixBnd ) (listifyBinder b)
> mbPrefixB = IM.lookup preBinder b
> mbSubfixB = IM.lookup subBinder b
> mainB = case (mbPrefixB, mbSubfixB) of
> (Just [(Range _ x)], Just [(Range y _)]) -> (x, y x)
> (Just [(Range _ x)], _) -> (x, l x)
> (_, Just [(Range y _)]) -> (0, y)
> (_, _) -> (0, l)
> _ -> error (show (mbPrefixB, mbSubfixB) )
> rs = map snd subPatB
> rs' = map lastNonEmpty rs
> io = logger (print $ "\n" ++ show rs ++ " || " ++ show rs' ++ "\n")
> in
> listToArray (mainB:rs')
> where fromRange (Range b e) = (b, eb+1)
>
>
> lastNonEmpty [] = (1,0)
> lastNonEmpty rs = fromRange (last rs)
> listToArray l = listArray (0,length l1) l
> posixPatMatchArray :: (DfaTable, SBinder, SBinder -> [Env], [Int], FollowBy, IM.IntMap ()) -> Word -> Maybe MatchArray
> posixPatMatchArray compiled w =
> first (execPatMatchArray compiled w)
> where
> first (env:_) = return env
> first _ = Nothing
>
>
> type FollowBy = [(Int,Int)]
> followBy :: Pat -> FollowBy
> followBy p = map (\p -> (snd p, fst p)) (fst $ buildFollowBy p ([],[]))
>
> buildFollowBy :: Pat -> ([(Int,Int)], [Int]) -> ([(Int,Int)], [Int])
> buildFollowBy (PVar x w p) (acc, lefts) = let (acc', lefts') = buildFollowBy p (acc,lefts)
> in ([ (l,x) | l <- lefts] ++ acc', [x])
> buildFollowBy (PE r) x = x
> buildFollowBy (PStar p g) (acc, lefts) = buildFollowBy p (acc,lefts)
> buildFollowBy (PPair p1 p2) (acc, lefts) = let (acc',lefts') = buildFollowBy p1 (acc,lefts)
> in buildFollowBy p2 (acc',lefts')
> buildFollowBy (PChoice ps _) (acc, lefts) =
> foldl' (\(acc', lefts') p -> let (acc'', lefts'') = buildFollowBy p (acc',lefts)
> in (acc'', lefts' ++ lefts'')) (acc, []) ps