module Text.Regex.XMLSchema.String.Regex
( Regex
, GenRegex
, mkZero
, mkUnit
, mkSym
, mkSym1
, mkSymRng
, mkWord
, mkDot
, mkStar
, mkAll
, mkAlt
, mkElse
, mkSeq
, mkSeqs
, mkRep
, mkRng
, mkOpt
, mkDiff
, mkIsect
, mkExor
, mkInterleave
, mkCompl
, mkBr
, isZero
, errRegex
, nullable
, nullable'
, delta1
, delta
, firstChars
, matchWithRegex
, matchWithRegex'
, splitWithRegex
, splitWithRegex'
, splitWithRegexCS
, splitWithRegexCS'
)
where
import Data.List
import Text.Regex.XMLSchema.String.CharSet
data GenRegex l = Zero String
| Unit
| Sym CharSet
| Dot
| Star (GenRegex l)
| Alt (GenRegex l) (GenRegex l)
| Else (GenRegex l) (GenRegex l)
| Seq (GenRegex l) (GenRegex l)
| Rep Int (GenRegex l)
| Rng Int Int (GenRegex l)
| Diff (GenRegex l) (GenRegex l)
| Isec (GenRegex l) (GenRegex l)
| Exor (GenRegex l) (GenRegex l)
| Intl (GenRegex l) (GenRegex l)
| Br (Label l) (GenRegex l) String
| Cbr (GenRegex l) [(Label l, String)]
deriving (Eq, Ord )
type Regex = GenRegex String
type Label l = Maybe l
type Nullable l = (Bool, [(Label l, String)])
mkZero :: String -> GenRegex l
mkZero = Zero
mkUnit :: GenRegex l
mkUnit = Unit
mkSym :: CharSet -> GenRegex l
mkSym s
| nullCS s = mkZero $ "empty char range"
| fullCS s = mkDot
| otherwise = Sym s
mkSym1 :: Char -> GenRegex l
mkSym1 = mkSym . singleCS
mkSymRng :: Char -> Char -> GenRegex l
mkSymRng c1 c2 = mkSym $ rangeCS c1 c2
mkWord :: [Char] -> GenRegex l
mkWord = mkSeqs . map mkSym1
mkDot :: GenRegex l
mkDot = Dot
mkAll :: Eq l => GenRegex l
mkAll = mkStar mkDot
mkStar :: Eq l => GenRegex l -> GenRegex l
mkStar (Zero _) = mkUnit
mkStar e@Unit = e
mkStar e@(Star _e1) = e
mkStar (Rep 1 e1) = mkStar e1
mkStar (Rep i e1)
| i == 1
||
nullable e1 = mkStar e1
mkStar e@(Rng _ _ e1)
| nullable e = mkStar e1
mkStar e@(Alt _ _) = Star (rmStar e)
mkStar e = Star e
rmStar :: Eq l => GenRegex l -> GenRegex l
rmStar (Alt e1 e2) = mkAlt (rmStar e1) (rmStar e2)
rmStar (Star e1) = rmStar e1
rmStar (Rep 1 e1) = rmStar e1
rmStar e1 = e1
mkAlt :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
mkAlt e1 (Zero _) = e1
mkAlt (Zero _) e2 = e2
mkAlt (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2
mkAlt e1 e2@(Sym _) = mkAlt e2 e1
mkAlt e1@(Sym _) (Alt e2@(Sym _) e3) = mkAlt (mkAlt e1 e2) e3
mkAlt (Sym _) e2@Dot = e2
mkAlt e1@(Star Dot) _e2 = e1
mkAlt _e1 e2@(Star Dot) = e2
mkAlt (Alt e1 e2) e3 = mkAlt e1 (mkAlt e2 e3)
mkAlt e1 e2
| e1 == e2 = e1
| otherwise = Alt e1 e2
mkElse :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
mkElse e1 (Zero _) = e1
mkElse (Zero _) e2 = e2
mkElse (Sym p1) (Sym p2) = mkSym $ p1 `unionCS` p2
mkElse e1@(Sym _) (Else e2@(Sym _) e3) = mkElse (mkElse e1 e2) e3
mkElse (Sym _) e2@Dot = e2
mkElse e1@(Star Dot) _e2 = e1
mkElse _e1 e2@(Star Dot) = e2
mkElse (Else e1 e2) e3 = mkElse e1 (mkElse e2 e3)
mkElse e1 e2
| e1 == e2 = e1
| otherwise = Else e1 e2
mkSeq :: GenRegex l -> GenRegex l -> GenRegex l
mkSeq e1@(Zero _) _e2 = e1
mkSeq _e1 e2@(Zero _) = e2
mkSeq Unit e2 = e2
mkSeq (Cbr e1 ss1) e2 = mkCbr (mkSeq e1 e2) ss1
mkSeq e1 Unit = e1
mkSeq (Seq e1 e2) e3 = mkSeq e1 (mkSeq e2 e3)
mkSeq e1 e2 = Seq e1 e2
mkSeqs :: [GenRegex l] -> GenRegex l
mkSeqs = foldr mkSeq mkUnit
mkRep :: Eq l => Int -> GenRegex l -> GenRegex l
mkRep 0 e = mkStar e
mkRep _ e@(Zero _) = e
mkRep _ e
| nullable e = mkStar e
mkRep i (Rep j e) = mkRep (i * j) e
mkRep i e = Rep i e
mkRng :: Int -> Int -> GenRegex l -> GenRegex l
mkRng 0 0 _e = mkUnit
mkRng 1 1 e = e
mkRng lb ub _e
| lb > ub = Zero $
"illegal range " ++
show lb ++ ".." ++ show ub
mkRng _l _u e@(Zero _) = e
mkRng _l _u e@Unit = e
mkRng lb ub e = Rng lb ub e
mkOpt :: GenRegex l -> GenRegex l
mkOpt = mkRng 0 1
mkDiff :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
mkDiff e1@(Zero _) _e2 = e1
mkDiff e1 (Zero _) = e1
mkDiff _e1 (Star Dot) = mkZero "empty set in difference expr"
mkDiff Dot (Sym p) = mkSym $ compCS p
mkDiff (Sym _) Dot = mkZero "empty set in difference expr"
mkDiff (Sym p1) (Sym p2) = mkSym $ p1 `diffCS` p2
mkDiff e1 e2
| e1 == e2 = mkZero "empty set in difference expr"
| otherwise = Diff e1 e2
mkCompl :: Eq l => GenRegex l -> GenRegex l
mkCompl (Zero _) = mkAll
mkCompl (Star Dot) = mkZero "empty set in compl expr"
mkCompl e = mkDiff (mkStar mkDot) e
mkIsect :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
mkIsect e1@(Zero _) _e2 = e1
mkIsect _e1 e2@(Zero _) = e2
mkIsect e1@(Unit) e2
| nullable e2 = e1
| otherwise = mkZero "intersection empty"
mkIsect e1 e2@(Unit) = mkIsect e2 e1
mkIsect (Sym p1) (Sym p2) = mkSym $ p1 `intersectCS` p2
mkIsect e1@(Sym _) Dot = e1
mkIsect Dot e2@(Sym _) = e2
mkIsect (Star Dot) e2 = e2
mkIsect e1 (Star Dot) = e1
mkIsect e1 e2
| e1 == e2 = e1
| otherwise = Isec e1 e2
mkExor :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
mkExor (Zero _) e2 = e2
mkExor e1 (Zero _) = e1
mkExor (Star Dot) _e2 = mkZero "empty set in exor expr"
mkExor _e1 (Star Dot) = mkZero "empty set in exor expr"
mkExor (Sym p1) (Sym p2) = mkSym $ p1 `exorCS` p2
mkExor (Sym p1) Dot = mkSym $ compCS p1
mkExor Dot (Sym p2) = mkSym $ compCS p2
mkExor e1 e2
| e1 == e2 = mkZero "empty set in exor expr"
| otherwise = Exor e1 e2
mkInterleave :: GenRegex l -> GenRegex l -> GenRegex l
mkInterleave e1@(Zero _) _ = e1
mkInterleave _ e2@(Zero _) = e2
mkInterleave (Unit) e2 = e2
mkInterleave e1 (Unit) = e1
mkInterleave e1 e2 = Intl e1 e2
mkBr0 :: Label l -> GenRegex l -> String -> GenRegex l
mkBr0 _ e@(Zero _) _ = e
mkBr0 l Unit s = mkCbr mkUnit [(l,reverse s)]
mkBr0 l e s = Br l e s
mkBr :: l -> GenRegex l -> GenRegex l
mkBr l e = mkBr0 (Just l) e ""
mkBr' :: GenRegex l -> GenRegex l
mkBr' e = mkBr0 Nothing e ""
mkCbr :: GenRegex l -> [(Label l, String)] -> GenRegex l
mkCbr e@(Zero _) _ = e
mkCbr (Cbr e ss1) ss = mkCbr e (ss ++ ss1)
mkCbr e ss = Cbr e ss
instance Show l => Show (GenRegex l) where
show (Zero e) = "{" ++ e ++ "}"
show Unit = "()"
show (Sym p)
| p == compCS (stringCS "\n\r")
= "."
| null (tail cs) &&
rng1 (head cs)
= escRng . head $ cs
| otherwise = "[" ++ concat cs' ++ "]"
where
rng1 (x,y) = x == y
cs = p
cs' = map escRng p
escRng (x, y)
| x == y = esc x
| succ x == y
= esc x ++ esc y
| otherwise
= esc x ++ "-" ++ esc y
esc x
| x `elem` "\\-[]{}()*+?.^"
= '\\':x:""
| x >= ' ' && x <= '~'
= x:""
| otherwise
= "&#" ++ show (fromEnum x) ++ ";"
show Dot = "\\a"
show (Star Dot) = "\\A"
show (Star e) = "(" ++ show e ++ "*)"
show (Alt e1 e2) = "(" ++ show e1 ++ "|" ++ show e2 ++ ")"
show (Else e1 e2) = "(" ++ show e1 ++ "{|}" ++ show e2 ++ ")"
show (Seq e1 e2) = "(" ++ show e1 ++ show e2 ++ ")"
show (Rep 1 e) = "(" ++ show e ++ "+)"
show (Rep i e) = "(" ++ show e ++ "{" ++ show i ++ ",})"
show (Rng 0 1 e) = "(" ++ show e ++ "?)"
show (Rng i j e) = "(" ++ show e ++ "{" ++ show i ++ "," ++ show j ++ "})"
show (Diff e1 e2) = "(" ++ show e1 ++ "{\\}" ++ show e2 ++ ")"
show (Isec e1 e2) = "(" ++ show e1 ++ "{&}" ++ show e2 ++ ")"
show (Exor e1 e2) = "(" ++ show e1 ++ "{^}" ++ show e2 ++ ")"
show (Intl e1 e2) = "(" ++ show e1 ++ "{:}" ++ show e2 ++ ")"
show (Br l e s) = "({" ++ showL l ++ (if null s
then ""
else "=" ++ reverse s
) ++ "}" ++ show e ++ ")"
show (Cbr e ss) = "([" ++ intercalate "," (map (\(l,s) -> showL l ++ "=" ++ s) ss) ++ "]"
++ show e ++
")"
showL :: Show l => Label l -> String
showL = rmq . maybe "" show
where
rmq ('\"':xs) = init xs
rmq xs = xs
isZero :: GenRegex l -> Bool
isZero (Zero _) = True
isZero _ = False
errRegex :: GenRegex l -> String
errRegex (Zero e) = e
errRegex _ = ""
nullable :: GenRegex l -> Bool
nullable = fst . nullable'
nullable' :: GenRegex l -> Nullable l
nullable' (Zero _) = (False, [])
nullable' Unit = (True, [])
nullable' Dot = (False, [])
nullable' (Sym _x) = (False, [])
nullable' (Star _e) = (True, [])
nullable' (Rep _i e) = nullable' e
nullable' (Rng i _ e) = (i == 0, []) `unionN` nullable' e
nullable' (Seq e1 e2) = nullable' e1 `isectN` nullable' e2
nullable' (Alt e1 e2) = nullable' e1 `unionN` nullable' e2
nullable' (Else e1 e2) = nullable' e1 `orElseN` nullable' e2
nullable' (Isec e1 e2) = nullable' e1 `isectN` nullable' e2
nullable' (Diff e1 e2) = nullable' e1 `diffN` nullable' e2
nullable' (Exor e1 e2) = nullable' e1 `exorN` nullable' e2
nullable' (Intl e1 e2) = nullable' e1 `isectN` nullable' e2
nullable' (Br l e s) = (True, [(l, reverse s)]) `isectN` nullable' e
nullable' (Cbr e ss) = (True, ss) `isectN` nullable' e
isectN :: Nullable l -> Nullable l -> Nullable l
isectN (True, ws1) (True, ws2) = (True, ws1 ++ ws2)
isectN _ _ = (False, [])
unionN :: Nullable l -> Nullable l -> Nullable l
unionN (False, _) (False, _) = (False, [])
unionN (_, ws1) (_, ws2) = (True, ws1 ++ ws2)
orElseN :: Nullable l -> Nullable l -> Nullable l
orElseN e1@(True, _ws1) _ = e1
orElseN _ e2 = e2
diffN :: Nullable l -> Nullable l -> Nullable l
diffN n1 (False, _) = n1
diffN _ _ = (False, [])
exorN :: Nullable l -> Nullable l -> Nullable l
exorN n1@(True, _) (False, _) = n1
exorN (False, _) n2@(True, _) = n2
exorN _ _ = (False, [])
firstChars :: GenRegex l -> CharSet
firstChars (Zero _) = emptyCS
firstChars Unit = emptyCS
firstChars (Sym p) = p
firstChars Dot = allCS
firstChars (Star e1) = firstChars e1
firstChars (Alt e1 e2) = firstChars e1 `unionCS` firstChars e2
firstChars (Else e1 e2) = firstChars e1 `unionCS` firstChars e2
firstChars (Seq e1 e2)
| nullable e1 = firstChars e1 `unionCS` firstChars e2
| otherwise = firstChars e1
firstChars (Rep _i e) = firstChars e
firstChars (Rng _i _j e) = firstChars e
firstChars (Diff e1 _e2) = firstChars e1
firstChars (Isec e1 e2) = firstChars e1 `intersectCS` firstChars e2
firstChars (Exor e1 e2) = firstChars e1 `unionCS` firstChars e2
firstChars (Intl e1 e2) = firstChars e1 `unionCS` firstChars e2
firstChars (Br _l e _s) = firstChars e
firstChars (Cbr e _ss) = firstChars e
delta1 :: Eq l => GenRegex l -> Char -> GenRegex l
delta1 e@(Zero _) _ = e
delta1 Unit c = mkZero $
"unexpected char " ++ show c
delta1 (Sym p) c
| c `elemCS` p = mkUnit
| otherwise = mkZero $
"unexpected char " ++ show c
delta1 Dot _ = mkUnit
delta1 e@(Star Dot) _ = e
delta1 e@(Star e1) c = mkSeq (delta1 e1 c) e
delta1 (Alt e1 e2) c = mkAlt (delta1 e1 c) (delta1 e2 c)
delta1 (Else e1 e2) c = mkElse (delta1 e1 c) (delta1 e2 c)
delta1 (Seq e1@(Br l e1' s) e2) c
| n = mkAlt (mkSeq (delta1 e1 c) e2)
(mkCbr (delta1 e2 c) ((l, reverse s) : ws))
where
(n, ws) = nullable' e1'
delta1 (Seq e1 e2) c
| nullable e1 = mkAlt (mkSeq (delta1 e1 c) e2)
(delta1 e2 c)
| otherwise = mkSeq (delta1 e1 c) e2
delta1 (Rep i e) c = mkSeq (delta1 e c) (mkRep (i1) e)
delta1 (Rng i j e) c = mkSeq (delta1 e c) (mkRng ((i1) `max` 0) (j1) e)
delta1 (Diff e1 e2) c = mkDiff (delta1 e1 c) (delta1 e2 c)
delta1 (Isec e1 e2) c = mkIsect (delta1 e1 c) (delta1 e2 c)
delta1 (Exor e1 e2) c = mkExor (delta1 e1 c) (delta1 e2 c)
delta1 (Intl e1 e2) c = mkAlt (mkInterleave (delta1 e1 c) e2 )
(mkInterleave e1 (delta1 e2 c))
delta1 (Br l e s) c = mkBr0 l (delta1 e c) (c:s)
delta1 (Cbr e ss) c = mkCbr (delta1 e c) ss
delta :: Eq l => GenRegex l -> String -> GenRegex l
delta = foldl' delta1
matchWithRegex :: Eq l => GenRegex l -> String -> Bool
matchWithRegex e = nullable . delta e
matchWithRegex' :: Eq l => GenRegex l -> String -> Maybe [(Label l,String)]
matchWithRegex' e = (\ (r, l) -> if r then Just l else Nothing) . nullable' . delta e
splitWithRegex :: Eq l => GenRegex l -> String -> Maybe ([(Label l,String)], String)
splitWithRegex re inp = do
(re', rest) <- splitWithRegex' (mkBr' re) inp
return ( snd . nullable' $ re', rest)
splitWithRegexCS :: Eq l => GenRegex l -> CharSet -> String -> Maybe ([(Label l,String)], String)
splitWithRegexCS re cs inp = do
(re', rest) <- splitWithRegexCS' (mkBr' re) cs inp
return ( snd . nullable' $ re', rest)
splitWithRegex' :: Eq l => GenRegex l -> String -> Maybe (GenRegex l, String)
splitWithRegex' re = splitWithRegex''
( if nullable re
then Just (re, "")
else Nothing
) re
splitWithRegex'' :: Eq l => Maybe (GenRegex l, String) -> GenRegex l -> String -> Maybe (GenRegex l, String)
splitWithRegex'' lastRes _re "" = lastRes
splitWithRegex'' lastRes re (c : inp')
| isZero re = lastRes
| otherwise = splitWithRegex'' nextRes re' $ inp'
where
re' = delta1 re c
nextRes
| nullable re' = Just (re', inp')
| otherwise = lastRes
splitWithRegexCS' :: Eq l => GenRegex l -> CharSet -> String -> Maybe (GenRegex l, String)
splitWithRegexCS' re cs inp@(c : _)
| c `elemCS` cs = splitWithRegex' re inp
splitWithRegexCS' re _cs inp
| nullable re = Just (re, inp)
| otherwise = Nothing