module Text.XML.HXT.RelaxNG.XmlSchema.Regex
( Regex
, chars
, charRngs
, mkZero
, mkUnit
, mkSym
, mkSym1
, mkSymRng
, mkDot
, mkStar
, mkAlt
, mkSeq
, mkRep
, mkRng
, mkOpt
, mkDif
, mkCompl
, nullable
, delta
, match
)
where
import Data.Maybe
data Regex = Zero String
| Unit
| Sym (Char -> Bool)
| Dot
| Star Regex
| Alt Regex Regex
| Seq Regex Regex
| Rep Int Regex
| Rng Int Int Regex
| Dif Regex Regex
chars :: (Char -> Bool) -> [Char]
chars p = filter p $ [minBound .. maxBound]
charRngs :: [Char] -> [(Char, Char)]
charRngs [] = []
charRngs (x:xs) = charRng x xs
where
charRng y [] = (x,y) : []
charRng y xs'@(x1:xs1)
| x1 == succ y = charRng x1 xs1
| otherwise = (x,y) : charRngs xs'
mkZero :: String -> Regex
mkZero = Zero
mkUnit :: Regex
mkUnit = Unit
mkSym :: (Char -> Bool) -> Regex
mkSym = Sym
mkSym1 :: Char -> Regex
mkSym1 c = mkSym (==c)
mkSymRng :: Char -> Char -> Regex
mkSymRng c1 c2
| c1 == minBound &&
c2 == maxBound = mkDot
| c1 <= c2 = mkSym $ ( \ x -> x >= c1 && x<= c2 )
| otherwise = mkZero $ "empty char range"
mkDot :: Regex
mkDot = Dot
mkStar :: Regex -> Regex
mkStar (Zero _) = mkUnit
mkStar e@Unit = e
mkStar e@(Star _e1) = e
mkStar (Rep 1 e1) = mkStar e1
mkStar e@(Alt _ _) = Star (rmStar e)
mkStar e = Star e
rmStar :: Regex -> Regex
rmStar (Alt e1 e2) = mkAlt (rmStar e1) (rmStar e2)
rmStar (Star e1) = rmStar e1
rmStar (Rep 1 e1) = rmStar e1
rmStar e1 = e1
mkAlt :: Regex -> Regex -> Regex
mkAlt e1 (Zero _) = e1
mkAlt (Zero _) e2 = e2
mkAlt e1@(Star Dot) _e2 = e1
mkAlt _e1 e2@(Star Dot) = e2
mkAlt (Sym p1) (Sym p2) = mkSym $ \ x -> p1 x || p2 x
mkAlt e1 e2@(Sym _) = mkAlt e2 e1
mkAlt e1@(Sym _) (Alt e2@(Sym _) e3) = mkAlt (mkAlt e1 e2) e3
mkAlt (Alt e1 e2) e3 = mkAlt e1 (mkAlt e2 e3)
mkAlt e1 e2 = Alt e1 e2
mkSeq :: Regex -> Regex -> Regex
mkSeq e1@(Zero _) _e2 = e1
mkSeq _e1 e2@(Zero _) = e2
mkSeq Unit e2 = e2
mkSeq e1 Unit = e1
mkSeq (Seq e1 e2) e3 = mkSeq e1 (mkSeq e2 e3)
mkSeq e1 e2 = Seq e1 e2
mkRep :: Int -> Regex -> Regex
mkRep 0 e = mkStar e
mkRep _ e@(Zero _) = e
mkRep _ e@Unit = e
mkRep i e = Rep i e
mkRng :: Int -> Int -> Regex -> Regex
mkRng 0 0 _e = mkUnit
mkRng 1 1 e = e
mkRng lb ub _e
| lb > ub = Zero $
"illegal range " ++
show lb ++ ".." ++ show ub
mkRng _l _u e@(Zero _) = e
mkRng _l _u e@Unit = e
mkRng lb ub e = Rng lb ub e
mkOpt :: Regex -> Regex
mkOpt = mkRng 0 1
mkDif :: Regex -> Regex -> Regex
mkDif e1@(Zero _) _e2 = e1
mkDif e1 (Zero _) = e1
mkDif _e1 (Star Dot) = mkZero "empty set in difference expr"
mkDif Dot (Sym p) = mkSym (not . p)
mkDif (Sym _) Dot = mkZero "empty set of chars in difference expr"
mkDif (Sym p1) (Sym p2)
| null . chars $ (\ x -> p1 x && not (p2 x))
= mkZero "empty set of chars in difference expr"
mkDif e1 e2 = Dif e1 e2
mkCompl :: Regex -> Regex
mkCompl = mkDif mkDot
instance Show Regex where
show (Zero s) = "{err:" ++ s ++ "}"
show Unit = "()"
show (Sym p)
| null (tail cs) &&
rng1 (head cs)
= escRng . head $ cs
| otherwise = "[" ++ concat cs' ++ "]"
where
rng1 (x,y) = x == y
cs = charRngs . chars $ p
cs' = map escRng cs
escRng (x, y)
| x == y = esc x
| succ x == y
= esc x ++ esc y
| otherwise
= esc x ++ "-" ++ esc y
esc x
| x `elem` "\\-[]{}()*+?.^"
= '\\':x:""
| x >= ' ' && x <= '~'
= x:""
| otherwise
= "&#" ++ show (fromEnum x) ++ ";"
show Dot = "."
show (Star e) = "(" ++ show e ++ ")*"
show (Alt e1 e2) = "(" ++ show e1 ++ "|" ++ show e2 ++ ")"
show (Seq e1 e2) = show e1 ++ show e2
show (Rep 1 e) = "(" ++ show e ++ ")+"
show (Rep i e) = "(" ++ show e ++ "){" ++ show i ++ ",}"
show (Rng 0 1 e) = "(" ++ show e ++ ")?"
show (Rng i j e) = "(" ++ show e ++ "){" ++ show i ++ "," ++ show j ++ "}"
show (Dif e1 e2) = "(" ++ show e1 ++ "-" ++ show e2 ++ ")"
nullable :: Regex -> Bool
nullable (Zero _) = False
nullable Unit = True
nullable (Sym _p) = False
nullable Dot = False
nullable (Star _) = True
nullable (Alt e1 e2) = nullable e1 ||
nullable e2
nullable (Seq e1 e2) = nullable e1 &&
nullable e2
nullable (Rep _i e) = nullable e
nullable (Rng i _ e) = i == 0 ||
nullable e
nullable (Dif e1 e2) = nullable e1 &&
not (nullable e2)
delta :: Regex -> Char -> Regex
delta e@(Zero _) _ = e
delta Unit c = mkZero $
"unexpected char " ++ show c
delta (Sym p) c
| p c = mkUnit
| otherwise = mkZero $
"unexpected char " ++ show c ++ ", expected: " ++ oneof ++ chars'
where
(cs, ds) = splitAt 40 (chars p)
oneof
| null (tail cs) = ""
| otherwise = "one of "
chars'
| null (tail cs) = "'" ++ cs ++ "'"
| null ds = "[" ++ cs ++ "]"
| otherwise = "[" ++ cs ++ "...]"
delta Dot _ = mkUnit
delta e@(Star e1) c = mkSeq (delta e1 c) e
delta (Alt e1 e2) c = mkAlt (delta e1 c) (delta e2 c)
delta (Seq e1 e2) c
| nullable e1 = mkAlt (mkSeq (delta e1 c) e2) (delta e2 c)
| otherwise = mkSeq (delta e1 c) e2
delta (Rep i e) c = mkSeq (delta e c) (mkRep (i1) e)
delta (Rng i j e) c = mkSeq (delta e c) (mkRng ((i1) `max` 0) (j1) e)
delta (Dif e1 e2) c = mkDif (delta e1 c) (delta e2 c)
delta' :: Regex -> String -> Regex
delta' = foldl delta
match :: Regex -> String -> Maybe String
match e
= res . delta' e
where
res (Zero err) = Just err
res re
| nullable re = Nothing
| otherwise = Just $ "input does not match " ++ show e