module Text.Regex.XMLSchema.String.RegexParser
( parseRegex )
where
import Data.Maybe
import Text.ParserCombinators.Parsec
import Text.Regex.XMLSchema.String.Unicode.Blocks
import Text.Regex.XMLSchema.String.Unicode.CharProps
import Text.Regex.XMLSchema.String.XML.CharProps
import Text.Regex.XMLSchema.String.Regex
import Text.Regex.XMLSchema.String.CharSet
parseRegex :: String -> Regex
parseRegex
= either (mkZero . ("syntax error: " ++) . show) id
.
parse ( do
r <- regExp
eof
return r
) ""
regExp :: Parser Regex
regExp = branchList
branchList :: Parser Regex
branchList
= do
r1 <- orElseList
rs <- many branchList1
return (foldr1 mkAlt $ r1:rs)
where
branchList1
= do
char '|'
orElseList
orElseList :: Parser Regex
orElseList
= do
r1 <- interleaveList
rs <- many orElseList1
return (foldr1 mkElse $ r1:rs)
where
orElseList1
= do
try (string "{|}")
interleaveList
interleaveList :: Parser Regex
interleaveList
= do
r1 <- exorList
rs <- many interleaveList1
return (foldr1 mkInterleave $ r1:rs)
where
interleaveList1
= do
try (string "{:}")
exorList
exorList :: Parser Regex
exorList
= do
r1 <- diffList
rs <- many exorList1
return (foldr1 mkExor $ r1:rs)
where
exorList1
= do
try (string "{^}")
diffList
diffList :: Parser Regex
diffList
= do
r1 <- intersectList
rs <- many diffList1
return (foldl1 mkDiff $ r1:rs)
where
diffList1
= do
try (string "{\\}")
intersectList
intersectList :: Parser Regex
intersectList
= do
r1 <- seqList
rs <- many intersectList1
return (foldr1 mkIsect $ r1:rs)
where
intersectList1
= do
try (string "{&}")
seqList
seqList :: Parser Regex
seqList
= do
rs <- many piece
return $ mkSeqs rs
piece :: Parser Regex
piece
= do
r <- atom
quantifier r
quantifier :: Regex -> Parser Regex
quantifier r
= ( do
_ <- char '?'
return $ mkOpt r )
<|>
( do
_ <- char '*'
return $ mkStar r )
<|>
( do
_ <- char '+'
return $ mkRep 1 r )
<|>
try ( do
char '{'
res <- quantity r
char '}'
return res
)
<|>
( return r )
quantity :: Regex -> Parser Regex
quantity r
= do
lb <- many1 digit
quantityRest r (read lb)
quantityRest :: Regex -> Int -> Parser Regex
quantityRest r lb
= ( do
_ <- char ','
ub <- many digit
return ( if null ub
then mkRep lb r
else mkRng lb (read ub) r
)
)
<|>
( return $ mkRng lb lb r)
atom :: Parser Regex
atom
= char1
<|>
charClass
<|>
between (char '(') (char ')') regExp'
regExp' :: Parser Regex
regExp'
= do
lab <- option id (between (char '{') (char '}') label')
r <- regExp
return $ lab r
where
label'
= do
l <- many1 (satisfy (flip elemCS isXmlNameChar))
return $ mkBr l
char1 :: Parser Regex
char1
= do
c <- satisfy (`notElem` ".\\?*+{}()|[]")
return $ mkSym1 c
charClass :: Parser Regex
charClass
= charClassEsc
<|>
charClassExpr
<|>
wildCardEsc
charClassEsc :: Parser Regex
charClassEsc
= do
_ <- char '\\'
( singleCharEsc
<|>
multiCharEsc
<|>
catEsc
<|>
complEsc )
singleCharEsc :: Parser Regex
singleCharEsc
= do
c <- singleCharEsc'
return $ mkSym1 c
singleCharEsc' :: Parser Char
singleCharEsc'
= do
c <- satisfy (`elem` "nrt\\|.?*+(){}-[]^")
return $ maybe c id . lookup c . zip "ntr" $ "\n\r\t"
multiCharEsc :: Parser Regex
multiCharEsc
= ( do
c <- satisfy (`elem` es)
return $ mkSym . fromJust . lookup c $ pm )
<|>
( do
char 'a'
return mkDot )
<|>
( do
char 'A'
return mkAll )
where
es = map fst pm
pm = [ ('s', isXmlSpaceChar )
, ('S', compCS isXmlSpaceChar )
, ('i', isXmlNameStartChar )
, ('I', compCS isXmlNameStartChar )
, ('c', isXmlNameChar )
, ('C', compCS isXmlNameChar )
, ('d', isDigit )
, ('D', compCS isDigit )
, ('w', compCS isNotWord )
, ('W', isNotWord )
]
isDigit = rangeCS '0' '9'
isNotWord = isUnicodeP
`unionCS`
isUnicodeZ
`unionCS`
isUnicodeC
catEsc :: Parser Regex
catEsc
= do
char 'p'
s <- between (char '{') (char '}') charProp
return $ mkSym s
charProp :: Parser CharSet
charProp
= isCategory
<|>
isBlock
isBlock :: Parser CharSet
isBlock
= do
_ <- string "Is"
name <- many1 (satisfy legalChar)
case lookup name codeBlocks of
Just b -> return $ uncurry rangeCS b
Nothing -> fail $ "unknown Unicode code block " ++ show name
where
legalChar c = 'A' <= c && c <= 'Z' ||
'a' <= c && c <= 'z' ||
'0' <= c && c <= '9' ||
'-' == c
isCategory :: Parser CharSet
isCategory
= do
pr <- isCategory'
return $ fromJust (lookup pr categories)
categories :: [(String, CharSet)]
categories
= [ ("C", isUnicodeC )
, ("Cc", isUnicodeCc)
, ("Cf", isUnicodeCf)
, ("Co", isUnicodeCo)
, ("Cs", isUnicodeCs)
, ("L", isUnicodeL )
, ("Ll", isUnicodeLl)
, ("Lm", isUnicodeLm)
, ("Lo", isUnicodeLo)
, ("Lt", isUnicodeLt)
, ("Lu", isUnicodeLu)
, ("M", isUnicodeM )
, ("Mc", isUnicodeMc)
, ("Me", isUnicodeMe)
, ("Mn", isUnicodeMn)
, ("N", isUnicodeN )
, ("Nd", isUnicodeNd)
, ("Nl", isUnicodeNl)
, ("No", isUnicodeNo)
, ("P", isUnicodeP )
, ("Pc", isUnicodePc)
, ("Pd", isUnicodePd)
, ("Pe", isUnicodePe)
, ("Pf", isUnicodePf)
, ("Pi", isUnicodePi)
, ("Po", isUnicodePo)
, ("Ps", isUnicodePs)
, ("S", isUnicodeS )
, ("Sc", isUnicodeSc)
, ("Sk", isUnicodeSk)
, ("Sm", isUnicodeSm)
, ("So", isUnicodeSo)
, ("Z", isUnicodeZ )
, ("Zl", isUnicodeZl)
, ("Zp", isUnicodeZp)
, ("Zs", isUnicodeZs)
]
isCategory' :: Parser String
isCategory'
= ( foldr1 (<|>) . map (uncurry prop) $
[ ('L', "ultmo")
, ('M', "nce")
, ('N', "dlo")
, ('P', "cdseifo")
, ('Z', "slp")
, ('S', "mcko")
, ('C', "cfon")
]
) <?> "illegal Unicode character property"
where
prop c1 cs2
= do
_ <- char c1
s2 <- option ""
( do
c2 <- satisfy (`elem` cs2)
return [c2] )
return $ c1:s2
complEsc :: Parser Regex
complEsc
= do
_ <- char 'P'
s <- between (char '{') (char '}') charProp
return $ mkSym $ compCS s
charClassExpr :: Parser Regex
charClassExpr
= between (char '[') (char ']') charGroup
charGroup :: Parser Regex
charGroup
= do
r <- ( negCharGroup
<|>
posCharGroup
)
s <- option (mkZero "")
( do
_ <- char '-'
charClassExpr
)
return $ mkDiff r s
posCharGroup :: Parser Regex
posCharGroup
= do
rs <- many1 (charRange <|> charClassEsc)
return $ foldr1 mkAlt rs
charRange :: Parser Regex
charRange
= try seRange
<|>
xmlCharIncDash
seRange :: Parser Regex
seRange
= do
c1 <- charOrEsc'
_ <- char '-'
c2 <- charOrEsc'
return $ mkSymRng c1 c2
charOrEsc' :: Parser Char
charOrEsc'
= ( do
_ <- char '\\'
singleCharEsc'
)
<|>
satisfy (`notElem` "\\-[]")
xmlCharIncDash :: Parser Regex
xmlCharIncDash
= try ( do
_ <- char '-'
notFollowedBy (char '[')
return $ mkSym1 '-'
)
<|>
( do
c <- satisfy (`notElem` "-\\[]")
return $ mkSym1 c
)
negCharGroup :: Parser Regex
negCharGroup
= do
_ <- char '^'
r <- posCharGroup
return $ mkDiff mkDot r
wildCardEsc :: Parser Regex
wildCardEsc
= do
char '.'
return . mkSym . compCS $ stringCS "\n\r"