-- ------------------------------------------------------------ {- | Module : Text.XML.HXT.RelaxNG.XmlSchema.String.RegexParser Copyright : Copyright (C) 2009 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : experimental Portability: portable W3C XML Schema Regular Expression Parser This parser supports the full W3C standard, the complete grammar can be found under -} -- ------------------------------------------------------------ module Text.Regex.XMLSchema.String.RegexParser ( parseRegex ) where import Data.Maybe import Text.ParserCombinators.Parsec import Text.Regex.XMLSchema.String.Unicode.Blocks import Text.Regex.XMLSchema.String.Unicode.CharProps import Text.Regex.XMLSchema.String.XML.CharProps import Text.Regex.XMLSchema.String.Regex import Text.Regex.XMLSchema.String.CharSet -- ------------------------------------------------------------ -- | parse a W3C XML Schema regular expression -- -- the Syntax of the W3C XML Schema spec is extended by -- further useful set operations, like intersection, difference, exor. -- Subexpression match becomes possible with \"named\" pairs of parentheses. -- The multi char escape sequence \\a represents any Unicode char, -- The multi char escape sequence \\A represents any Unicode word, (\\A = \\a*). -- All syntactically wrong inputs are mapped to the Zero expression representing the -- empty set of words. Zero contains as data field a string for an error message. -- So error checking after parsing becomes possible by checking against Zero ('isZero' predicate) parseRegex :: String -> Regex parseRegex = either (mkZero . ("syntax error: " ++) . show) id . parse ( do r <- regExp eof return r ) "" -- ------------------------------------------------------------ regExp :: Parser Regex regExp = branchList branchList :: Parser Regex branchList = do r1 <- orElseList rs <- many branchList1 return (foldr1 mkAlt $ r1:rs) -- union is associative, so we use right ass. -- as with seq, alt and exor where branchList1 = do _ <- char '|' orElseList orElseList :: Parser Regex orElseList = do r1 <- interleaveList rs <- many orElseList1 return (foldr1 mkElse $ r1:rs) -- orElse is associative, so we choose right ass. -- as with seq and alt ops where orElseList1 = do _ <- try (string "{|}") interleaveList interleaveList :: Parser Regex interleaveList = do r1 <- exorList rs <- many interleaveList1 return (foldr1 mkInterleave $ r1:rs) -- interleave is associative, so we choose right ass. -- as with seq and alt ops where interleaveList1 = do _ <- try (string "{:}") exorList exorList :: Parser Regex exorList = do r1 <- diffList rs <- many exorList1 return (foldr1 mkExor $ r1:rs) -- exor is associative, so we choose right ass. where exorList1 = do _ <- try (string "{^}") diffList diffList :: Parser Regex diffList = do r1 <- intersectList rs <- many diffList1 return (foldl1 mkDiff $ r1:rs) -- diff is not associative, so we choose left ass. where diffList1 = do _ <- try (string "{\\}") intersectList intersectList :: Parser Regex intersectList = do r1 <- seqList rs <- many intersectList1 return (foldr1 mkIsect $ r1:rs) where intersectList1 = do _ <- try (string "{&}") seqList seqList :: Parser Regex seqList = do rs <- many piece return $ mkSeqs rs piece :: Parser Regex piece = do r <- atom quantifier r quantifier :: Regex -> Parser Regex quantifier r = ( do _ <- char '?' return $ mkOpt r ) <|> ( do _ <- char '*' return $ mkStar r ) <|> ( do _ <- char '+' return $ mkRep 1 r ) <|> try ( do _ <- char '{' res <- quantity r _ <- char '}' return res ) <|> ( return r ) quantity :: Regex -> Parser Regex quantity r = do lb <- many1 digit quantityRest r (read lb) quantityRest :: Regex -> Int -> Parser Regex quantityRest r lb = ( do _ <- char ',' ub <- many digit return ( if null ub then mkRep lb r else mkRng lb (read ub) r ) ) <|> ( return $ mkRng lb lb r) atom :: Parser Regex atom = char1 <|> charClass <|> between (char '(') (char ')') regExp' regExp' :: Parser Regex regExp' = do lab <- option id (between (char '{') (char '}') label') r <- regExp return $ lab r where label' = do l <- many1 (satisfy (flip elemCS isXmlNameChar)) return $ mkBr l char1 :: Parser Regex char1 = do c <- satisfy (`notElem` ".\\?*+{}()|[]") return $ mkSym1 c charClass :: Parser Regex charClass = charClassEsc <|> charClassExpr <|> wildCardEsc charClassEsc :: Parser Regex charClassEsc = do _ <- char '\\' ( singleCharEsc <|> multiCharEsc <|> catEsc <|> complEsc ) singleCharEsc :: Parser Regex singleCharEsc = do c <- singleCharEsc' return $ mkSym1 c singleCharEsc' :: Parser Char singleCharEsc' = do c <- satisfy (`elem` "nrt\\|.?*+(){}-[]^") return $ maybe c id . lookup c . zip "ntr" $ "\n\r\t" multiCharEsc :: Parser Regex multiCharEsc = ( do c <- satisfy (`elem` es) return $ mkSym . fromJust . lookup c $ pm ) <|> ( do -- extension: \a represents the whole alphabet inclusive newline chars: \a == .|\n|\r _ <- char 'a' return mkDot ) <|> ( do -- extension: \A represents all words: \A == \a* or \A == (.|\n|\r)* _ <- char 'A' return mkAll ) where es = map fst pm pm = [ ('s', isXmlSpaceChar ) , ('S', compCS isXmlSpaceChar ) , ('i', isXmlNameStartChar ) , ('I', compCS isXmlNameStartChar ) , ('c', isXmlNameChar ) , ('C', compCS isXmlNameChar ) , ('d', isDigit ) , ('D', compCS isDigit ) , ('w', compCS isNotWord ) , ('W', isNotWord ) ] isDigit = rangeCS '0' '9' isNotWord = isUnicodeP `unionCS` isUnicodeZ `unionCS` isUnicodeC catEsc :: Parser Regex catEsc = do _ <- char 'p' s <- between (char '{') (char '}') charProp return $ mkSym s charProp :: Parser CharSet charProp = isCategory <|> isBlock isBlock :: Parser CharSet isBlock = do _ <- string "Is" name <- many1 (satisfy legalChar) case lookup name codeBlocks of Just b -> return $ uncurry rangeCS b Nothing -> fail $ "unknown Unicode code block " ++ show name where legalChar c = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || '-' == c isCategory :: Parser CharSet isCategory = do pr <- isCategory' return $ fromJust (lookup pr categories) categories :: [(String, CharSet)] categories = [ ("C", isUnicodeC ) , ("Cc", isUnicodeCc) , ("Cf", isUnicodeCf) , ("Co", isUnicodeCo) , ("Cs", isUnicodeCs) , ("L", isUnicodeL ) , ("Ll", isUnicodeLl) , ("Lm", isUnicodeLm) , ("Lo", isUnicodeLo) , ("Lt", isUnicodeLt) , ("Lu", isUnicodeLu) , ("M", isUnicodeM ) , ("Mc", isUnicodeMc) , ("Me", isUnicodeMe) , ("Mn", isUnicodeMn) , ("N", isUnicodeN ) , ("Nd", isUnicodeNd) , ("Nl", isUnicodeNl) , ("No", isUnicodeNo) , ("P", isUnicodeP ) , ("Pc", isUnicodePc) , ("Pd", isUnicodePd) , ("Pe", isUnicodePe) , ("Pf", isUnicodePf) , ("Pi", isUnicodePi) , ("Po", isUnicodePo) , ("Ps", isUnicodePs) , ("S", isUnicodeS ) , ("Sc", isUnicodeSc) , ("Sk", isUnicodeSk) , ("Sm", isUnicodeSm) , ("So", isUnicodeSo) , ("Z", isUnicodeZ ) , ("Zl", isUnicodeZl) , ("Zp", isUnicodeZp) , ("Zs", isUnicodeZs) ] isCategory' :: Parser String isCategory' = ( foldr1 (<|>) . map (uncurry prop) $ [ ('L', "ultmo") , ('M', "nce") , ('N', "dlo") , ('P', "cdseifo") , ('Z', "slp") , ('S', "mcko") , ('C', "cfon") ] ) "illegal Unicode character property" where prop c1 cs2 = do _ <- char c1 s2 <- option "" ( do c2 <- satisfy (`elem` cs2) return [c2] ) return $ c1:s2 complEsc :: Parser Regex complEsc = do _ <- char 'P' s <- between (char '{') (char '}') charProp return $ mkSym $ compCS s charClassExpr :: Parser Regex charClassExpr = between (char '[') (char ']') charGroup charGroup :: Parser Regex charGroup = do r <- ( negCharGroup -- a ^ at beginning denotes negation, not start of posCharGroup <|> posCharGroup ) s <- option (mkZero "") -- charClassSub ( do _ <- char '-' charClassExpr ) return $ mkDiff r s posCharGroup :: Parser Regex posCharGroup = do rs <- many1 (charRange <|> charClassEsc) return $ foldr1 mkAlt rs charRange :: Parser Regex charRange = try seRange <|> xmlCharIncDash seRange :: Parser Regex seRange = do c1 <- charOrEsc' _ <- char '-' c2 <- charOrEsc' return $ mkSymRng c1 c2 charOrEsc' :: Parser Char charOrEsc' = ( do _ <- char '\\' singleCharEsc' ) <|> satisfy (`notElem` "\\-[]") xmlCharIncDash :: Parser Regex xmlCharIncDash = try ( do -- dash is only allowed if not followed by a [, else charGroup differences do not parse correctly _ <- char '-' notFollowedBy (char '[') return $ mkSym1 '-' ) <|> ( do c <- satisfy (`notElem` "-\\[]") return $ mkSym1 c ) negCharGroup :: Parser Regex negCharGroup = do _ <- char '^' r <- posCharGroup return $ mkDiff mkDot r wildCardEsc :: Parser Regex wildCardEsc = do _ <- char '.' return . mkSym . compCS $ stringCS "\n\r" -- ------------------------------------------------------------