Portability | portable |
Stability | experimental |
Maintainer | Uwe Schmidt (uwe@fh-wedel.de) |
Convenient functions for W3C XML Schema Regular Expression Matcher.
For internals see Text.Regex.XMLSchema.String.Regex
Grammar can be found under http://www.w3.org/TR/xmlschema11-2/#regexs
- data GenRegex l
- type Regex = GenRegex String
- match :: String -> String -> Bool
- matchSubex :: String -> String -> [(String, String)]
- sed :: (String -> String) -> String -> String -> String
- split :: String -> String -> (String, String)
- splitSubex :: String -> String -> ([(String, String)], String)
- tokenize :: String -> String -> [String]
- tokenize' :: String -> String -> [Either String String]
- tokenizeSubex :: String -> String -> [(String, String)]
- matchRE :: (Eq l, Show l) => GenRegex l -> String -> Bool
- matchSubexRE :: (Eq l, Show l) => GenRegex l -> String -> [(l, String)]
- sedRE :: (Eq l, Show l) => (String -> String) -> GenRegex l -> String -> String
- splitRE :: (Eq l, Show l) => GenRegex l -> String -> Maybe (String, String)
- splitSubexRE :: (Eq l, Show l) => GenRegex l -> String -> Maybe ([(l, String)], String)
- tokenizeRE :: (Eq l, Show l) => GenRegex l -> String -> [String]
- tokenizeRE' :: (Eq l, Show l) => GenRegex l -> String -> [Either String String]
- tokenizeSubexRE :: (Eq l, Show l) => GenRegex l -> String -> [(l, String)]
- mkZero :: String -> GenRegex l
- mkUnit :: GenRegex l
- mkSym1 :: Char -> GenRegex l
- mkSymRng :: Char -> Char -> GenRegex l
- mkDot :: GenRegex l
- mkStar :: Eq l => GenRegex l -> GenRegex l
- mkAll :: Eq l => GenRegex l
- mkAlt :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
- mkElse :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
- mkSeq :: GenRegex l -> GenRegex l -> GenRegex l
- mkRep :: Eq l => Int -> GenRegex l -> GenRegex l
- mkRng :: Int -> Int -> GenRegex l -> GenRegex l
- mkOpt :: GenRegex l -> GenRegex l
- mkDiff :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
- mkIsect :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
- mkExor :: Eq l => GenRegex l -> GenRegex l -> GenRegex l
- mkCompl :: Eq l => GenRegex l -> GenRegex l
- mkBr :: l -> GenRegex l -> GenRegex l
- isZero :: GenRegex l -> Bool
- errRegex :: GenRegex l -> String
- parseRegex :: String -> Regex
match :: String -> String -> BoolSource
convenient function for matchRE
match "x*" "xxx" = True match "x" "xxx" = False match "[" "xxx" = False
matchSubex :: String -> String -> [(String, String)]Source
convenient function for matchSubexRE
matchSubex "({1}x*)" "xxx" = [("1","xxx")] matchSubex "({1}x*)" "y" = [] matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")] matchSubex "[" "xxx" = []
sed :: (String -> String) -> String -> String -> StringSource
convenient function for sedRE
sed (const "b") "a" "xaxax" = "xbxbx" sed (\ x -> x ++ x) "a" "xax" = "xaax" sed undefined "[" "xxx" = "xxx"
split :: String -> String -> (String, String)Source
convenient function for splitRE
split "a*b" "abc" = ("ab","c") split "a*" "bc" = ("", "bc") split "a+" "bc" = ("", "bc") split "[" "abc" = ("", "abc")
splitSubex :: String -> String -> ([(String, String)], String)Source
convenient function for splitSubexRE
splitSubex "({1}a*)b" "abc" = ([("1","a")],"c") splitSubex "({2}a*)" "bc" = ([("2","")], "bc") splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c") -- subex 1 matches 2 times splitSubex ".*({x}a*)" "aa" = ([("x",""),("x","a"),("x","aa")],"") -- nondeterminism: 3 matches for a* splitSubex "({1}do)|({2}[a-z]+)" "do you know" = ([("1","do"),("2","do")]," you know") -- nondeterminism: 2 matches for do splitSubex "({1}do){|}({2}[a-z]+)" "do you know" = ([("1","do")]," you know") -- no nondeterminism with {|}: 1. match for do splitSubex "({1}a+)" "bcd" = ([], "bcd") -- no match splitSubex "[" "abc" = ([], "abc") -- syntax error
tokenize :: String -> String -> [String]Source
split a string into tokens (words) by giving a regular expression which all tokens must match.
Convenient function for tokenizeRE
This can be used for simple tokenizers.
It is recommended to use regular expressions where the empty word does not match.
Else there will appear a lot of probably useless empty tokens in the output.
All non-matching chars are discarded. If the given regex contains syntax errors,
the empty list [] is returned
tokenize "a" "aabba" = ["a","a","a"] tokenize "a*" "aaaba" = ["aaa","a"] tokenize "a*" "bbb" = ["","",""] tokenize "a+" "bbb" = [] tokenize "a*b" "" = [] tokenize "a*b" "abc" = ["ab"] tokenize "a*b" "abaab ab" = ["ab","aab","ab"] tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc" = ["ab","123","456.7","abc"] tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc" = ["cab","123","456.7","abc"] tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz" = ["abc","def","xyz"] tokenize ".*" "\nabc\n123\n\nxyz\n" = ["","abc","123","","xyz"] tokenize ".*" = lines tokenize "[^ \t\n\r]*" = words
tokenize' :: String -> String -> [Either String String]Source
convenient function for tokenizeRE'
When the regular expression parses as Zero, [Left input]
is returned, that means no tokens are found
tokenizeSubex :: String -> String -> [(String, String)]Source
convenient function for tokenizeSubexRE,
taking the regular expression as a string
tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)" "cab123 456.7abc" = [("name","cab") ,("num","123") ,("real","456.7") ,("name","abc")] tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" "12.34" = [("real","12.34") ,("n","12") ,("f","34")] tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" "12 34" = [("real","12"),("n","12") ,("real","34"),("n","34")] tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))" "12 34.56" = [("real","12"),("n","12"),("f","") ,("real","34.56"),("n","34"),("f","56")]
matchRE :: (Eq l, Show l) => GenRegex l -> String -> BoolSource
match a string with a regular expression
matchSubexRE :: (Eq l, Show l) => GenRegex l -> String -> [(l, String)]Source
match a string with a regular expression and extract subexpression matches
sedRE :: (Eq l, Show l) => (String -> String) -> GenRegex l -> String -> StringSource
sed like editing function
All matching tokens are edited by the first argument, the editing function; all other chars remain as they are
splitRE :: (Eq l, Show l) => GenRegex l -> String -> Maybe (String, String)Source
split a string by taking the longest prefix matching a regular expression
Nothing is returned in case there is no matching prefix,
else the pair of prefix and rest is returned
splitSubexRE :: (Eq l, Show l) => GenRegex l -> String -> Maybe ([(l, String)], String)Source
split a string by removing the longest prefix matching a regular expression and then return the list of subexpressions found in the matching part
Nothing is returned in case of no matching prefix,
else the list of pairs of labels and submatches and the
rest is returned
tokenizeRE :: (Eq l, Show l) => GenRegex l -> String -> [String]Source
The function, that does the real work for tokenize
tokenizeRE' :: (Eq l, Show l) => GenRegex l -> String -> [Either String String]Source
split a string into tokens and delimiters by giving a regular expression which all tokens must match
This is a generalisation of the above tokenizeRE
The non-matching char sequences are marked with Left,
the matching ones are marked with Right
If the regular expression is a Zero expression (e.g. the result of parsing a regex with syntax errors), [Left input]
is returned
The following Law holds:
concat . map (either id id) . tokenizeRE' re == id
tokenizeSubexRE :: (Eq l, Show l) => GenRegex l -> String -> [(l, String)]Source
split a string into tokens (pair of labels and words) by giving a regular expression containing labeled subexpressions.
This function should not be called with regular expressions without any labeled subexpressions. This does not make sense, because the result list will always be empty.
Result is the list of matching subexpressions
This can be used for simple tokenizers.
At least one char is consumed by parsing a token.
The pairs in the result list contain the matching substrings.
All non-matching chars are discarded. If the given regex contains syntax errors,
the empty list [] is returned
mkZero :: String -> GenRegex lSource
construct the r.e. for the empty set. An (error-) message may be attached
mkElse :: Eq l => GenRegex l -> GenRegex l -> GenRegex lSource
construct the r.e. for r1{|}r2 (r1 orElse r2).
This represents the same r.e. as r1|r2, but when collecting the results of subexpressions in (...) and r1 succeeds, the subexpressions of r2 are discarded, so r1 matches are prioritized
splitSubex "({1}x)|({2}.)" "x" = ([("1","x"),("2","x")], "") splitSubex "({1}x){|}({2}.)" "x" = ([("1","x")], "")
mkDiff :: Eq l => GenRegex l -> GenRegex l -> GenRegex lSource
Construct difference r.e.: r1 {\} r2
match "[a-z]+{\\}bush" "obama" = True match "[a-z]+{\\}bush" "clinton" = True match "[a-z]+{\\}bush" "bush" = False -- not important any more
mkIsect :: Eq l => GenRegex l -> GenRegex l -> GenRegex lSource
Construct r.e. for intersection: r1 {&} r2
match ".*a.*{&}.*b.*" "-a-b-" = True match ".*a.*{&}.*b.*" "-b-a-" = True match ".*a.*{&}.*b.*" "-a-a-" = False match ".*a.*{&}.*b.*" "---b-" = False
mkExor :: Eq l => GenRegex l -> GenRegex l -> GenRegex lSource
Construct r.e. for exclusive or: r1 {^} r2
match "[a-c]+{^}[c-d]+" "abc" = True match "[a-c]+{^}[c-d]+" "acdc" = False match "[a-c]+{^}[c-d]+" "ccc" = False match "[a-c]+{^}[c-d]+" "cdc" = True
mkCompl :: Eq l => GenRegex l -> GenRegex lSource
Construct the Complement of an r.e.: whole set of words - r
parseRegex :: String -> RegexSource
parse a W3C XML Schema regular expression
the Syntax of the W3C XML Schema spec is extended by
further useful set operations, like intersection, difference, exor.
Subexpression match becomes possible with "named" pairs of parentheses.
The multi char escape sequence \a represents any Unicode char,
The multi char escape sequence \A represents any Unicode word, (\A = \a*).
All syntactically wrong inputs are mapped to the Zero expression representing the
empty set of words. Zero contains as data field a string for an error message.
So error checking after parsing becomes possible by checking against Zero (isZero