Copyright | Copyright (C) 2014- Uwe Schmidt |
---|---|

License | MIT |

Maintainer | Uwe Schmidt <uwe@fh-wedel.de> |

Stability | stable |

Portability | portable |

Safe Haskell | Safe |

Language | Haskell2010 |

Convenient functions for W3C XML Schema Regular Expression Matcher.
For internals see `Regex` and `Matching`.

Grammar can be found under http://www.w3.org/TR/xmlschema11-2/#regexs

- data GenRegex s
- type Regex = GenRegex String
- type RegexText = GenRegex Text
- type RegexTextLazy = GenRegex Text
- type RegexByteString = GenRegex ByteString
- type RegexByteStringLazy = GenRegex ByteString
- grep :: StringLike s => s -> [s] -> [s]
- grepExt :: StringLike s => s -> [s] -> [s]
- grepRE :: StringLike s => GenRegex s -> [s] -> [s]
- grepREwithLineNum :: StringLike s => GenRegex s -> [s] -> [(Int, s)]
- match :: StringLike s => s -> s -> Bool
- matchExt :: StringLike s => s -> s -> Bool
- matchSubex :: StringLike s => s -> s -> [(s, s)]
- sed :: StringLike s => (s -> s) -> s -> s -> s
- sedExt :: StringLike s => (s -> s) -> s -> s -> s
- split :: StringLike s => s -> s -> (s, s)
- splitExt :: StringLike s => s -> s -> (s, s)
- splitSubex :: StringLike s => s -> s -> ([(s, s)], s)
- tokenize :: StringLike s => s -> s -> [s]
- tokenizeExt :: StringLike s => s -> s -> [s]
- tokenize' :: StringLike s => s -> s -> [Either s s]
- tokenizeExt' :: StringLike s => s -> s -> [Either s s]
- tokenizeSubex :: StringLike s => s -> s -> [(s, s)]
- matchRE :: StringLike s => GenRegex s -> s -> Bool
- matchSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)]
- sedRE :: StringLike s => (s -> s) -> GenRegex s -> s -> s
- splitRE :: StringLike s => GenRegex s -> s -> Maybe (s, s)
- splitSubexRE :: StringLike s => GenRegex s -> s -> Maybe ([(s, s)], s)
- tokenizeRE :: StringLike s => GenRegex s -> s -> [s]
- tokenizeRE' :: StringLike s => GenRegex s -> s -> [Either s s]
- tokenizeSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)]
- mkZero :: s -> GenRegex s
- mkZero' :: StringLike s => String -> GenRegex s
- mkUnit :: GenRegex s
- mkSym1 :: StringLike s => Char -> GenRegex s
- mkSymRng :: StringLike s => Char -> Char -> GenRegex s
- mkWord :: StringLike s => [Char] -> GenRegex s
- mkDot :: GenRegex s
- mkStar :: StringLike s => GenRegex s -> GenRegex s
- mkAll :: StringLike s => GenRegex s
- mkAlt :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s
- mkElse :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s
- mkSeq :: GenRegex s -> GenRegex s -> GenRegex s
- mkSeqs :: [GenRegex s] -> GenRegex s
- mkRep :: StringLike s => Int -> GenRegex s -> GenRegex s
- mkRng :: StringLike s => Int -> Int -> GenRegex s -> GenRegex s
- mkOpt :: StringLike s => GenRegex s -> GenRegex s
- mkDiff :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s
- mkIsect :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s
- mkExor :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s
- mkCompl :: StringLike s => GenRegex s -> GenRegex s
- mkBr :: s -> GenRegex s -> GenRegex s
- mkBr' :: StringLike s => String -> GenRegex s -> GenRegex s
- isZero :: GenRegex s -> Bool
- errRegex :: StringLike s => GenRegex s -> s
- parseRegex :: StringLike s => s -> GenRegex s
- parseRegexExt :: StringLike s => s -> GenRegex s
- parseContextRegex :: StringLike s => (String -> GenRegex s) -> s -> GenRegex s

# Documentation

type RegexTextLazy = GenRegex Text Source #

type RegexByteString = GenRegex ByteString Source #

type RegexByteStringLazy = GenRegex ByteString Source #

grep :: StringLike s => s -> [s] -> [s] Source #

grep like filter for lists of strings

The regular expression may be prefixed with the usual context spec "^" for start of string and "\<" for start of word, and suffixed with "$" for end of text and "\>" for end of word. Word chars are defined by the multi char escape sequence "\w".

Examples

```
grep "a" ["_a_", "_a", "a_", "a", "_"] => ["_a_", "_a", "a_", "a"]
grep "^a" ["_a_", "_a", "a_", "a", "_"] => ["a_", "a"]
grep "a$" ["_a_", "_a", "a_", "a", "_"] => ["_a", "a"]
grep "^a$" ["_a_", "_a", "a_", "a", "_"] => ["a"]
grep "\\<a" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " ax "]
grep "a\\>" ["x a b", " ax ", " xa ", "xab"] => ["x a b", " xa "]
```

grepExt :: StringLike s => s -> [s] -> [s] Source #

grep with extended regular expressions

grepRE :: StringLike s => GenRegex s -> [s] -> [s] Source #

grep with already prepared Regex (usually with `parseContextRegex`)

grepREwithLineNum :: StringLike s => GenRegex s -> [s] -> [(Int, s)] Source #

grep with Regex and line numbers

match :: StringLike s => s -> s -> Bool Source #

convenient function for `matchRE`

Examples:

```
match "x*" "xxx" = True
match "x" "xxx" = False
match "[" "xxx" = False
```

matchExt :: StringLike s => s -> s -> Bool Source #

match with extended regular expressions

matchSubex :: StringLike s => s -> s -> [(s, s)] Source #

convenient function for `matchSubexRE`

Examples:

```
matchSubex "({1}x*)" "xxx" = [("1","xxx")]
matchSubex "({1}x*)" "y" = []
matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600" = [("w","800"),("h","600")]
matchSubex "[" "xxx" = []
```

sed :: StringLike s => (s -> s) -> s -> s -> s Source #

convenient function for `sedRE`

examples:

```
sed (const "b") "a" "xaxax" = "xbxbx"
sed (\ x -> x ++ x) "a" "xax" = "xaax"
sed undefined "[" "xxx" = "xxx"
```

sedExt :: StringLike s => (s -> s) -> s -> s -> s Source #

split :: StringLike s => s -> s -> (s, s) Source #

convenient function for `splitRE`

examples:

```
split "a*b" "abc" = ("ab","c")
split "a*" "bc" = ("", "bc") -- "a*" matches ""
split "a+" "bc" = ("", "bc") -- "a+" does not match, no split
split "[" "abc" = ("", "abc") -- "[" syntax error, no split
```

splitExt :: StringLike s => s -> s -> (s, s) Source #

split with extended syntax

splitSubex :: StringLike s => s -> s -> ([(s, s)], s) Source #

convenient function for `splitSubexRE`, uses extended syntax

examples:

```
splitSubex "({1}a*)b" "abc" = ([("1","a")],"c")
splitSubex "({2}a*)" "bc" = ([("2","")], "bc")
splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c") -- subex 1 matches 2 times
splitSubex ".*({x}a*)" "aa" = ([("x",""),("x","a"),("x","aa")],"") -- nondeterminism: 3 matches for a*
splitSubex "({1}do)|({2}[a-z]+)" "do you know" = ([("1","do"),("2","do")]," you know") -- nondeterminism: 2 matches for do
splitSubex "({1}do){|}({2}[a-z]+)" "do you know" = ([("1","do")]," you know") -- no nondeterminism with {|}: 1. match for do
splitSubex "({1}a+)" "bcd" = ([], "bcd") -- no match
splitSubex "[" "abc" = ([], "abc") -- syntax error
```

tokenize :: StringLike s => s -> s -> [s] Source #

split a string into tokens (words) by giving a regular expression which all tokens must match.

Convenient function for `tokenizeRE`

This can be used for simple tokenizers.
It is recommended to use regular expressions where the empty word does not match.
Else there will appear a lot of probably useless empty tokens in the output.
All non-matching chars are discarded. If the given regex contains syntax errors, `Nothing` is returned

examples:

```
tokenize "a" "aabba" = ["a","a","a"]
tokenize "a*" "aaaba" = ["aaa","a"]
tokenize "a*" "bbb" = ["","",""]
tokenize "a+" "bbb" = []
tokenize "a*b" "" = []
tokenize "a*b" "abc" = ["ab"]
tokenize "a*b" "abaab ab" = ["ab","aab","ab"]
tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc" = ["ab","123","456.7","abc"]
tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc" = ["cab","123","456.7","abc"]
tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz" = ["abc","def","xyz"]
tokenize ".*" "\nabc\n123\n\nxyz\n" = ["","abc","123","","xyz"]
tokenize ".*" = lines
tokenize "[^ \t\n\r]*" = words
```

tokenizeExt :: StringLike s => s -> s -> [s] Source #

tokenize with extended syntax

tokenize' :: StringLike s => s -> s -> [Either s s] Source #

convenient function for `tokenizeRE'`

When the regular expression parses as Zero, `[Left input]` is returned, that means no tokens are found

tokenizeExt' :: StringLike s => s -> s -> [Either s s] Source #

tokenizeSubex :: StringLike s => s -> s -> [(s, s)] Source #

convenient function for `tokenizeSubexRE`, with the regular expression given as a string

examples:

```
tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)" "cab123 456.7abc" = [("name","cab") ,("num","123") ,("real","456.7") ,("name","abc")]
tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" "12.34" = [("real","12.34") ,("n","12") ,("f","34")]
tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)" "12 34" = [("real","12"),("n","12") ,("real","34"),("n","34")]
tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))" "12 34.56" = [("real","12"),("n","12"),("f","") ,("real","34.56"),("n","34"),("f","56")]
```

matchSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)] Source #

match a string with a regular expression and extract subexpression matches

sedRE :: StringLike s => (s -> s) -> GenRegex s -> s -> s Source #

sed like editing function

All matching tokens are edited by the first argument, the editing function; all other chars remain as they are

splitRE :: StringLike s => GenRegex s -> s -> Maybe (s, s) Source #

split a string by taking the longest prefix matching a regular expression

`Nothing` is returned in case there is no matching prefix,
else the pair of prefix and rest is returned

splitSubexRE :: StringLike s => GenRegex s -> s -> Maybe ([(s, s)], s) Source #

split a string by removing the longest prefix matching a regular expression and then return the list of subexpressions found in the matching part

`Nothing` is returned in case of no matching prefix,
else the list of pairs of labels and submatches and the
rest is returned

tokenizeRE :: StringLike s => GenRegex s -> s -> [s] Source #

The function, that does the real work for `tokenize`

tokenizeRE' :: StringLike s => GenRegex s -> s -> [Either s s] Source #

split a string into tokens and delimiters by giving a regular expression which all tokens must match

This is a generalisation of the above `tokenizeRE` function.
The non-matching char sequences are marked with `Left`, the matching ones are marked with `Right`.

If the regular expression contains syntax errors `Nothing` is returned

The following Law holds:

concat . map (either id id) . tokenizeRE' re == id

tokenizeSubexRE :: StringLike s => GenRegex s -> s -> [(s, s)] Source #

split a string into tokens (pair of labels and words) by giving a regular expression containing labeled subexpressions.

This function should not be called with regular expressions without any labeled subexpressions. This does not make sense, because the result list will always be empty.

Result is the list of matching subexpressions.
This can be used for simple tokenizers.
At least one char is consumed by parsing a token.
The pairs in the result list contain the matching substrings.
All non-matching chars are discarded. If the given regex contains syntax errors, `Nothing` is returned

mkZero :: s -> GenRegex s Source #

construct the r.e. for the empty set. An (error-) message may be attached

mkSymRng :: StringLike s => Char -> Char -> GenRegex s Source #

construct an r.e. for an interval of chars

mkAll :: StringLike s => GenRegex s Source #

construct an r.e. for the set of all Unicode words

mkElse :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s Source #

construct the r.e. for r1{|}r2 (r1 orElse r2).

This represents the same r.e. as r1|r2, but when collecting the results of subexpressions in (...) and r1 succeeds, the subexpressions of r2 are discarded, so r1 matches are prioritized

example

```
splitSubex "({1}x)|({2}.)" "x" = ([("1","x"),("2","x")], "")
splitSubex "({1}x){|}({2}.)" "x" = ([("1","x")], "")
```

mkDiff :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s Source #

Construct difference r.e.: r1 {\} r2

example

```
match "[a-z]+{\\}bush" "obama" = True
match "[a-z]+{\\}bush" "clinton" = True
match "[a-z]+{\\}bush" "bush" = False -- not important any more
```

mkIsect :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s Source #

Construct r.e. for intersection: r1 {&} r2

example

```
match ".*a.*{&}.*b.*" "-a-b-" = True
match ".*a.*{&}.*b.*" "-b-a-" = True
match ".*a.*{&}.*b.*" "-a-a-" = False
match ".*a.*{&}.*b.*" "---b-" = False
```

mkExor :: StringLike s => GenRegex s -> GenRegex s -> GenRegex s Source #

Construct r.e. for exclusive or: r1 {^} r2

example

```
match "[a-c]+{^}[c-d]+" "abc" = True
match "[a-c]+{^}[c-d]+" "acdc" = False
match "[a-c]+{^}[c-d]+" "ccc" = False
match "[a-c]+{^}[c-d]+" "cdc" = True
```

mkCompl :: StringLike s => GenRegex s -> GenRegex s Source #

Construct the Complement of an r.e.: whole set of words - r

errRegex :: StringLike s => GenRegex s -> s Source #

parseRegex :: StringLike s => s -> GenRegex s Source #

parse a standard W3C XML Schema regular expression

parseRegexExt :: StringLike s => s -> GenRegex s Source #

parse an extended syntax W3C XML Schema regular expression

The syntax of the W3C XML Schema spec is extended by
further useful set operations, like intersection, difference, exor.
Subexpression match becomes possible with "named" pairs of parentheses.
The multi char escape sequence \a represents any Unicode char.
The multi char escape sequence \A represents any Unicode word (\A = \a*).
All syntactically wrong inputs are mapped to the Zero expression representing the
empty set of words. Zero contains as data field a string for an error message.
So error checking after parsing becomes possible by checking against Zero (`isZero` predicate)

parseContextRegex :: StringLike s => (String -> GenRegex s) -> s -> GenRegex s Source #

parse a regular expression surrounded by context spec

a leading `^` denotes start of text,
a trailing `$` denotes end of text,
a leading `\<` denotes word start,
a trailing `\>` denotes word end.

The first parameter is the regex parser (`parseRegex` or `parseRegexExt`)