hxt-regex-xmlschema-9.2.0: A regular expression library for W3C XML Schema regular expressions

Portabilityportable
Stabilitystable
MaintainerUwe Schmidt <uwe@fh-wedel.de>
Safe HaskellSafe-Inferred

Text.Regex.XMLSchema.String

Description

Deprecated: use the more general Generic instead

Convenient functions for W3C XML Schema Regular Expression Matcher for Strings. A specialisation of Text.Regex.XMLSchema.Generic as compatibility module to old non generic version

Grammar can be found under http://www.w3.org/TR/xmlschema11-2/#regexs

Synopsis

Documentation

type Regex = GenRegex StringSource

grep :: String -> [String] -> [String]Source

grep like filter for lists of strings

The regular expression may be prefixed with the usual context spec "^" for start of string, and \< for start of word. and suffixed with "$" for end of text and \> end of word. Word chars are defined by the multi char escape sequence \w

Examples

 grep "a"    ["_a_", "_a", "a_", "a", "_"]      => ["_a_", "_a", "a_", "a"]
 grep "^a"   ["_a_", "_a", "a_", "a", "_"]      => ["a_", "a"]
 grep "a$"   ["_a_", "_a", "a_", "a", "_"]      => ["_a", "a"]
 grep "^a$"  ["_a_", "_a", "a_", "a", "_"]      => ["a"]
 grep "\\<a" ["x a b", " ax ", " xa ", "xab"]   => ["x a b", " ax "]
 grep "a\\>" ["x a b", " ax ", " xa ", "xab"]   => ["x a b", " xa "]

grepExt :: String -> [String] -> [String]Source

grep with extended regular expressions

grepRE :: Regex -> [String] -> [String]Source

grep with already prepared Regex (ususally with parseContextRegex)

grepREwithLineNum :: Regex -> [String] -> [(Int, String)]Source

grep with Regex and line numbers

match :: String -> String -> BoolSource

convenient function for matchRE

Examples:

 match "x*" "xxx" = True
 match "x" "xxx"  = False
 match "[" "xxx"  = False

matchExt :: String -> String -> BoolSource

match with extended regular expressions

matchSubex :: String -> String -> [(String, String)]Source

convenient function for matchRE

Examples:

 matchSubex "({1}x*)"                 "xxx"      = [("1","xxx")]
 matchSubex "({1}x*)"                 "y"        = []
 matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600"  = [("w","800"),("h","600")]
 matchSubex "[" "xxx"                            = []

sed :: (String -> String) -> String -> String -> StringSource

convenient function for sedRE

examples:

 sed (const "b") "a" "xaxax"       = "xbxbx"
 sed (\ x -> x ++ x) "a" "xax"     = "xaax"
 sed undefined       "[" "xxx"     = "xxx"

sedExt :: (String -> String) -> String -> String -> StringSource

split :: String -> String -> (String, String)Source

convenient function for splitRE

examples:

 split "a*b" "abc" = ("ab","c")
 split "a*"  "bc"  = ("", "bc")    -- "a*" matches ""
 split "a+"  "bc"  = ("", "bc")    -- "a+" does not match, no split
 split "["   "abc" = ("", "abc")   -- "["  syntax error, no split

splitExt :: String -> String -> (String, String)Source

split with extended syntax

splitSubex :: String -> String -> ([(String, String)], String)Source

convenient function for splitSubex, uses extended syntax

examples:

 splitSubex "({1}a*)b"  "abc" = ([("1","a")],"c")
 splitSubex "({2}a*)"   "bc"  = ([("2","")], "bc")
 splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c")        -- subex 1 matches 2 times

 splitSubex ".*({x}a*)" "aa"  = ([("x",""),("x","a"),("x","aa")],"")
                                                                   -- nondeterminism: 3 matches for a*

 splitSubex "({1}do)|({2}[a-z]+)" "do you know"
                                = ([("1","do"),("2","do")]," you know")
                                                                   -- nondeterminism: 2 matches for do

 splitSubex "({1}do){|}({2}[a-z]+)" "do you know"
                                = ([("1","do")]," you know")
                                                                   -- no nondeterminism with {|}: 1. match for do

 splitSubex "({1}a+)"   "bcd" = ([], "bcd")                        -- no match
 splitSubex "["         "abc" = ([], "abc")                        -- syntax error

tokenize :: String -> String -> [String]Source

split a string into tokens (words) by giving a regular expression which all tokens must match.

Convenient function for tokenizeRE

This can be used for simple tokenizers. It is recommended to use regular expressions where the empty word does not match. Else there will appear a lot of probably useless empty tokens in the output. All none matching chars are discarded. If the given regex contains syntax errors, Nothing is returned

examples:

 tokenize "a" "aabba"      = ["a","a","a"]
 tokenize "a*" "aaaba"     = ["aaa","a"]
 tokenize "a*" "bbb"       = ["","",""]
 tokenize "a+" "bbb"       = []

 tokenize "a*b" ""         = []
 tokenize "a*b" "abc"      = ["ab"]
 tokenize "a*b" "abaab ab" = ["ab","aab","ab"]

 tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc"
                           = ["ab","123","456.7","abc"]

 tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc"
                           = ["cab","123","456.7","abc"]

 tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz"
                           = ["abc","def","xyz"]

 tokenize ".*"   "\nabc\n123\n\nxyz\n"
                           = ["","abc","123","","xyz"]

 tokenize ".*"             = lines

 tokenize "[^ \t\n\r]*"    = words

tokenizeExt :: String -> String -> [String]Source

tokenize with extended syntax

tokenize' :: String -> String -> [Either String String]Source

convenient function for tokenizeRE'

When the regular expression parses as Zero, [Left input] is returned, that means no tokens are found

tokenizeExt' :: String -> String -> [Either String String]Source

tokenizeSubex :: String -> String -> [(String, String)]Source

convenient function for tokenizeSubexRE a string

examples:

 tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)"
                 "cab123 456.7abc"
                                  = [("name","cab")
                                    ,("num","123")
                                    ,("real","456.7")
                                    ,("name","abc")]

 tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)"
                 "12.34"          = [("real","12.34")
                                    ,("n","12")
                                    ,("f","34")]

 tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)"
                  "12 34"         = [("real","12"),("n","12")
                                    ,("real","34"),("n","34")]

 tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))"
                  "12 34.56"      = [("real","12"),("n","12"),("f","")
                                    ,("real","34.56"),("n","34"),("f","56")]

matchRE :: Regex -> String -> BoolSource

match a string with a regular expression

matchSubexRE :: Regex -> String -> [(String, String)]Source

match a string with a regular expression and extract subexpression matches

sedRE :: (String -> String) -> Regex -> String -> StringSource

sed like editing function

All matching tokens are edited by the 1. argument, the editing function, all other chars remain as they are

splitRE :: Regex -> String -> Maybe (String, String)Source

split a string by taking the longest prefix matching a regular expression

Nothing is returned in case there is no matching prefix, else the pair of prefix and rest is returned

splitSubexRE :: Regex -> String -> Maybe ([(String, String)], String)Source

split a string by removing the longest prefix matching a regular expression and then return the list of subexpressions found in the matching part

Nothing is returned in case of no matching prefix, else the list of pairs of labels and submatches and the rest is returned

tokenizeRE :: Regex -> String -> [String]Source

The function, that does the real work for tokenize

tokenizeRE' :: Regex -> String -> [Either String String]Source

split a string into tokens and delimierter by giving a regular expression wich all tokens must match

This is a generalisation of the above tokenizeRE functions. The none matching char sequences are marked with Left, the matching ones are marked with Right

If the regular expression contains syntax errors Nothing is returned

The following Law holds:

 concat . map (either id id) . tokenizeRE' re == id

tokenizeSubexRE :: Regex -> String -> [(String, String)]Source

split a string into tokens (pair of labels and words) by giving a regular expression containing labeled subexpressions.

This function should not be called with regular expressions without any labeled subexpressions. This does not make sense, because the result list will always be empty.

Result is the list of matching subexpressions This can be used for simple tokenizers. At least one char is consumed by parsing a token. The pairs in the result list contain the matching substrings. All none matching chars are discarded. If the given regex contains syntax errors, Nothing is returned

mkZero :: s -> GenRegex sSource

construct the r.e. for the empty set. An (error-) message may be attached

mkUnit :: GenRegex sSource

construct the r.e. for the set containing the empty word

mkSym1 :: StringLike s => Char -> GenRegex sSource

construct an r.e. for a single char set

mkSymRng :: StringLike s => Char -> Char -> GenRegex sSource

construct an r.e. for an intervall of chars

mkWord :: StringLike s => [Char] -> GenRegex sSource

mkSym generaized for strings

mkDot :: GenRegex sSource

construct an r.e. for the set of all Unicode chars

mkStar :: StringLike s => GenRegex s -> GenRegex sSource

construct r.e. for r*

mkAll :: StringLike s => GenRegex sSource

construct an r.e. for the set of all Unicode words

mkAlt :: StringLike s => GenRegex s -> GenRegex s -> GenRegex sSource

construct the r.e for r1|r2

mkElse :: StringLike s => GenRegex s -> GenRegex s -> GenRegex sSource

construct the r.e. for r1{|}r2 (r1 orElse r2).

This represents the same r.e. as r1|r2, but when collecting the results of subexpressions in (...) and r1 succeeds, the subexpressions of r2 are discarded, so r1 matches are prioritized

example

 splitSubex "({1}x)|({2}.)"   "x" = ([("1","x"),("2","x")], "")

 splitSubex "({1}x){|}({2}.)" "x" = ([("1","x")], "")

mkSeq :: GenRegex s -> GenRegex s -> GenRegex sSource

Construct the sequence r.e. r1.r2

mkSeqs :: [GenRegex s] -> GenRegex sSource

mkSeq extened to lists

mkRep :: StringLike s => Int -> GenRegex s -> GenRegex sSource

Construct repetition r{i,}

mkRng :: StringLike s => Int -> Int -> GenRegex s -> GenRegex sSource

Construct range r{i,j}

mkOpt :: StringLike s => GenRegex s -> GenRegex sSource

Construct option r?

mkDiff :: StringLike s => GenRegex s -> GenRegex s -> GenRegex sSource

Construct difference r.e.: r1 {\} r2

example

 match "[a-z]+{\\}bush" "obama"     = True
 match "[a-z]+{\\}bush" "clinton"   = True
 match "[a-z]+{\\}bush" "bush"      = False     -- not important any more

mkIsect :: StringLike s => GenRegex s -> GenRegex s -> GenRegex sSource

Construct r.e. for intersection: r1 {&} r2

example

 match ".*a.*{&}.*b.*" "-a-b-"  = True
 match ".*a.*{&}.*b.*" "-b-a-"  = True
 match ".*a.*{&}.*b.*" "-a-a-"  = False
 match ".*a.*{&}.*b.*" "---b-"  = False

mkExor :: StringLike s => GenRegex s -> GenRegex s -> GenRegex sSource

Construct r.e. for exclusive or: r1 {^} r2

example

 match "[a-c]+{^}[c-d]+" "abc"  = True
 match "[a-c]+{^}[c-d]+" "acdc" = False
 match "[a-c]+{^}[c-d]+" "ccc"  = False
 match "[a-c]+{^}[c-d]+" "cdc"  = True

mkCompl :: StringLike s => GenRegex s -> GenRegex sSource

Construct the Complement of an r.e.: whole set of words - r

mkBr :: s -> GenRegex s -> GenRegex sSource

Construct a labeled subexpression: ({label}r)

isZero :: GenRegex s -> BoolSource

parseRegex :: StringLike s => s -> GenRegex sSource

parse a standard W3C XML Schema regular expression

parseRegexExt :: StringLike s => s -> GenRegex sSource

parse an extended syntax W3C XML Schema regular expression

The Syntax of the W3C XML Schema spec is extended by further useful set operations, like intersection, difference, exor. Subexpression match becomes possible with "named" pairs of parentheses. The multi char escape sequence \a represents any Unicode char, The multi char escape sequence \A represents any Unicode word, (\A = \a*). All syntactically wrong inputs are mapped to the Zero expression representing the empty set of words. Zero contains as data field a string for an error message. So error checking after parsing becomes possible by checking against Zero (isZero predicate)

parseContextRegex :: StringLike s => (String -> GenRegex s) -> s -> GenRegex sSource

parse a regular expression surrounded by contenxt spec

a leading ^ denotes start of text, a trailing $ denotes end of text, a leading \< denotes word start, a trailing \> denotes word end.

The 1. param ist the regex parser (parseRegex or parseRegexExt)