src/Text/Regex/XMLSchema/String.hs

-- ------------------------------------------------------------

{- |
   Module     : Text.Regex.XMLSchema.String
   Copyright  : Copyright (C) 2009 Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
   Stability  : experimental
   Portability: portable

   Convenient functions for W3C XML Schema Regular Expression Matcher.
   For internals see 'Text.Regex.XMLSchema.String.Regex'

   Grammar can be found under <http://www.w3.org/TR/xmlschema11-2/#regexs>

-}

-- ------------------------------------------------------------

module Text.Regex.XMLSchema.String
    ( GenRegex
    , Regex

    , grep
    , match
    , matchSubex
    , sed
    , split
    , splitSubex
    , tokenize
    , tokenize'
    , tokenizeSubex

    , matchRE
    , matchSubexRE
    , sedRE
    , splitRE
    , splitSubexRE
    , tokenizeRE
    , tokenizeRE'
    , tokenizeSubexRE

    , mkZero
    , mkUnit
    , mkSym1
    , mkSymRng
    , mkWord
    , mkDot
    , mkStar
    , mkAll
    , mkAlt
    , mkElse
    , mkSeq
    , mkSeqs
    , mkRep
    , mkRng
    , mkOpt
    , mkDiff
    , mkIsect
    , mkExor
    , mkCompl
    , mkBr
    , isZero
    , errRegex

    , parseRegex        -- re-export of Text.Regex.XMLSchema.String.RegexParser
    )
where

import Control.Arrow

import Data.List
import Data.Maybe

import Text.Regex.XMLSchema.String.Regex
import Text.Regex.XMLSchema.String.RegexParser

-- ------------------------------------------------------------

-- | split a string by taking the longest prefix matching a regular expression
--
-- @Nothing@ is returned in case there is no matching prefix,
-- else the pair of prefix and rest is returned

splitRE         :: (Eq l, Show l) => GenRegex l -> String -> Maybe (String, String)
splitRE re input
                = do
                  (sms, rest) <- splitWithRegex re input
                  return (snd . head $ sms, rest)

-- | convenient function for 'splitRE'
--
-- examples:
--
-- > split "a*b" "abc" = ("ab","c")
-- > split "a*"  "bc"  = ("", "bc")
-- > split "a+"  "bc"  = ("", "bc")
-- > split "["   "abc" = ("", "abc")

split           :: String -> String -> (String, String)
split re input  = fromMaybe ("", input)
                  . (splitRE . parseRegex $ re) $ input

-- ------------------------------------------------------------

-- | split a string by removing the longest prefix matching a regular expression
-- and then return the list of subexpressions found in the matching part
--
-- @Nothing@ is returned in case of no matching prefix,
-- else the list of pairs of labels and submatches and the
-- rest is returned

splitSubexRE    :: (Eq l, Show l) => GenRegex l -> String -> Maybe ([(l, String)], String)
splitSubexRE re input
                = do
                  (sms, rest) <- splitWithRegex re input
                  return (map (first fromJust) . drop 1 $ sms, rest)

-- | convenient function for 'splitSubex'
--
-- examples:
--
-- > splitSubex "({1}a*)b"  "abc" = ([("1","a")],"c")
-- > splitSubex "({2}a*)"   "bc"  = ([("2","")], "bc")
-- > splitSubex "({1}a|b)+" "abc" = ([("1","a"),("1","b")],"c")        -- subex 1 matches 2 times
-- >
-- > splitSubex ".*({x}a*)" "aa"  = ([("x",""),("x","a"),("x","aa")],"")
-- >                                                                   -- nondeterminism: 3 matches for a*
-- >
-- > splitSubex "({1}do)|({2}[a-z]+)" "do you know"
-- >                                = ([("1","do"),("2","do")]," you know")
-- >                                                                   -- nondeterminism: 2 matches for do
-- >
-- > splitSubex "({1}do){|}({2}[a-z]+)" "do you know"
-- >                                = ([("1","do")]," you know")
-- >                                                                   -- no nondeterminism with {|}: 1. match for do
-- >
-- > splitSubex "({1}a+)"   "bcd" = ([], "bcd")                        -- no match
-- > splitSubex "["         "abc" = ([], "abc")                        -- syntax error


splitSubex      :: String -> String -> ([(String,String)], String)
splitSubex re inp
                = fromMaybe ([], inp) . (splitSubexRE . parseRegex $ re) $ inp

-- ------------------------------------------------------------

-- | The function, that does the real work for 'tokenize'

tokenizeRE      :: (Eq l, Show l) => GenRegex l -> String -> [String]
tokenizeRE re
    = token''
    where
    re1         = mkDiff re mkUnit
    token''     = token' re  fcs
    token1''    = token' re1 fcs
    fcs         = firstChars re

    -- token'   :: (Eq l, Show l) => GenRegex l -> CharSet -> String -> [String]
    token' re' fcs' inp
        | null inp      = []
        | otherwise     = evalRes . splitWithRegexCS re' fcs' $ inp
        where
        evalRes Nothing = token'' (tail inp)            -- re does not match any prefix
        evalRes (Just (toks, rest))
            | null tok  = tok : token'' (tail rest)     -- re is nullable and only the empty prefix matches
                                                        -- discard one char and try again
            | otherwise = tok : token1'' rest           -- real token found, next token must not be empty
            where
            tok = snd . head $ toks

-- | split a string into tokens (words) by giving a regular expression
-- which all tokens must match.
--
-- Convenient function for 'tokenizeRE'
--
-- This can be used for simple tokenizers.
-- It is recommended to use regular expressions where the empty word does not match.
-- Else there will appear a lot of probably useless empty tokens in the output.
-- All none matching chars are discarded. If the given regex contains syntax errors,
-- @Nothing@ is returned
--
-- examples:
--
-- > tokenize "a" "aabba"      = ["a","a","a"]
-- > tokenize "a*" "aaaba"     = ["aaa","a"]
-- > tokenize "a*" "bbb"       = ["","",""]
-- > tokenize "a+" "bbb"       = []
-- >
-- > tokenize "a*b" ""         = []
-- > tokenize "a*b" "abc"      = ["ab"]
-- > tokenize "a*b" "abaab ab" = ["ab","aab","ab"]
-- >
-- > tokenize "[a-z]{2,}|[0-9]{2,}|[0-9]+[.][0-9]+" "ab123 456.7abc"
-- >                           = ["ab","123","456.7","abc"]
-- >
-- > tokenize "[a-z]*|[0-9]{2,}|[0-9]+[.][0-9]+" "cab123 456.7abc"
-- >                           = ["cab","123","456.7","abc"]
-- >
-- > tokenize "[^ \t\n\r]*" "abc def\t\n\rxyz"
-- >                           = ["abc","def","xyz"]
-- >
-- > tokenize ".*"   "\nabc\n123\n\nxyz\n"
-- >                           = ["","abc","123","","xyz"]
-- >
-- > tokenize ".*"             = lines
-- >
-- > tokenize "[^ \t\n\r]*"    = words

tokenize        :: String -> String -> [String]
tokenize        = tokenizeRE . parseRegex

-- ------------------------------------------------------------

-- | split a string into tokens and delimierter by giving a regular expression
-- wich all tokens must match
--
-- This is a generalisation of the above 'tokenizeRE' functions.
-- The none matching char sequences are marked with @Left@, the matching ones are marked with @Right@
--
-- If the regular expression contains syntax errors @Nothing@ is returned
--
-- The following Law holds:
--
-- > concat . map (either id id) . tokenizeRE' re == id

tokenizeRE'     :: (Eq l, Show l) => GenRegex l -> String -> [Either String String]
tokenizeRE' re
    = token'' ""
    where
    re1         = mkDiff re mkUnit
    token''     = token' re  fcs
    token1''    = token' re1 fcs
    fcs         = firstChars re

    -- token'   :: (Eq l, Show l) => GenRegex l -> CharSet -> String -> String -> [Either String String]
    token' re' fcs' unmatched inp
        | null inp      = addUnmatched []
        | otherwise     = evalRes . splitWithRegexCS re' fcs' $ inp
        where
        addUnmatched
            | null unmatched    = id
            | otherwise         = ((Left . reverse $ unmatched) :)

        addMatched t            = addUnmatched . ((Right t) :)

        evalRes Nothing = token'' ((head inp) : unmatched) (tail inp)                   -- re does not match any prefix

        evalRes (Just (toks, rest))
            | null tok  = addMatched tok $ token'' (take 1 rest) (tail rest)            -- re is nullable and only the empty prefix matches
                                                                                        -- discard one char and try again
            | otherwise = addMatched tok $ token1'' "" rest                             -- real token found, next token must not be empty
            where
            tok = snd . head $ toks

-- | convenient function for 'tokenizeRE''
--
-- When the regular expression parses as Zero, @[Left input]@ is returned, that means no tokens are found

tokenize'       :: String -> String -> [Either String String]
tokenize'       = tokenizeRE' . parseRegex

-- ------------------------------------------------------------

-- | split a string into tokens (pair of labels and words) by giving a regular expression
-- containing labeled subexpressions.
--
-- This function should not be called with regular expressions
-- without any labeled subexpressions. This does not make sense, because the result list
-- will always be empty.
--
-- Result is the list of matching subexpressions
-- This can be used for simple tokenizers.
-- At least one char is consumed by parsing a token.
-- The pairs in the result list contain the matching substrings.
-- All none matching chars are discarded. If the given regex contains syntax errors,
-- @Nothing@ is returned

tokenizeSubexRE :: (Eq l, Show l) => GenRegex l -> String -> [(l, String)]
tokenizeSubexRE re
    = token''
    where
    re1         = mkDiff re mkUnit
    token''     = token' re  fcs
    token1''    = token' re1 fcs
    fcs         = firstChars re

    -- token'   :: (Eq l, Show l) => GenRegex l -> CharSet -> String -> [(l,String)]
    token' re' fcs' inp
        | null inp      = []
        | otherwise     = evalRes . splitWithRegexCS re' fcs' $ inp
        where
        evalRes Nothing = token'' (tail inp)            -- re does not match any prefix
        evalRes (Just (toks, rest))
            | null tok  = res ++ token'' (tail rest)    -- re is nullable and only the empty prefix matches
            | otherwise = res ++ token1'' rest          -- token found, tokenize the rest
            where
            res = map (first fromJust) . tail $ toks
            tok = snd . head $ toks

-- | convenient function for 'tokenizeSubexRE' a string
--
-- examples:
--
-- > tokenizeSubex "({name}[a-z]+)|({num}[0-9]{2,})|({real}[0-9]+[.][0-9]+)"
-- >                 "cab123 456.7abc"
-- >                                  = [("name","cab")
-- >                                    ,("num","123")
-- >                                    ,("real","456.7")
-- >                                    ,("name","abc")]
-- >
-- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)"
-- >                 "12.34"          = [("real","12.34")
-- >                                    ,("n","12")
-- >                                    ,("f","34")]
-- >
-- > tokenizeSubex "({real}({n}[0-9]+)([.]({f}[0-9]+))?)"
-- >                  "12 34"         = [("real","12"),("n","12")
-- >                                    ,("real","34"),("n","34")]
-- >
-- > tokenizeSubex "({real}({n}[0-9]+)(([.]({f}[0-9]+))|({f})))"
-- >                  "12 34.56"      = [("real","12"),("n","12"),("f","")
-- >                                    ,("real","34.56"),("n","34"),("f","56")]

tokenizeSubex   :: String -> String -> [(String,String)]
tokenizeSubex   = tokenizeSubexRE . parseRegex

-- ------------------------------------------------------------

-- | sed like editing function
--
-- All matching tokens are edited by the 1. argument, the editing function,
-- all other chars remain as they are

sedRE           :: (Eq l, Show l) => (String -> String) ->  GenRegex l -> String -> String
sedRE edit re   = concatMap (either id edit) . tokenizeRE' re

-- | convenient function for 'sedRE'
--
-- examples:
--
-- > sed (const "b") "a" "xaxax"       = "xbxbx"
-- > sed (\ x -> x ++ x) "a" "xax"     = "xaax"
-- > sed undefined       "[" "xxx"     = "xxx"

sed             :: (String -> String) -> String -> String -> String
sed edit        = sedRE edit . parseRegex

-- ------------------------------------------------------------

-- | match a string with a regular expression

matchRE         :: (Eq l, Show l) => GenRegex l -> String -> Bool
matchRE         = matchWithRegex

-- | convenient function for 'matchRE'
--
-- Examples:
--
-- > match "x*" "xxx" = True
-- > match "x" "xxx"  = False
-- > match "[" "xxx"  = False

match           :: String -> String -> Bool
match           = matchWithRegex . parseRegex

-- ------------------------------------------------------------

-- | match a string with a regular expression
-- and extract subexpression matches

matchSubexRE            :: (Eq l, Show l) => GenRegex l -> String -> [(l, String)]
matchSubexRE re         = map (first fromJust) . fromMaybe [] . matchWithRegex' re

-- | convenient function for 'matchRE'
--
-- Examples:
--
-- > matchSubex "({1}x*)"                 "xxx"      = [("1","xxx")]
-- > matchSubex "({1}x*)"                 "y"        = []
-- > matchSubex "({w}[0-9]+)x({h}[0-9]+)" "800x600"  = [("w","800"),("h","600")]
-- > matchSubex "[" "xxx"                            = []

matchSubex              :: String -> String -> [(String, String)]
matchSubex              = matchSubexRE . parseRegex

-- ------------------------------------------------------------

-- | grep like filter for lists of strings
--
-- The regular expression may be prefixed with the usual context spec \"^\" for start of string,
-- and "\\<" for start of word.
-- and suffixed with \"$\" for end of text and "\\>" end of word.
-- Word chars are defined by the multi char escape sequence "\\w"
--
-- Examples
--
-- > grep "a"    ["_a_", "_a", "a_", "a", "_"]      => ["_a_", "_a", "a_", "a"]
-- > grep "^a"   ["_a_", "_a", "a_", "a", "_"]      => ["a_", "a"]
-- > grep "a$"   ["_a_", "_a", "a_", "a", "_"]      => ["_a", "a"]
-- > grep "^a$"  ["_a_", "_a", "a_", "a", "_"]      => ["a"]
-- > grep "\\<a" ["x a b", " ax ", " xa ", "xab"]   => ["x a b", " ax "]
-- > grep "a\\>" ["x a b", " ax ", " xa ", "xab"]   => ["x a b", " xa "]

grep                    :: String -> [String] -> [String]
grep re                 = filter (matchRE re')
                          where
                          re' = mkSeqs . concat $ [ startContext
                                                  , (:[]) . parseRegex $ re2
                                                  , endContext
                                                  ]
                          (startContext, re1)
                              | "^"   `isPrefixOf` re   = ([],                       tail   re)
                              | "\\<" `isPrefixOf` re   = ([parseRegex "(\\A\\W)?"], drop 2 re)
                              | otherwise               = ([mkStar mkDot],                  re)
                          (endContext, re2)
                              | "$"   `isSuffixOf` re1  = ([],                       init          re1)
                              | "\\>" `isSuffixOf` re1  = ([parseRegex "(\\W\\A)?"], init . init $ re1)
                              | otherwise               = ([mkStar mkDot],                         re1)

-- ------------------------------------------------------------