----------------------------------------------------------------------------- -- | -- Module : Data.SBV.RegExp -- Copyright : (c) Levent Erkok -- License : BSD3 -- Maintainer: erkokl@gmail.com -- Stability : experimental -- -- A collection of regular-expression related utilities. The recommended -- workflow is to import this module qualified as the names of the functions -- are specificly chosen to be common identifiers. Also, it is recommended -- you use the @OverloadedStrings@ extension to allow literal strings to be -- used as symbolic-strings and regular-expressions when working with -- this module. ----------------------------------------------------------------------------- {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} {-# OPTIONS_GHC -Wall -Werror #-} module Data.SBV.RegExp ( -- * Regular expressions RegExp(..) -- * Matching -- $matching , RegExpMatchable(..) -- * Constructing regular expressions -- ** Literals , exactly -- ** A class of characters , oneOf -- ** Spaces , newline, whiteSpaceNoNewLine, whiteSpace -- ** Separators , tab, punctuation -- ** Letters , asciiLetter, asciiLower, asciiUpper -- ** Digits , digit, octDigit, hexDigit -- ** Numbers , decimal, octal, hexadecimal, floating -- ** Identifiers , identifier ) where import Prelude hiding (length, take, elem, notElem, head) import qualified Prelude as P import qualified Data.List as L import Data.SBV.Core.Data import Data.SBV.Core.Model () -- instances only import Data.SBV.String import qualified Data.Char as C import Data.Proxy -- For testing only import Data.SBV.Char -- For doctest use only -- -- $setup -- >>> import Prelude hiding (length, take, elem, notElem, head) -- >>> import Data.SBV.Provers.Prover (prove, sat) -- >>> import Data.SBV.Core.Model -- >>> :set -XOverloadedStrings -- >>> :set -XScopedTypeVariables -- | Matchable class. Things we can match against a 'RegExp'. -- (TODO: Currently SBV does *not* optimize this call if the input is a concrete string or -- a character, but rather directly calls down to the solver. We might want to perform the -- operation on the Haskell side for performance reasons, should this become important.) -- -- For instance, you can generate valid-looking phone numbers like this: -- -- >>> :set -XOverloadedStrings -- >>> let dig09 = Range '0' '9' -- >>> let dig19 = Range '1' '9' -- >>> let pre = dig19 * Loop 2 2 dig09 -- >>> let post = dig19 * Loop 3 3 dig09 -- >>> let phone = pre * "-" * post -- >>> sat $ \s -> (s :: SString) `match` phone -- Satisfiable. Model: -- s0 = "100-1000" :: String class RegExpMatchable a where -- | @`match` s r@ checks whether @s@ is in the language generated by @r@. match :: a -> RegExp -> SBool -- | Matching a character simply means the singleton string matches the regex. instance RegExpMatchable SChar where match = match . singleton -- | Matching symbolic strings. instance RegExpMatchable SString where match input regExp = lift1 (StrInRe regExp) (Just (go regExp P.null)) input where -- This isn't super efficient, but it gets the job done. go :: RegExp -> (String -> Bool) -> String -> Bool go (Literal l) k s = l `L.isPrefixOf` s && k (P.drop (P.length l) s) go All _ _ = True go None _ _ = False go (Range _ _) _ [] = False go (Range a b) k (c:cs) = a <= c && c <= b && k cs go (Conc []) k s = k s go (Conc (r:rs)) k s = go r (go (Conc rs) k) s go (KStar r) k s = k s || go r (smaller (P.length s) (go (KStar r) k)) s go (KPlus r) k s = go (Conc [r, KStar r]) k s go (Opt r) k s = k s || go r k s go (Loop i j r) k s = go (Conc (replicate i r ++ replicate (j - i) (Opt r))) k s go (Union []) _ _ = False go (Union [x]) k s = go x k s go (Union (x:xs)) k s = go x k s || go (Union xs) k s go (Inter a b) k s = go a k s && go b k s -- In the KStar case, make sure the continuation is called with something -- smaller to avoid infinite recursion! smaller orig k inp = P.length inp < orig && k inp -- | A literal regular-expression, matching the given string exactly. Note that -- with @OverloadedStrings@ extension, you can simply use a Haskell -- string to mean the same thing, so this function is rarely needed. -- -- >>> prove $ \(s :: SString) -> s `match` exactly "LITERAL" .<=> s .== "LITERAL" -- Q.E.D. exactly :: String -> RegExp exactly = Literal -- | Helper to define a character class. -- -- >>> prove $ \(c :: SChar) -> c `match` oneOf "ABCD" .<=> sAny (c .==) (map literal "ABCD") -- Q.E.D. oneOf :: String -> RegExp oneOf xs = Union [exactly [x] | x <- xs] -- | Recognize a newline. Also includes carriage-return and form-feed. -- -- >>> newline -- (re.union (str.to.re "\n") (str.to.re "\r") (str.to.re "\f")) -- >>> prove $ \c -> c `match` newline .=> isSpace c -- Q.E.D. newline :: RegExp newline = oneOf "\n\r\f" -- | Recognize a tab. -- -- >>> tab -- (str.to.re "\x09") -- >>> prove $ \c -> c `match` tab .=> c .== literal '\t' -- Q.E.D. tab :: RegExp tab = oneOf "\t" -- | Recognize white-space, but without a new line. -- -- >>> whiteSpaceNoNewLine -- (re.union (str.to.re "\x09") (re.union (str.to.re "\v") (str.to.re "\xa0") (str.to.re " "))) -- >>> prove $ \c -> c `match` whiteSpaceNoNewLine .=> c `match` whiteSpace .&& c ./= literal '\n' -- Q.E.D. whiteSpaceNoNewLine :: RegExp whiteSpaceNoNewLine = tab + oneOf "\v\160 " -- | Recognize white space. -- -- >>> prove $ \c -> c `match` whiteSpace .=> isSpace c -- Q.E.D. whiteSpace :: RegExp whiteSpace = newline + whiteSpaceNoNewLine -- | Recognize a punctuation character. Anything that satisfies the predicate 'isPunctuation' will -- be accepted. (TODO: Will need modification when we move to unicode.) -- -- >>> prove $ \c -> c `match` punctuation .=> isPunctuation c -- Q.E.D. punctuation :: RegExp punctuation = oneOf $ filter C.isPunctuation $ map C.chr [0..255] -- | Recognize an alphabet letter, i.e., @A@..@Z@, @a@..@z@. -- -- >>> asciiLetter -- (re.union (re.range "a" "z") (re.range "A" "Z")) -- >>> prove $ \c -> c `match` asciiLetter .<=> toUpper c `match` asciiLetter -- Q.E.D. -- >>> prove $ \c -> c `match` asciiLetter .<=> toLower c `match` asciiLetter -- Q.E.D. asciiLetter :: RegExp asciiLetter = asciiLower + asciiUpper -- | Recognize an ASCII lower case letter -- -- >>> asciiLower -- (re.range "a" "z") -- >>> prove $ \c -> (c :: SChar) `match` asciiLower .=> c `match` asciiLetter -- Q.E.D. -- >>> prove $ \c -> c `match` asciiLower .=> toUpper c `match` asciiUpper -- Q.E.D. -- >>> prove $ \c -> c `match` asciiLetter .=> toLower c `match` asciiLower -- Q.E.D. asciiLower :: RegExp asciiLower = Range 'a' 'z' -- | Recognize an upper case letter -- -- >>> asciiUpper -- (re.range "A" "Z") -- >>> prove $ \c -> (c :: SChar) `match` asciiUpper .=> c `match` asciiLetter -- Q.E.D. -- >>> prove $ \c -> c `match` asciiUpper .=> toLower c `match` asciiLower -- Q.E.D. -- >>> prove $ \c -> c `match` asciiLetter .=> toUpper c `match` asciiUpper -- Q.E.D. asciiUpper :: RegExp asciiUpper = Range 'A' 'Z' -- | Recognize a digit. One of @0@..@9@. -- -- >>> digit -- (re.range "0" "9") -- >>> prove $ \c -> c `match` digit .<=> let v = digitToInt c in 0 .<= v .&& v .< 10 -- Q.E.D. digit :: RegExp digit = Range '0' '9' -- | Recognize an octal digit. One of @0@..@7@. -- -- >>> octDigit -- (re.range "0" "7") -- >>> prove $ \c -> c `match` octDigit .<=> let v = digitToInt c in 0 .<= v .&& v .< 8 -- Q.E.D. -- >>> prove $ \(c :: SChar) -> c `match` octDigit .=> c `match` digit -- Q.E.D. octDigit :: RegExp octDigit = Range '0' '7' -- | Recognize a hexadecimal digit. One of @0@..@9@, @a@..@f@, @A@..@F@. -- -- >>> hexDigit -- (re.union (re.range "0" "9") (re.range "a" "f") (re.range "A" "F")) -- >>> prove $ \c -> c `match` hexDigit .<=> let v = digitToInt c in 0 .<= v .&& v .< 16 -- Q.E.D. -- >>> prove $ \(c :: SChar) -> c `match` digit .=> c `match` hexDigit -- Q.E.D. hexDigit :: RegExp hexDigit = digit + Range 'a' 'f' + Range 'A' 'F' -- | Recognize a decimal number. -- -- >>> decimal -- (re.+ (re.range "0" "9")) -- >>> prove $ \s -> (s::SString) `match` decimal .=> sNot (s `match` KStar asciiLetter) -- Q.E.D. decimal :: RegExp decimal = KPlus digit -- | Recognize an octal number. Must have a prefix of the form @0o@\/@0O@. -- -- >>> octal -- (re.++ (re.union (str.to.re "0o") (str.to.re "0O")) (re.+ (re.range "0" "7"))) -- >>> prove $ \s -> s `match` octal .=> sAny (.== take 2 s) ["0o", "0O"] -- Q.E.D. octal :: RegExp octal = ("0o" + "0O") * KPlus octDigit -- | Recognize a hexadecimal number. Must have a prefix of the form @0x@\/@0X@. -- -- >>> hexadecimal -- (re.++ (re.union (str.to.re "0x") (str.to.re "0X")) (re.+ (re.union (re.range "0" "9") (re.range "a" "f") (re.range "A" "F")))) -- >>> prove $ \s -> s `match` hexadecimal .=> sAny (.== take 2 s) ["0x", "0X"] -- Q.E.D. hexadecimal :: RegExp hexadecimal = ("0x" + "0X") * KPlus hexDigit -- | Recognize a floating point number. The exponent part is optional if a fraction -- is present. The exponent may or may not have a sign. -- -- >>> prove $ \s -> s `match` floating .=> length s .>= 3 -- Q.E.D. floating :: RegExp floating = withFraction + withoutFraction where withFraction = decimal * "." * decimal * Opt expt withoutFraction = decimal * expt expt = ("e" + "E") * Opt (oneOf "+-") * decimal -- | For the purposes of this regular expression, an identifier consists of a letter -- followed by zero or more letters, digits, underscores, and single quotes. The first -- letter must be lowercase. -- -- >>> prove $ \s -> s `match` identifier .=> isAsciiLower (head s) -- Q.E.D. -- >>> prove $ \s -> s `match` identifier .=> length s .>= 1 -- Q.E.D. identifier :: RegExp identifier = asciiLower * KStar (asciiLetter + digit + "_" + "'") -- | Lift a unary operator over strings. lift1 :: forall a b. (SymVal a, SymVal b) => StrOp -> Maybe (a -> b) -> SBV a -> SBV b lift1 w mbOp a | Just cv <- concEval1 mbOp a = cv | True = SBV $ SVal k $ Right $ cache r where k = kindOf (Proxy @b) r st = do sva <- sbvToSV st a newExpr st k (SBVApp (StrOp w) [sva]) -- | Concrete evaluation for unary ops concEval1 :: (SymVal a, SymVal b) => Maybe (a -> b) -> SBV a -> Maybe (SBV b) concEval1 mbOp a = literal <$> (mbOp <*> unliteral a) -- | Quiet GHC about testing only imports __unused :: a __unused = undefined isSpace length take elem notElem head {- $matching A symbolic string or a character ('SString' or 'SChar') can be matched against a regular-expression. Note that the regular-expression itself is not a symbolic object: It's a fully concrete representation, as captured by the 'RegExp' class. The 'RegExp' class is an instance of the @IsString@ class, which makes writing literal matches easier. The 'RegExp' type also has a (somewhat degenerate) 'Num' instance: Concatenation corresponds to multiplication, union corresponds to addition, and @0@ corresponds to the empty language. Note that since `match` is a method of 'RegExpMatchable' class, both 'SChar' and 'SString' can be used as an argument for matching. In practice, this means you might have to disambiguate with a type-ascription if it is not deducible from context. >>> prove $ \s -> (s :: SString) `match` "hello" .<=> s .== "hello" Q.E.D. >>> prove $ \s -> s `match` Loop 2 5 "xyz" .=> length s .>= 6 Q.E.D. >>> prove $ \s -> s `match` Loop 2 5 "xyz" .=> length s .<= 15 Q.E.D. >>> prove $ \s -> match s (Loop 2 5 "xyz") .=> length s .>= 7 Falsifiable. Counter-example: s0 = "xyzxyz" :: String >>> prove $ \s -> (s :: SString) `match` "hello" .=> s `match` ("hello" + "world") Q.E.D. >>> prove $ \s -> sNot $ (s::SString) `match` ("so close" * 0) Q.E.D. >>> prove $ \c -> (c :: SChar) `match` oneOf "abcd" .=> ord c .>= ord (literal 'a') .&& ord c .<= ord (literal 'd') Q.E.D. -}