{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE ViewPatterns #-} {-# LANGUAGE RecordWildCards #-} ----------------------------------------------------------------------------- -- | -- Module : Text.Regex.PCRE.Rex -- Copyright : (c) Michael Sloan 2011 -- -- Maintainer : Michael Sloan (mgsloan@gmail.com) -- Stability : unstable -- Portability : unportable -- -- This module provides a template Haskell quasiquoter for regular expressions, -- which provides the following features: -- -- 1) Compile-time checking that the regular expression is valid. -- -- 2) Arity of resulting tuple based on the number of selected capture patterns -- in the regular expression. -- -- 3) Allows for the inline interpolation of mapping functions :: String -> a. -- -- 4) Precompiles the regular expression at compile time, by calling into the -- PCRE library and storing a 'ByteString' literal representation of its state. -- -- 5) Compile-time configurable to use different PCRE options, turn off -- precompilation, use 'ByteString's, or set a default mapping expression. -- -- Inspired by Matt Morrow's regexqq package: -- . -- -- And some code from Erik Charlebois's interpolatedstring-qq package: -- . -- ----------------------------------------------------------------------------- module Text.Regex.PCRE.Rex ( -- * Language Extensions -- | -- Since this is a quasiquoter library that generates code using view patterns, -- the following extensions are required: -- -- > {-# LANGUAGE TemplateHaskell, QuasiQuotes, ViewPatterns #-} -- * First Example -- | -- Here's an example which parses peano numbers of the form Z, S Z, S S Z, etc. -- The \s+ means that it is not sensitive to the quantity or type of separating -- whitespace. These examples can also be found in Test.hs. -- -- > peano :: String -> Maybe Int -- > peano = [rex|^(?{ length . filter (=='S') } \s* (?:S\s+)*Z)\s*$|] -- -- > *Main> peano "Z" -- > Just 0 -- > *Main> peano "S Z" -- > Just 1 -- > *Main> peano "S S Z" -- > Just 2 -- > *Main> peano "S S S Z" -- > Just 3 -- > *Main> peano "invalid" -- > Nothing -- -- The token \"(?{\" introduces a capture group which has a mapping applied to -- the result. In this case, it's @length . filter (=='S')@. If the ?{ ... } -- are omitted, then the capture group is not taken as part of the results of -- the match. If the contents of the ?{ ... } is omitted, then a call to -- 'rexView' is assumed: -- -- > parsePair :: String -> Maybe (String, String) -- > parsePair = [rex|^<\s* (?{ }[^\s,>]+) \s*,\s* (?{ }[^\s,>]+) \s*>$|] -- -- The 'rexView' exported by this module is just equal to 'id', so by default -- no preprocessing is done. However, we can shadow this locally: -- -- > parsePair' :: String -> Maybe (Int, Int) -- > parsePair' = [rex|^<\s* (?{ }[^\s,>]+) \s*,\s* (?{ }[^\s,>]+) \s*>$|] -- > where -- > rexView = read -- -- Additional shorthands can be added by using 'rexWithConf' and specifying -- custom values for 'rexPreprocessExp' or 'rexPreprocessPat'. -- * Second Example -- | -- This example is derived from -- http://www.regular-expressions.info/dates.html -- -- > parseDate :: String -> Maybe (Int, Int, Int) -- > parseDate [rex|^(?{ read -> y }(?:19|20)\d\d)[- /.] -- > (?{ read -> m }0[1-9]|1[012])[- /.] -- > (?{ read -> d }0[1-9]|[12][0-9]|3[01])$|] -- > | (d > 30 && (m `elem` [4, 6, 9, 11])) -- > || (m == 2 && -- > (d == 29 && not (mod y 4 == 0 && (mod y 100 /= 0 || mod y 400 == 0))) -- > || (d > 29)) = Nothing -- > | otherwise = Just (y, m, d) -- > parseDate _ = Nothing -- -- The above example makes use of the regex quasi-quoter as a pattern matcher. -- The interpolated Haskell patterns are used to construct an implicit view -- pattern out of the inlined ones. The above pattern is expanded to the -- equivalent: -- -- > parseDate ([rex|^(?{ read }(?:19|20)\d\d)[- /.] -- > (?{ read }0[1-9]|1[012])[- /.] -- > (?{ read }0[1-9]|[12][0-9]|3[01])$|] -- > -> Just (y, m, d)) -- * ByteStrings vs Strings -- | -- Since pcre-light is a wrapper over a C API, the most efficient interface is -- ByteStrings, as it does not natively speak Haskell lists. The [rex| ... ] -- quasiquoter implicitely packs the input into a bystestring, and unpacks the -- results to strings before providing them to your mappers. The 'brex' -- 'QuasiQuoter' is provided for this purpose. You can also define your own -- 'QuasiQuoter' - the definitions of the default configurations are as follows: -- -- > rex = rexWithConf $ defaultRexConf -- > brex = rexWithConf $ defaultRexConf { rexByteString = True } -- > -- > defaultRexConf = RexConf False True "id" [PCRE.extended] [] -- -- The first @False@ specifies to use @String@ rather than 'ByteString'. The -- @True@ argument specifies to use precompilation. -- The -- string following is the default mapping expression, used when omitted. -- Due to GHC staging restrictions, your configuration will need to be in a -- different module than its usage. -- * Future Work -- | -- There are a few things that could potentially be improved: -- -- 1) PCRE captures, unlike .NET regular expressions, yield the last capture -- made by a particular pattern. So, for example, (...)*, will only yield one -- match for '...'. Ideally these would be detected and yield an implicit [a]. -- -- 2) Patterns with disjunction between captures ((?{f}a) | (?{g}b)) will -- provide the empty string to one of f / g. In the case of pattern -- expressions, it would be convenient to be able to map multiple captures into -- a single variable / pattern, preferring the first non-empty option. -- * Quasiquoters rex, brex -- * Configurable QuasiQuoter , rexWithConf, RexConf(..), defaultRexConf -- * Utilities , makeQuasiMultiline , eitherToParseResult , parseExp , parsePat , rexParseMode -- * Used by the generated code , padRight, rexView ) where import Text.Regex.PCRE.Precompile import qualified Text.Regex.PCRE.Light as PCRE import Control.Applicative ( (<$>) ) import Control.Arrow ( first ) import Data.ByteString.Char8 ( pack, unpack, empty ) import Data.Either ( partitionEithers ) import Data.Maybe ( catMaybes, fromJust, isJust ) import Data.Char ( isSpace ) import System.IO.Unsafe ( unsafePerformIO ) import Language.Haskell.TH (Exp(..), ExpQ, Pat(..), PatQ, Lit(..), mkName, runIO) import Language.Haskell.TH.Quote import Language.Haskell.Meta (toExp,toPat) import Language.Haskell.Exts.Extension (Extension(..), KnownExtension(..)) import Language.Haskell.Exts (parseExpWithMode, parsePatWithMode, ParseMode, defaultParseMode, extensions, ParseResult(..)) import Language.Haskell.Exts.SrcLoc (noLoc) {- TODO: * Target Text.Regex.Base ? * Add unit tests -} data RexConf = RexConf { -- | When @True@, the input type is a ByteString, otherwise, it's a String. rexByteString :: Bool, -- | When @True@, the regex is precompiled. rexCompiled :: Bool, -- | Preprocess the string used in expression antiquotes. 'defaultRexConf' -- just passes through the string unaltered, unless it just consists of -- whitespace. When it's all whitespace, @"rexView"@ is used. rexPreprocessExp :: String -> String, -- | Preprocess the string used in pattern antiquotes. 'defaultRexConf' -- adds parenthesis around the string, so that view patterns will parse -- without requiring parenthesis around them. rexPreprocessPat :: String -> String, -- | When a pattern match doesn't have a view pattern, this expression is -- used to preprocess it before matching. When 'defaultRexConf' is used, -- perhaps via 'rex' or 'brex', a reference to @rexView@ is used. -- -- The 'rexView' exported by this module is 'id', so by default no -- preprocessing is done before rexViewExp :: Exp, -- | Options used when compiling PCRE regular expressions. rexPCREOpts :: [PCRE.PCREOption], -- | Options used when executing PCRE regular expressions. rexPCREExecOpts :: [PCRE.PCREExecOption] } -- | Default rex configuration, which specifies that the regexes operate on -- strings, don't post-process the matched patterns, and use 'PCRE.extended'. -- This setting causes whitespace to be non-semantic, and ignores # comments. defaultRexConf :: RexConf defaultRexConf = RexConf { rexByteString = False , rexCompiled = True , rexPreprocessExp = \s -> if all isSpace s then "rexView" else s , rexPreprocessPat = \s -> "(" ++ s ++ ")" , rexViewExp = VarE (mkName "rexView") , rexPCREOpts = [PCRE.extended] , rexPCREExecOpts = [] } -- | Rex quasiquoter which takes 'String' as input, and uses 'defaultRexConf' -- for its configuration. Can be used in expressions and patterns. rex :: QuasiQuoter rex = rexWithConf defaultRexConf -- | Rex quasiquoter which takes 'ByteString' as input, and otherwise uses -- 'defaultRexConf' for its configuration. Can be used in expressions and -- patterns. brex :: QuasiQuoter brex = rexWithConf defaultRexConf { rexByteString = True } -- | This is a 'QuasiQuoter' transformer, which allows for a whitespace- -- sensitive quasi-quoter to be broken over multiple lines. The default 'rex' -- and 'brex' functions do not need this as they are already whitespace -- insensitive. However, if you create your own configuration, which omits the -- 'PCRE.extended' parameter, then this could be useful. The leading space of -- each line is ignored, and all newlines removed. makeQuasiMultiline :: QuasiQuoter -> QuasiQuoter makeQuasiMultiline (QuasiQuoter a b c d) = QuasiQuoter (a . pre) (b . pre) (c . pre) (d . pre) where pre = concat . (\(x:xs) -> x : map (dropWhile isSpace) xs) . lines -- | A configureable regular-expression QuasiQuoter. Takes the options to pass -- to the PCRE engine, along with 'Bool's to flag 'ByteString' usage and -- non-compilation respecively. The provided 'String' indicates which mapping -- function to use, when one is omitted - \"(?{} ...)\". rexWithConf :: RexConf -> QuasiQuoter rexWithConf conf = QuasiQuoter (makeExp conf . parseRex) (makePat conf . parseRex) undefined undefined -- Template Haskell Code Generation -------------------------------------------------------------------------------- -- Creates the template haskell Exp which corresponds to the parsed interpolated -- regex. This particular code mainly just handles making "read" the -- default for captures which lack a parser definition, and defaulting to making -- the parser that doesn't exist makeExp :: RexConf -> ParseChunks -> ExpQ makeExp conf (cnt, pat, exs) = buildExp conf cnt pat $ flip map exs $ fmap $ fromParseOk "While parsing expression antiquote" . parseExp . rexPreprocessExp conf -- Creates the template haskell Pat which corresponds to the parsed interpolated -- regex. As well as handling the aforementioned defaulting considerations, this -- turns per-capture view patterns into a single tuple-resulting view pattern. -- -- E.g. [reg| ... (?{e1 -> v1} ...) ... (?{e2 -> v2} ...) ... |] becomes -- [reg| ... (?{e1} ...) ... (?{e2} ...) ... |] -> (v1, v2) makePat :: RexConf -> ParseChunks -> PatQ makePat conf (cnt, pat, exs) = do viewExp <- buildExp conf cnt pat $ map (fmap fst) views return . ViewP viewExp . (\xs -> ConP 'Just [TupP xs]) . map snd $ catMaybes views where views :: [Maybe (Exp, Pat)] views = map (fmap processView) exs processView :: String -> (Exp, Pat) processView xs = case parsePat (rexPreprocessPat conf xs) of ParseOk (ParensP (ViewP e p)) -> (e,p) ParseOk p -> (rexViewExp conf, p) ParseFailed _ b -> error b -- Here's where the main meat of the template haskell is generated. Given the -- number of captures, the pattern string, and a list of capture expressions, -- yields the template Haskell Exp which parses a string into a tuple. buildExp :: RexConf -> Int -> String -> [Maybe Exp] -> ExpQ buildExp RexConf{..} cnt pat xs = [| let r = $(get_regex) in $(process) . (flip $ PCRE.match r) $(liftRS rexPCREExecOpts) . $(if rexByteString then [| id |] else [| pack |]) |] where liftRS x = [| read shown |] where shown = show x get_regex | rexCompiled = [| unsafePerformIO (regexFromTable $! $(table_bytes)) |] | otherwise = [| PCRE.compile (pack pat) $(liftRS pcreOpts) |] table_bytes = [| pack $(LitE . StringL . unpack <$> runIO table_string) |] table_string = fromJust' "Error while getting PCRE compiled representation\n" <$> precompile (pack pat) pcreOpts pcreOpts = rexPCREOpts process = case (null vs, rexByteString) of (True, _) -> [| fmap ( const () ) |] (_, False) -> [| fmap ($(return maps) . padRight "" pad . map unpack) |] (_, True) -> [| fmap ($(return maps) . padRight empty pad) |] pad = cnt + 2 maps = LamE [ListP . (WildP:) $ map VarP vs] . TupE . map (uncurry AppE) -- filter out all "Nothing" exprs . map (first fromJust) . filter (isJust . fst) -- [(Expr, Variable applied to)] . zip xs $ map VarE vs vs = [mkName $ "v" ++ show i | i <- [0..cnt]] -- | Converts @Left@ to @'ParseFailed' 'noLoc'@, and a @Right@ to @'ParseOk'@. eitherToParseResult :: Either String a -> ParseResult a eitherToParseResult (Left err) = ParseFailed noLoc err eitherToParseResult (Right x) = ParseOk x -- | Parse a Haskell expression into a Template Haskell Exp. parseExp :: String -> ParseResult Exp parseExp = fmap toExp . parseExpWithMode rexParseMode -- | Parse a Haskell pattern match into a Template Haskell Pat. parsePat :: String -> ParseResult Pat parsePat = fmap toPat . parsePatWithMode rexParseMode -- | Parse mode used by 'parseExp' and 'parsePat'. rexParseMode :: ParseMode rexParseMode = defaultParseMode { extensions = map EnableExtension exts } where -- probably the quasiquote should have access to the pragmas in the current -- file, but for now just enable some common extensions that do not steal -- much syntax exts = [ ViewPatterns , ImplicitParams , RecordPuns , RecordWildCards , ScopedTypeVariables , TupleSections , TypeFamilies , TypeOperators ] -- Parsing -------------------------------------------------------------------------------- type ParseChunk = Either String (Maybe String) type ParseChunks = (Int, String, [Maybe String]) -- Postprocesses the results of the chunk-wise parse output, into the pattern to -- be pased to the regex engine, with the interpolated patterns / expressions. parseRex :: String -> ParseChunks parseRex xs = (cnt, concat chunks, quotes) where (chunks, quotes) = partitionEithers results (cnt, results) = parseRegex (filter (`notElem` "\r\n") xs) "" (-1) -- A pair of mutually-recursive functions, one for processing the quotation -- and the other for the anti-quotation. parseRegex :: String -> String -> Int -> (Int, [ParseChunk]) parseRegex inp s ix = case inp of -- Disallow branch-reset capture. ('(':'?':'|':_) -> error "Branch reset pattern (?| not allowed in quasi-quoted regex." -- Ignore non-capturing parens / handle backslash escaping. ('\\':'\\' :xs) -> parseRegex xs ("\\\\" ++ s) ix ('\\':'(' :xs) -> parseRegex xs (")\\" ++ s) ix ('\\':')' :xs) -> parseRegex xs ("(\\" ++ s) ix ('(':'?':':':xs) -> parseRegex xs (":?(" ++ s) ix -- Anti-quote for processing a capture group. ('(':'?':'{':xs) -> mapSnd ((Left $ reverse ('(':s)) :) $ parseAntiquote xs "" (ix + 1) -- Keep track of how many capture groups we've seen. ('(':xs) -> mapSnd (Right Nothing :) $ parseRegex xs ('(':s) (ix + 1) -- Consume the regular expression contents. (x:xs) -> parseRegex xs (x:s) ix [] -> (ix, [Left $ reverse s]) parseAntiquote :: String -> String -> Int -> (Int, [ParseChunk]) parseAntiquote inp s ix = case inp of -- Escape } in the Haskell splice using a backslash. ('\\':'}':xs) -> parseAntiquote xs ('}':s) ix -- Capture accumulated antiquote, and continue parsing regex literal. ('}':xs) -> mapSnd ((Right (Just (reverse s))):) $ parseRegex xs "" ix -- Consume the antiquoute contents, appending to a reverse accumulator. (x:xs) -> parseAntiquote xs (x:s) ix [] -> error "Rex haskell splice terminator, }, never found" -- Utils -------------------------------------------------------------------------------- -- | Given a desired list-length, if the passed list is too short, it is padded -- with the given element. Otherwise, it trims. padRight :: a -> Int -> [a] -> [a] padRight _ 0 _ = [] padRight v i [] = replicate i v padRight v i (x:xs) = x : padRight v (i-1) xs -- | A default view function used when expression antiquotes are empty, or when -- pattern antiquotes omit a view pattern. See the documentation for -- 'rexPreprocessPat' and 'rexPreprocessExp' for more details. -- -- You can locally shadow this 'rexView' with your own version, if you wish. -- One good option is readMay from the safe package: -- . -- -- The type of this identity rexView is fully polymorphic so that it can be -- used with either 'String' or 'ByteString'. rexView :: a -> a rexView = id mapSnd :: (t -> t2) -> (t1, t) -> (t1, t2) mapSnd f (x, y) = (x, f y) fromJust' :: String -> Maybe a -> a fromJust' msg Nothing = error msg fromJust' _ (Just x) = x fromParseOk :: Show a => String -> ParseResult a -> a fromParseOk _ (ParseOk x) = x fromParseOk msg err = error $ msg ++ ": " ++ show err