Text/Regex/PCRE/Rex.hs

{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE RecordWildCards #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Regex.PCRE.Rex
-- Copyright   :  (c) Michael Sloan 2011
--
-- Maintainer  :  Michael Sloan (mgsloan@gmail.com)
-- Stability   :  unstable
-- Portability :  unportable
--
-- This module provides a template Haskell quasiquoter for regular expressions,
-- which provides the following features:
--
-- 1) Compile-time checking that the regular expression is valid.
--
-- 2) Arity of resulting tuple based on the number of selected capture patterns
-- in the regular expression.
--
-- 3) Allows for the inline interpolation of mapping functions :: String -> a.
--
-- 4) Precompiles the regular expression at compile time, by calling into the
-- PCRE library and storing a 'ByteString' literal representation of its state.
--
-- 5) Compile-time configurable to use different PCRE options, turn off
-- precompilation, use 'ByteString's, or set a default mapping expression.
--
-- Inspired by Matt Morrow's regexqq package:
-- <http://hackage.haskell.org/package/regexqq/docs/Text-Regex-PCRE-QQ.html>.
--
-- And some code from Erik Charlebois's interpolatedstring-qq package:
-- <http://hackage.haskell.org/package/interpolatedstring-qq/>.
--
-----------------------------------------------------------------------------

module Text.Regex.PCRE.Rex
  (

-- * Language Extensions
-- |
-- Since this is a quasiquoter library that generates code using view patterns,
-- the following extensions are required:
--
-- > {-# LANGUAGE TemplateHaskell, QuasiQuotes, ViewPatterns #-}

-- * First Example
-- |
-- Here's an example which parses peano numbers of the form Z, S Z, S S Z, etc.
-- The \s+ means that it is not sensitive to the quantity or type of separating
-- whitespace.  These examples can also be found in Test.hs.
--
-- > peano :: String -> Maybe Int
-- > peano = [rex|^(?{ length . filter (=='S') } \s* (?:S\s+)*Z)\s*$|]
--
-- > *Main> peano "Z"
-- > Just 0
-- > *Main> peano "S Z"
-- > Just 1
-- > *Main> peano "S   S Z"
-- > Just 2
-- > *Main> peano "S S S Z"
-- > Just 3
-- > *Main> peano "invalid"
-- > Nothing
--
-- The token \"(?{\" introduces a capture group which has a mapping applied to
-- the result. In this case, it's @length . filter (=='S')@.  If the ?{ ... }
-- are omitted, then the capture group is not taken as part of the results of
-- the match.  If the contents of the ?{ ... } is omitted, then a call to
-- 'rexView' is assumed:
--
-- > parsePair :: String -> Maybe (String, String)
-- > parsePair = [rex|^<\s* (?{ }[^\s,>]+) \s*,\s* (?{ }[^\s,>]+) \s*>$|]
--
-- The 'rexView' exported by this module is just equal to 'id', so by default
-- no preprocessing is done.  However, we can shadow this locally:
--
-- > parsePair' :: String -> Maybe (Int, Int)
-- > parsePair' = [rex|^<\s* (?{ }[^\s,>]+) \s*,\s* (?{ }[^\s,>]+) \s*>$|]
-- >   where
-- >     rexView = read
--
-- Additional shorthands can be added by using 'rexWithConf' and specifying
-- custom values for 'rexPreprocessExp' or 'rexPreprocessPat'.

-- * Second Example
-- |
-- This example is derived from
-- http://www.regular-expressions.info/dates.html
--
-- > parseDate :: String -> Maybe (Int, Int, Int)
-- > parseDate [rex|^(?{ read -> y }(?:19|20)\d\d)[- /.]
-- >                 (?{ read -> m }0[1-9]|1[012])[- /.]
-- >                 (?{ read -> d }0[1-9]|[12][0-9]|3[01])$|]
-- >   |  (d > 30 && (m `elem` [4, 6, 9, 11]))
-- >   || (m == 2 &&
-- >       (d == 29 && not (mod y 4 == 0 && (mod y 100 /= 0 || mod y 400 == 0)))
-- >      || (d > 29)) = Nothing
-- >   | otherwise = Just (y, m, d)
-- > parseDate _ = Nothing
--
-- The above example makes use of the regex quasi-quoter as a pattern matcher.
-- The interpolated Haskell patterns are used to construct an implicit view
-- pattern out of the inlined ones.  The above pattern is expanded to the
-- equivalent:
--
-- > parseDate ([rex|^(?{ read }(?:19|20)\d\d)[- /.]
-- >                  (?{ read }0[1-9]|1[012])[- /.]
-- >                  (?{ read }0[1-9]|[12][0-9]|3[01])$|]
-- >           -> Just (y, m, d))

-- * ByteStrings vs Strings
-- |
-- Since pcre-light is a wrapper over a C API, the most efficient interface is
-- ByteStrings, as it does not natively speak Haskell lists.  The [rex| ... ]
-- quasiquoter implicitely packs the input into a bystestring, and unpacks the
-- results to strings before providing them to your mappers.  The 'brex'
-- 'QuasiQuoter' is provided for this purpose.  You can also define your own
-- 'QuasiQuoter' - the definitions of the default configurations are as follows:
--
-- > rex  = rexWithConf $ defaultRexConf
-- > brex = rexWithConf $ defaultRexConf { rexByteString = True }
-- >
-- > defaultRexConf = RexConf False True "id" [PCRE.extended] []
--
-- The first @False@ specifies to use @String@ rather than 'ByteString'.  The
-- @True@ argument specifies to use precompilation.  --  The
-- string following is the default mapping expression, used when omitted.
-- Due to GHC staging restrictions, your configuration will need to be in a
-- different module than its usage.

-- * Future Work
-- |
-- There are a few things that could potentially be improved:
--
-- 1) PCRE captures, unlike .NET regular expressions, yield the last capture
-- made by a particular pattern.  So, for example, (...)*, will only yield one
-- match for '...'.  Ideally these would be detected and yield an implicit [a].
--
-- 2) Patterns with disjunction between captures ((?{f}a) | (?{g}b)) will
-- provide the empty string to one of f / g.  In the case of pattern
-- expressions, it would be convenient to be able to map multiple captures into
-- a single variable / pattern, preferring the first non-empty option.

-- * Quasiquoters
    rex, brex
-- * Configurable QuasiQuoter
  , rexWithConf, RexConf(..), defaultRexConf
-- * Utilities
  , makeQuasiMultiline
  , eitherToParseResult
  , parseExp
  , parsePat
  , rexParseMode
-- * Used by the generated code
  , padRight, rexView
  ) where

import Text.Regex.PCRE.Precompile

import qualified Text.Regex.PCRE.Light as PCRE

import Control.Applicative   ( (<$>) )
import Control.Arrow         ( first )
import Data.ByteString.Char8 ( pack, unpack, empty )
import Data.Either           ( partitionEithers )
import Data.Maybe            ( catMaybes, fromJust, isJust )
import Data.Char             ( isSpace )
import System.IO.Unsafe      ( unsafePerformIO )

import Language.Haskell.TH (Exp(..), ExpQ, Pat(..), PatQ, Lit(..),
                            mkName, runIO)
import Language.Haskell.TH.Quote
import Language.Haskell.Meta (toExp,toPat)
import Language.Haskell.Exts.Extension (Extension(..), KnownExtension(..))
import Language.Haskell.Exts (parseExpWithMode, parsePatWithMode,
                              ParseMode, defaultParseMode, extensions,
                              ParseResult(..))
import Language.Haskell.Exts.SrcLoc (noLoc)

{- TODO:
  * Target Text.Regex.Base ?
  * Add unit tests
-}

data RexConf = RexConf {
  -- | When @True@, the input type is a ByteString, otherwise, it's a String.
  rexByteString :: Bool,
  -- | When @True@, the regex is precompiled.
  rexCompiled :: Bool,
  -- | Preprocess the string used in expression antiquotes.  'defaultRexConf'
  --   just passes through the string unaltered, unless it just consists of
  --   whitespace.  When it's all whitespace, @"rexView"@ is used.
  rexPreprocessExp :: String -> String,
  -- | Preprocess the string used in pattern antiquotes. 'defaultRexConf'
  --   adds parenthesis around the string, so that view patterns will parse
  --   without requiring parenthesis around them.
  rexPreprocessPat :: String -> String,
  -- | When a pattern match doesn't have a view pattern, this expression is
  --   used to preprocess it before matching.  When 'defaultRexConf' is used,
  --   perhaps via 'rex' or 'brex', a reference to @rexView@ is used.
  --
  --   The 'rexView' exported by this module is 'id', so by default no
  --   preprocessing is done before
  rexViewExp :: Exp,
  -- | Options used when compiling PCRE regular expressions.
  rexPCREOpts :: [PCRE.PCREOption],
  -- | Options used when executing PCRE regular expressions.
  rexPCREExecOpts :: [PCRE.PCREExecOption]
  }

-- | Default rex configuration, which specifies that the regexes operate on
--   strings, don't post-process the matched patterns, and use 'PCRE.extended'.
--   This setting causes whitespace to be non-semantic, and ignores # comments.
defaultRexConf :: RexConf
defaultRexConf = RexConf
  { rexByteString = False
  , rexCompiled = True
  , rexPreprocessExp = \s -> if all isSpace s then "rexView" else s
  , rexPreprocessPat = \s -> "(" ++ s ++ ")"
  , rexViewExp = VarE (mkName "rexView")
  , rexPCREOpts = [PCRE.extended]
  , rexPCREExecOpts = []
  }

-- | Rex quasiquoter which takes 'String' as input, and uses 'defaultRexConf'
--   for its configuration.  Can be used in expressions and patterns.
rex :: QuasiQuoter
rex  = rexWithConf defaultRexConf

-- | Rex quasiquoter which takes 'ByteString' as input, and otherwise uses
--  'defaultRexConf' for its configuration.  Can be used in expressions and
--  patterns.
brex :: QuasiQuoter
brex = rexWithConf defaultRexConf { rexByteString = True }

-- | This is a 'QuasiQuoter' transformer, which allows for a whitespace-
--   sensitive quasi-quoter to be broken over multiple lines.  The default 'rex'
--   and 'brex' functions do not need this as they are already whitespace
--   insensitive. However, if you create your own configuration, which omits the
--   'PCRE.extended' parameter, then this could be useful. The leading space of
--   each line is ignored, and all newlines removed.
makeQuasiMultiline :: QuasiQuoter -> QuasiQuoter
makeQuasiMultiline (QuasiQuoter a b c d) =
    QuasiQuoter (a . pre) (b . pre) (c . pre) (d . pre)
  where
    pre = concat . (\(x:xs) -> x : map (dropWhile isSpace) xs) . lines

-- | A configureable regular-expression QuasiQuoter.  Takes the options to pass
--   to the PCRE engine, along with 'Bool's to flag 'ByteString' usage and
--   non-compilation respecively.  The provided 'String' indicates which mapping
--   function to use, when one is omitted - \"(?{} ...)\".
rexWithConf :: RexConf -> QuasiQuoter
rexWithConf conf =
  QuasiQuoter
    (makeExp conf . parseRex)
    (makePat conf . parseRex)
    undefined
    undefined

-- Template Haskell Code Generation
--------------------------------------------------------------------------------

-- Creates the template haskell Exp which corresponds to the parsed interpolated
-- regex.  This particular code mainly just handles making "read" the
-- default for captures which lack a parser definition, and defaulting to making
-- the parser that doesn't exist
makeExp :: RexConf -> ParseChunks -> ExpQ
makeExp conf (cnt, pat, exs) =
  buildExp conf cnt pat $ flip map exs $ fmap $
    fromParseOk "While parsing expression antiquote"
    . parseExp
    . rexPreprocessExp conf

-- Creates the template haskell Pat which corresponds to the parsed interpolated
-- regex. As well as handling the aforementioned defaulting considerations, this
-- turns per-capture view patterns into a single tuple-resulting view pattern.
--
-- E.g. [reg| ... (?{e1 -> v1} ...) ... (?{e2 -> v2} ...) ... |] becomes
--      [reg| ... (?{e1} ...) ... (?{e2} ...) ... |] -> (v1, v2)
makePat :: RexConf -> ParseChunks -> PatQ
makePat conf (cnt, pat, exs) = do
  viewExp <- buildExp conf cnt pat $ map (fmap fst) views
  return . ViewP viewExp
         . (\xs -> ConP 'Just [TupP xs])
         . map snd $ catMaybes views
 where
  views :: [Maybe (Exp, Pat)]
  views = map (fmap processView) exs

  processView :: String -> (Exp, Pat)
  processView xs = case parsePat (rexPreprocessPat conf xs) of
    ParseOk (ParensP (ViewP e p)) -> (e,p)
    ParseOk p -> (rexViewExp conf, p)
    ParseFailed _ b -> error b

-- Here's where the main meat of the template haskell is generated.  Given the
-- number of captures, the pattern string, and a list of capture expressions,
-- yields the template Haskell Exp which parses a string into a tuple.
buildExp :: RexConf -> Int -> String -> [Maybe Exp] -> ExpQ
buildExp RexConf{..} cnt pat xs =
    [| let r = $(get_regex) in
       $(process) . (flip $ PCRE.match r) $(liftRS rexPCREExecOpts)
     . $(if rexByteString then [| id |] else [| pack |]) |]
  where
    liftRS x = [| read shown |] where shown = show x

    get_regex
      | rexCompiled = [| unsafePerformIO (regexFromTable $! $(table_bytes)) |]
      | otherwise = [| PCRE.compile (pack pat) $(liftRS pcreOpts) |]
    table_bytes = [| pack $(LitE . StringL . unpack <$> runIO table_string) |]
    table_string =
      fromJust' "Error while getting PCRE compiled representation\n" <$>
      precompile (pack pat) pcreOpts
    pcreOpts = rexPCREOpts

    process = case (null vs, rexByteString) of
      (True, _)  -> [| fmap ( const () ) |]
      (_, False) -> [| fmap ($(return maps) . padRight "" pad . map unpack) |]
      (_, True)  -> [| fmap ($(return maps) . padRight empty pad) |]
    pad = cnt + 2
    maps = LamE [ListP . (WildP:) $ map VarP vs]
         . TupE . map (uncurry AppE)
         -- filter out all "Nothing" exprs
         . map (first fromJust) . filter (isJust . fst)
         -- [(Expr, Variable applied to)]
         . zip xs $ map VarE vs
    vs = [mkName $ "v" ++ show i | i <- [0..cnt]]

-- | Converts @Left@ to @'ParseFailed' 'noLoc'@, and a @Right@ to @'ParseOk'@.
eitherToParseResult :: Either String a -> ParseResult a
eitherToParseResult (Left err) = ParseFailed noLoc err
eitherToParseResult (Right x) = ParseOk x

-- | Parse a Haskell expression into a Template Haskell Exp.
parseExp :: String -> ParseResult Exp
parseExp = fmap toExp . parseExpWithMode rexParseMode

-- | Parse a Haskell pattern match into a Template Haskell Pat.
parsePat :: String -> ParseResult Pat
parsePat = fmap toPat . parsePatWithMode rexParseMode

-- | Parse mode used by 'parseExp' and 'parsePat'.
rexParseMode :: ParseMode
rexParseMode = defaultParseMode { extensions = map EnableExtension exts }
  where
    -- probably the quasiquote should have access to the pragmas in the current
    -- file, but for now just enable some common extensions that do not steal
    -- much syntax
    exts =
      [ ViewPatterns
      , ImplicitParams
      , RecordPuns
      , RecordWildCards
      , ScopedTypeVariables
      , TupleSections
      , TypeFamilies
      , TypeOperators
      ]

-- Parsing
--------------------------------------------------------------------------------

type ParseChunk = Either String (Maybe String)
type ParseChunks = (Int, String, [Maybe String])

-- Postprocesses the results of the chunk-wise parse output, into the pattern to
-- be pased to the regex engine, with the interpolated patterns / expressions.
parseRex :: String -> ParseChunks
parseRex xs = (cnt, concat chunks, quotes)
  where
    (chunks, quotes) = partitionEithers results
    (cnt, results) = parseRegex (filter (`notElem` "\r\n") xs) "" (-1)

-- A pair of mutually-recursive functions, one for processing the quotation
-- and the other for the anti-quotation.

parseRegex :: String -> String -> Int -> (Int, [ParseChunk])
parseRegex inp s ix = case inp of
  -- Disallow branch-reset capture.
  ('(':'?':'|':_) ->
    error "Branch reset pattern (?| not allowed in quasi-quoted regex."

  -- Ignore non-capturing parens / handle backslash escaping.
  ('\\':'\\'  :xs) -> parseRegex xs ("\\\\" ++ s) ix
  ('\\':'('   :xs) -> parseRegex xs (")\\"  ++ s) ix
  ('\\':')'   :xs) -> parseRegex xs ("(\\"  ++ s) ix
  ('(':'?':':':xs) -> parseRegex xs (":?("  ++ s) ix

  -- Anti-quote for processing a capture group.
  ('(':'?':'{':xs) -> mapSnd ((Left $ reverse ('(':s)) :)
                    $ parseAntiquote xs "" (ix + 1)

  -- Keep track of how many capture groups we've seen.
  ('(':xs) -> mapSnd (Right Nothing :)
            $ parseRegex xs ('(':s) (ix + 1)

  -- Consume the regular expression contents.
  (x:xs) -> parseRegex xs (x:s) ix
  [] -> (ix, [Left $ reverse s])

parseAntiquote :: String -> String -> Int -> (Int, [ParseChunk])
parseAntiquote inp s ix = case inp of
  -- Escape } in the Haskell splice using a backslash.
  ('\\':'}':xs) -> parseAntiquote xs ('}':s) ix

  -- Capture accumulated antiquote, and continue parsing regex literal.
  ('}':xs) -> mapSnd ((Right (Just (reverse s))):)
            $ parseRegex xs "" ix

  -- Consume the antiquoute contents, appending to a reverse accumulator.
  (x:xs) -> parseAntiquote xs (x:s) ix
  [] -> error "Rex haskell splice terminator, }, never found"

-- Utils
--------------------------------------------------------------------------------

-- | Given a desired list-length, if the passed list is too short, it is padded
-- with the given element.  Otherwise, it trims.
padRight :: a -> Int -> [a] -> [a]
padRight _ 0 _ = []
padRight v i [] = replicate i v
padRight v i (x:xs) = x : padRight v (i-1) xs

-- | A default view function used when expression antiquotes are empty, or when
--   pattern antiquotes omit a view pattern.  See the documentation for
--   'rexPreprocessPat' and 'rexPreprocessExp' for more details.
--
--   You can locally shadow this 'rexView' with your own version, if you wish.
--   One good option is readMay from the safe package:
--   <http://hackage.haskell.org/package/safe/docs/Safe.html#v:readMay>.
--
--   The type of this identity rexView is fully polymorphic so that it can be
--   used with either 'String' or 'ByteString'.
rexView :: a -> a
rexView = id

mapSnd :: (t -> t2) -> (t1, t) -> (t1, t2)
mapSnd f (x, y) = (x, f y)

fromJust' :: String -> Maybe a -> a
fromJust' msg Nothing = error msg
fromJust' _ (Just x) = x

fromParseOk :: Show a => String -> ParseResult a -> a
fromParseOk _ (ParseOk x) = x
fromParseOk msg err = error $ msg ++ ": " ++ show err