{-# LANGUAGE CPP #-}
--------------------------------------------------------------------
-- |
-- Module   : Text.Regex.PCRE.Light
-- Copyright: Copyright (c) 2007-2008, Don Stewart
-- License  : BSD3
--
-- Maintainer:  Don Stewart <dons@galois.com>
-- Stability :  experimental
-- Portability: H98 + CPP
--
--------------------------------------------------------------------
--
-- A simple, portable binding to perl-compatible regular expressions
-- (PCRE) via strict ByteStrings.
--

module Text.Regex.PCRE.Light (

        -- * The abstract PCRE Regex type
          Regex

        -- * ByteString interface
        , compile, compileM
        , match
        , captureCount
        , captureNames

        -- * Regex types and constructors externally visible

        -- ** PCRE compile-time bit flags
        , PCREOption

        , anchored
        , auto_callout
        {-, bsr_anycrlf-}
        {-, bsr_unicode-}
        , caseless
        , dollar_endonly
        , dotall
        , dupnames
        , extended
        , extra
        , firstline
        , multiline
        {-, newline_any-}
        {-, newline_anycrlf-}
        , newline_cr
        , newline_crlf
        , newline_lf
        , no_auto_capture
        , ungreedy
        , utf8
        , no_utf8_check

        -- ** PCRE exec-time bit flags
        , PCREExecOption

        , exec_anchored
        {-, exec_newline_any     -}
        {-, exec_newline_anycrlf -}
        , exec_newline_cr
        , exec_newline_crlf
        , exec_newline_lf
        , exec_notbol
        , exec_noteol
        , exec_notempty
        , exec_no_utf8_check
        , exec_partial

    ) where

import Text.Regex.PCRE.Light.Base

-- Strings
import qualified Data.ByteString          as S

#if __GLASGOW_HASKELL__ >= 608
import qualified Data.ByteString.Internal as S
import qualified Data.ByteString.Unsafe   as S
#else
import qualified Data.ByteString.Base     as S
#endif

import System.IO.Unsafe (unsafePerformIO)
import Data.List (sortBy)
import Data.Function (on)

-- Foreigns
import Foreign (newForeignPtr, withForeignPtr)
import Foreign.Ptr
import Foreign.C.Types
import Foreign.C.String
import Foreign.Storable
import Foreign.Marshal.Alloc

-- | 'compile'
--
-- Compile a perl-compatible regular expression stored in a strict bytestring.
--
-- An example
--
-- > let r = compile (pack "^(b+|a){1,2}?bc") []
--
-- Or using GHC's -XOverloadedStrings flag, and importing
-- Data.ByteString.Char8, we can avoid the pack:
--
-- > let r = compile "^(b+|a){1,2}?bc" []
--
-- If the regular expression is invalid, an exception is thrown.
-- If this is unsuitable, 'compileM' is availlable, which returns failure
-- in a monad.
--
-- To do case insentive matching,
--
-- > compile "^(b+|a){1,2}?bc" [caseless]
--
-- Other flags are documented below.
--
-- The resulting abstract regular expression can be passed to 'match'
-- for matching against a subject string.
--
-- The arguments are:
--
-- * 'pat': A ByteString containing the regular expression to be compiled.
--
-- * 'flags', optional bit flags. If 'Nothing' is provided, defaults are used.
--
-- Valid compile-time flags are:
--
-- * 'anchored'        - Force pattern anchoring
--
-- * 'auto_callout'    - Compile automatic callouts
--
-- * 'bsr_anycrlf'     - \\R matches only CR, LF, or CRLF
--
-- * 'bsr_unicode'     - \\R matches all Unicode line endings
--
-- * 'caseless'        - Do caseless matching
--
-- * 'dollar_endonly'  - '$' not to match newline at end
--
-- * 'dotall'          - matches anything including NL
--
-- * 'dupnames'        - Allow duplicate names for subpatterns
--
-- * 'extended'        - Ignore whitespace and # comments
--
-- * 'extra'           - PCRE extra features (not much use currently)
--
-- * 'firstline'       - Force matching to be  before  newline
--
-- * 'multiline'       - '^' and '$' match newlines within data
--
-- * 'newline_any'     - Recognize any Unicode newline sequence
--
-- * 'newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'newline_cr'      - Set CR as the newline sequence
--
-- * 'newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'newline_lf'      - Set LF as the newline sequence
--
-- * 'no_auto_capture' - Disable numbered capturing parentheses (named ones available)
--
-- * 'ungreedy'        - Invert greediness of quantifiers
--
-- * 'utf8'            - Run in UTF-8 mode
--
-- * 'no_utf8_check'   - Do not check the pattern for UTF-8 validity
--
-- The regex is allocated via malloc on the C side, and will be
-- deallocated by the runtime when the Haskell value representing it
-- goes out of scope.
--
-- See 'man pcreapi for more details.
--
-- Caveats: patterns with embedded nulls, such as "\0*" seem to be
-- mishandled, as this won't currently match the subject "\0\0\0".
--
compile :: S.ByteString -> [PCREOption] -> Regex
compile :: ByteString -> [PCREOption] -> Regex
compile ByteString
s [PCREOption]
o = case ByteString -> [PCREOption] -> Either String Regex
compileM ByteString
s [PCREOption]
o of
    Right Regex
r -> Regex
r
    Left  String
e -> forall a. HasCallStack => String -> a
error (String
"Text.Regex.PCRE.Light: Error in regex: " forall a. [a] -> [a] -> [a]
++ String
e)

------------------------------------------------------------------------

-- | 'compileM'
-- A safe version of 'compile' with failure wrapped in an Either.
--
-- Examples,
--
-- > > compileM ".*" [] :: Either String Regex
-- > Right (Regex 0x000000004bb5b980 ".*")
--
-- > > compileM "*" [] :: Either String Regex
-- > Left "nothing to repeat"
--
compileM :: S.ByteString -> [PCREOption] -> Either String Regex
compileM :: ByteString -> [PCREOption] -> Either String Regex
compileM ByteString
str [PCREOption]
os = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$
  forall a. ByteString -> (Ptr CChar -> IO a) -> IO a
S.useAsCString ByteString
str forall a b. (a -> b) -> a -> b
$ \Ptr CChar
pattern -> do
    forall a b. Storable a => (Ptr a -> IO b) -> IO b
alloca forall a b. (a -> b) -> a -> b
$ \Ptr (Ptr CChar)
errptr       -> do
      forall a b. Storable a => (Ptr a -> IO b) -> IO b
alloca forall a b. (a -> b) -> a -> b
$ \Ptr CInt
erroffset    -> do
          Ptr PCRE
pcre_ptr <- Ptr CChar
-> PCREOption
-> Ptr (Ptr CChar)
-> Ptr CInt
-> Ptr Word8
-> IO (Ptr PCRE)
c_pcre_compile Ptr CChar
pattern ([PCREOption] -> PCREOption
combineOptions [PCREOption]
os) Ptr (Ptr CChar)
errptr Ptr CInt
erroffset forall a. Ptr a
nullPtr
          if Ptr PCRE
pcre_ptr forall a. Eq a => a -> a -> Bool
== forall a. Ptr a
nullPtr
              then do
                  String
err <- Ptr CChar -> IO String
peekCString forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< forall a. Storable a => Ptr a -> IO a
peek Ptr (Ptr CChar)
errptr
                  forall (m :: * -> *) a. Monad m => a -> m a
return (forall a b. a -> Either a b
Left String
err)
              else do
                  ForeignPtr PCRE
reg <- forall a. FinalizerPtr a -> Ptr a -> IO (ForeignPtr a)
newForeignPtr forall a. FinalizerPtr a
c_pcre_free Ptr PCRE
pcre_ptr
                  forall (m :: * -> *) a. Monad m => a -> m a
return (forall a b. b -> Either a b
Right (ForeignPtr PCRE -> ByteString -> Regex
Regex ForeignPtr PCRE
reg ByteString
str))

-- Possible improvements: an 'IsString' instance could be defined
-- for 'Regex', which would allow the compiler to insert calls to
-- 'compile' based on the type:
--
-- The following would be valid:
--
-- > match "a.*b" "abcdef" []
--
-- and equivalent to:
--
-- > match (either error id (compile "a.*b")) "abcdef" []

-- | 'match'
--
-- Matches a compiled regular expression against a given subject string,
-- using a matching algorithm that is similar to Perl's. If the subject
-- string doesn't match the regular expression, 'Nothing' is returned,
-- otherwise the portion of the string that matched is returned, along
-- with any captured subpatterns.
--
-- The arguments are:
--
-- * 'regex', a PCRE regular expression value produced by compile
--
-- * 'subject', the subject string to match against
--
-- * 'options', an optional set of exec-time flags to exec.
--
-- Available runtime options are:
--
-- * 'exec_anchored'        - Match only at the first position
--
-- * 'exec_newline_any'     - Recognize any Unicode newline sequence
--
-- * 'exec_newline_anycrlf' - Recognize CR, LF, and CRLF as newline sequences
--
-- * 'exec_newline_cr'      - Set CR as the newline sequence
--
-- * 'exec_newline_crlf'    - Set CRLF as the newline sequence
--
-- * 'exec_newline_lf'      - Set LF as the newline sequence
--
-- * 'exec_notbol'          - Subject is not the beginning of a line
--
-- * 'exec_noteol'          - Subject is not the end of a line
--
-- * 'exec_notempty'        - An empty string is not a valid match
--
-- * 'exec_no_utf8_check'   - Do not check the subject for UTF-8
--
-- * 'exec_partial'         - Return PCRE_ERROR_PARTIAL for a partial match
--
-- The result value, and any captured subpatterns, are returned.
-- If the regex is invalid, or the subject string is empty, Nothing
-- is returned.
--
match :: Regex -> S.ByteString -> [PCREExecOption] -> Maybe [S.ByteString]
match :: Regex -> ByteString -> [PCREExecOption] -> Maybe [ByteString]
match (Regex ForeignPtr PCRE
pcre_fp ByteString
_) ByteString
subject [PCREExecOption]
os = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$ do
  forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr PCRE
pcre_fp forall a b. (a -> b) -> a -> b
$ \Ptr PCRE
pcre_ptr -> do
    Int
n_capt <- forall {b}. Num b => Ptr PCRE -> CInt -> IO b
fullInfoInt Ptr PCRE
pcre_ptr CInt
info_capturecount

    -- The smallest  size  for ovector that will allow for n captured
    -- substrings, in addition to the offsets  of  the  substring
    -- matched by the whole pattern, is (n+1)*3. (man pcreapi)

    let ovec_size :: Int
ovec_size = (Int
n_capt forall a. Num a => a -> a -> a
+ Int
1) forall a. Num a => a -> a -> a
* Int
3
        ovec_bytes :: Int
ovec_bytes = Int
ovec_size forall a. Num a => a -> a -> a
* Int
size_of_cint

    forall a b. Int -> (Ptr a -> IO b) -> IO b
allocaBytes Int
ovec_bytes forall a b. (a -> b) -> a -> b
$ \Ptr CInt
ovec -> do

        let (ForeignPtr Word8
str_fp, Int
off, Int
len) = ByteString -> (ForeignPtr Word8, Int, Int)
S.toForeignPtr ByteString
subject
        forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
str_fp forall a b. (a -> b) -> a -> b
$ \Ptr Word8
cstr -> do
            let exec :: Ptr Word8 -> a -> IO CInt
exec Ptr Word8
csub a
clen = Ptr PCRE
-> Ptr PCRE
-> Ptr Word8
-> CInt
-> CInt
-> PCREExecOption
-> Ptr CInt
-> CInt
-> IO CInt
c_pcre_exec
                         Ptr PCRE
pcre_ptr
                         forall a. Ptr a
nullPtr
                         Ptr Word8
csub -- may contain binary zero bytes.
                         (forall a b. (Integral a, Num b) => a -> b
fromIntegral a
clen)
                         CInt
0
                         ([PCREExecOption] -> PCREExecOption
combineExecOptions [PCREExecOption]
os)
                         Ptr CInt
ovec
                         (forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
ovec_size)

            -- An empty ByteString may be represented with a nullPtr.  Passing
            -- a nullPtr to pcre_exec will cause it to return an error, even if
            -- the pattern could succesfully match an empty subject. As a
            -- workaround, allocate a small buffer and pass that to pcre_exec.
            CInt
r <- if Ptr Word8
cstr forall a. Eq a => a -> a -> Bool
== forall a. Ptr a
nullPtr
                then forall a b. Int -> (Ptr a -> IO b) -> IO b
allocaBytes Int
1 forall a b. (a -> b) -> a -> b
$ \Ptr Word8
buf -> forall {a}. Integral a => Ptr Word8 -> a -> IO CInt
exec Ptr Word8
buf Integer
0
                else forall {a}. Integral a => Ptr Word8 -> a -> IO CInt
exec (Ptr Word8
cstr forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
off) Int
len

            if CInt
r forall a. Ord a => a -> a -> Bool
< CInt
0 -- errors, or error_no_match
                then forall (m :: * -> *) a. Monad m => a -> m a
return forall a. Maybe a
Nothing
                else let loop :: CInt -> Int -> [ByteString] -> IO (Maybe [ByteString])
loop CInt
n Int
o [ByteString]
acc =
                            if CInt
n forall a. Eq a => a -> a -> Bool
== CInt
r
                              then forall (m :: * -> *) a. Monad m => a -> m a
return (forall a. a -> Maybe a
Just (forall a. [a] -> [a]
reverse [ByteString]
acc))
                              else do
                                    CInt
i <- forall a. Storable a => Ptr a -> Int -> IO a
peekElemOff Ptr CInt
ovec forall a b. (a -> b) -> a -> b
$! Int
o
                                    CInt
j <- forall a. Storable a => Ptr a -> Int -> IO a
peekElemOff Ptr CInt
ovec (Int
oforall a. Num a => a -> a -> a
+Int
1)
                                    let s :: ByteString
s = CInt -> CInt -> ByteString -> ByteString
substring CInt
i CInt
j ByteString
subject
                                    ByteString
s seq :: forall a b. a -> b -> b
`seq` CInt -> Int -> [ByteString] -> IO (Maybe [ByteString])
loop (CInt
nforall a. Num a => a -> a -> a
+CInt
1) (Int
oforall a. Num a => a -> a -> a
+Int
2) (ByteString
s forall a. a -> [a] -> [a]
: [ByteString]
acc)
                     in CInt -> Int -> [ByteString] -> IO (Maybe [ByteString])
loop CInt
0 Int
0 []

    -- The  first  two-thirds  of ovec is used to pass back captured
    -- substrings When  a  match  is  successful, information about captured
    -- substrings is returned in pairs of integers,  starting  at the
    -- beginning of ovector, and continuing up to two-thirds of its length at
    -- the most.  The first pair, ovector[0] and ovector[1], identify the
    -- portion of the subject string matched  by  the entire pattern.  The next
    -- pair is used for the first capturing subpattern,  and  so on.  The
    -- value returned  by pcre_exec() is one more than the highest num- bered
    -- pair that has been set. For  example,  if  two  sub- strings  have been
    -- captured, the returned value is 3.

  where
    -- The first element of a pair is set  to  the offset of the first
    -- character in a substring, and the second is set to the offset of the
    -- first character after  the  end of a substring.
    substring :: CInt -> CInt -> S.ByteString -> S.ByteString
    substring :: CInt -> CInt -> ByteString -> ByteString
substring CInt
x CInt
y ByteString
_ | CInt
x forall a. Eq a => a -> a -> Bool
== CInt
y = ByteString
S.empty -- XXX an unset subpattern
    substring CInt
a CInt
b ByteString
s = ByteString
end -- note that we're not checking...
        where
            start :: ByteString
start = Int -> ByteString -> ByteString
S.unsafeDrop (forall a b. (Integral a, Num b) => a -> b
fromIntegral CInt
a) ByteString
s
            end :: ByteString
end   = Int -> ByteString -> ByteString
S.unsafeTake (forall a b. (Integral a, Num b) => a -> b
fromIntegral (CInt
bforall a. Num a => a -> a -> a
-CInt
a)) ByteString
start


-- Wrapper around c_pcre_fullinfo for integer values
fullInfoInt :: Ptr PCRE -> CInt -> IO b
fullInfoInt Ptr PCRE
pcre_ptr CInt
what =
  forall a b. Storable a => (Ptr a -> IO b) -> IO b
alloca forall a b. (a -> b) -> a -> b
$ \Ptr CInt
n_ptr -> do
    forall a. Ptr PCRE -> Ptr PCRE -> CInt -> Ptr a -> IO CInt
c_pcre_fullinfo Ptr PCRE
pcre_ptr forall a. Ptr a
nullPtr CInt
what Ptr CInt
n_ptr
    forall (m :: * -> *) a. Monad m => a -> m a
return forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (Integral a, Num b) => a -> b
fromIntegral forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< forall a. Storable a => Ptr a -> IO a
peek (Ptr CInt
n_ptr :: Ptr CInt)


-- | 'captureCount'
--
-- Returns the number of captures in a 'Regex'. Correctly ignores non-capturing groups
-- like @(?:abc)@.
--
-- >>> captureCount (compile "(?<one>abc) (def) (?:non-captured) (?<three>ghi)" [])
-- 3
captureCount :: Regex -> Int
captureCount :: Regex -> Int
captureCount (Regex ForeignPtr PCRE
pcre_fp ByteString
_) = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$
  forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr PCRE
pcre_fp forall a b. (a -> b) -> a -> b
$ \Ptr PCRE
pcre_ptr ->
    forall {b}. Num b => Ptr PCRE -> CInt -> IO b
fullInfoInt Ptr PCRE
pcre_ptr CInt
info_capturecount


-- | 'captureNames'
--
-- Returns the names and numbers of all named subpatterns in the regular
-- expression. Groups are zero-indexed. Unnamed groups are counted, but don't appear in the
-- result list.
--
-- >>> captureNames (compile "(?<one>abc) (def) (?<three>ghi)")
-- [("one", 0), ("three", 2)]
captureNames :: Regex -> [(S.ByteString, Int)]
captureNames :: Regex -> [(ByteString, Int)]
captureNames (Regex ForeignPtr PCRE
pcre_fp ByteString
_) = forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$
  forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr PCRE
pcre_fp forall a b. (a -> b) -> a -> b
$ \Ptr PCRE
pcre_ptr -> do
    Int
count     <- forall {b}. Num b => Ptr PCRE -> CInt -> IO b
fullInfoInt Ptr PCRE
pcre_ptr CInt
info_namecount
    Int
entrysize <- forall {b}. Num b => Ptr PCRE -> CInt -> IO b
fullInfoInt Ptr PCRE
pcre_ptr CInt
info_nameentrysize

    ByteString
buf <- forall a b. Storable a => (Ptr a -> IO b) -> IO b
alloca forall a b. (a -> b) -> a -> b
$ \Ptr (Ptr CChar)
n_ptr -> do
      forall a. Ptr PCRE -> Ptr PCRE -> CInt -> Ptr a -> IO CInt
c_pcre_fullinfo Ptr PCRE
pcre_ptr forall a. Ptr a
nullPtr CInt
info_nametable Ptr (Ptr CChar)
n_ptr
      Ptr CChar
buf <- forall a. Storable a => Ptr a -> IO a
peek Ptr (Ptr CChar)
n_ptr
      CStringLen -> IO ByteString
S.packCStringLen (Ptr CChar
buf, Int
countforall a. Num a => a -> a -> a
*Int
entrysize)

    let results :: [(ByteString, Int)]
results = Int -> ByteString -> [(ByteString, Int)]
split Int
entrysize ByteString
buf
        zeroIndexed :: [(ByteString, Int)]
zeroIndexed = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a. Num a => a -> a -> a
subtract Int
1) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [(ByteString, Int)]
results
        sorted :: [(ByteString, Int)]
sorted = forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a. Ord a => a -> a -> Ordering
compare forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` forall a b. (a, b) -> b
snd) [(ByteString, Int)]
zeroIndexed
    forall (m :: * -> *) a. Monad m => a -> m a
return [(ByteString, Int)]
sorted

  where
    -- Split the nametable buffer into entries. Each entry has a fixed size in
    -- bytes. The first two bytes in each entry store the pattern number in
    -- big-endian format, the bytes following that contain the nul-terminated
    -- name of the subpattern.
    split :: Int -> S.ByteString -> [(S.ByteString, Int)]
    split :: Int -> ByteString -> [(ByteString, Int)]
split Int
entrysize ByteString
buf
      | ByteString -> Bool
S.null ByteString
buf = []
      | Bool
otherwise =
        let
          (ByteString
entry, ByteString
tail) = Int -> ByteString -> (ByteString, ByteString)
S.splitAt Int
entrysize ByteString
buf
          idx :: Int -> Int
idx = forall a b. (Integral a, Num b) => a -> b
fromIntegral forall b c a. (b -> c) -> (a -> b) -> a -> c
. HasCallStack => ByteString -> Int -> Word8
S.index ByteString
entry
          num :: Int
num = Int -> Int
idx Int
0 forall a. Num a => a -> a -> a
* Int
256 forall a. Num a => a -> a -> a
+ Int -> Int
idx Int
1
          name :: ByteString
name = (Word8 -> Bool) -> ByteString -> ByteString
S.takeWhile (forall a. Eq a => a -> a -> Bool
/= Word8
0) forall a b. (a -> b) -> a -> b
$ Int -> ByteString -> ByteString
S.drop Int
2 ByteString
entry
        in (ByteString
name, Int
num) forall a. a -> [a] -> [a]
: Int -> ByteString -> [(ByteString, Int)]
split Int
entrysize ByteString
tail