{-# LANGUAGE FlexibleInstances     #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE ScopedTypeVariables   #-}
{-|

Easy regular expression helpers, currently based on regex-tdfa. These should:

- be cross-platform, not requiring C libraries

- support unicode

- support extended regular expressions

- support replacement, with backreferences etc.

- support splitting

- have mnemonic names

- have simple monomorphic types

- work with simple strings

Regex strings are automatically compiled into regular expressions the first
time they are seen, and these are cached. If you use a huge number of unique
regular expressions this might lead to increased memory usage. Several
functions have memoised variants (*Memo), which also trade space for time.

Currently two APIs are provided:

- The old partial one (with ' suffixes') which will call error on any problem
  (eg with malformed regexps). This comes from hledger's origin as a
  command-line tool.

- The new total one which will return an error message. This is better for
  long-running apps like hledger-web.

Current limitations:

- (?i) and similar are not supported

-}

module Hledger.Utils.Regex (
  -- * Regexp type and constructors
   Regexp(reString)
  ,toRegex
  ,toRegexCI
  ,toRegex'
  ,toRegexCI'
   -- * type aliases
  ,Replacement
  ,RegexError
   -- * total regex operations
  ,regexMatch
  ,regexMatchText
  ,regexReplace
  ,regexReplaceUnmemo
  ,regexReplaceAllBy
  )
where

import Control.Monad (foldM)
import Data.Aeson (ToJSON(..), Value(String))
import Data.Array ((!), elems, indices)
import Data.Char (isDigit)
import Data.List (foldl')
import Data.MemoUgly (memo)
import Data.Text (Text)
import qualified Data.Text as T
import Text.Regex.TDFA (
  Regex, CompOption(..), defaultCompOpt, defaultExecOpt,
  makeRegexOptsM, AllMatches(getAllMatches), match, MatchText,
  RegexLike(..), RegexMaker(..), RegexOptions(..), RegexContext(..)
  )


-- | Regular expression. Extended regular expression-ish syntax ? But does not support eg (?i) syntax.
data Regexp
  = Regexp   { Regexp -> Text
reString :: Text, Regexp -> Regex
reCompiled :: Regex }
  | RegexpCI { reString :: Text, reCompiled :: Regex }

instance Eq Regexp where
  Regexp   Text
s1 Regex
_ == :: Regexp -> Regexp -> Bool
== Regexp   Text
s2 Regex
_ = Text
s1 forall a. Eq a => a -> a -> Bool
== Text
s2
  RegexpCI Text
s1 Regex
_ == RegexpCI Text
s2 Regex
_ = Text
s1 forall a. Eq a => a -> a -> Bool
== Text
s2
  Regexp
_ == Regexp
_ = Bool
False

instance Ord Regexp where
  Regexp   Text
s1 Regex
_ compare :: Regexp -> Regexp -> Ordering
`compare` Regexp   Text
s2 Regex
_ = Text
s1 forall a. Ord a => a -> a -> Ordering
`compare` Text
s2
  RegexpCI Text
s1 Regex
_ `compare` RegexpCI Text
s2 Regex
_ = Text
s1 forall a. Ord a => a -> a -> Ordering
`compare` Text
s2
  Regexp Text
_ Regex
_ `compare` RegexpCI Text
_ Regex
_ = Ordering
LT
  RegexpCI Text
_ Regex
_ `compare` Regexp Text
_ Regex
_ = Ordering
GT

instance Show Regexp where
  showsPrec :: Int -> Regexp -> ShowS
showsPrec Int
d Regexp
r = Bool -> ShowS -> ShowS
showParen (Int
d forall a. Ord a => a -> a -> Bool
> Int
app_prec) forall a b. (a -> b) -> a -> b
$ ShowS
reCons forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Show a => Int -> a -> ShowS
showsPrec (Int
app_precforall a. Num a => a -> a -> a
+Int
1) (Text -> String
T.unpack forall a b. (a -> b) -> a -> b
$ Regexp -> Text
reString Regexp
r)
    where app_prec :: Int
app_prec = Int
10
          reCons :: ShowS
reCons = case Regexp
r of Regexp   Text
_ Regex
_ -> String -> ShowS
showString String
"Regexp "
                             RegexpCI Text
_ Regex
_ -> String -> ShowS
showString String
"RegexpCI "

instance Read Regexp where
  readsPrec :: Int -> ReadS Regexp
readsPrec Int
d String
r =  forall a. Bool -> ReadS a -> ReadS a
readParen (Int
d forall a. Ord a => a -> a -> Bool
> Int
app_prec) (\String
r' -> [(Text -> Regexp
toRegexCI' Text
m,String
t) |
                                                    (String
"RegexCI",String
s) <- ReadS String
lex String
r',
                                                    (Text
m,String
t) <- forall a. Read a => Int -> ReadS a
readsPrec (Int
app_precforall a. Num a => a -> a -> a
+Int
1) String
s]) String
r
                forall a. [a] -> [a] -> [a]
++ forall a. Bool -> ReadS a -> ReadS a
readParen (Int
d forall a. Ord a => a -> a -> Bool
> Int
app_prec) (\String
r' -> [(Text -> Regexp
toRegex' Text
m, String
t) |
                                                    (String
"Regex",String
s) <- ReadS String
lex String
r',
                                                    (Text
m,String
t) <- forall a. Read a => Int -> ReadS a
readsPrec (Int
app_precforall a. Num a => a -> a -> a
+Int
1) String
s]) String
r
    where app_prec :: Int
app_prec = Int
10

instance ToJSON Regexp where
  toJSON :: Regexp -> Value
toJSON (Regexp   Text
s Regex
_) = Text -> Value
String forall a b. (a -> b) -> a -> b
$ Text
"Regexp "   forall a. Semigroup a => a -> a -> a
<> Text
s
  toJSON (RegexpCI Text
s Regex
_) = Text -> Value
String forall a b. (a -> b) -> a -> b
$ Text
"RegexpCI " forall a. Semigroup a => a -> a -> a
<> Text
s

instance RegexLike Regexp String where
  matchOnce :: Regexp -> String -> Maybe MatchArray
matchOnce = forall regex source.
RegexLike regex source =>
regex -> source -> Maybe MatchArray
matchOnce forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regexp -> Regex
reCompiled
  matchAll :: Regexp -> String -> [MatchArray]
matchAll = forall regex source.
RegexLike regex source =>
regex -> source -> [MatchArray]
matchAll forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regexp -> Regex
reCompiled
  matchCount :: Regexp -> String -> Int
matchCount = forall regex source.
RegexLike regex source =>
regex -> source -> Int
matchCount forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regexp -> Regex
reCompiled
  matchTest :: Regexp -> String -> Bool
matchTest = forall regex source.
RegexLike regex source =>
regex -> source -> Bool
matchTest forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regexp -> Regex
reCompiled
  matchAllText :: Regexp -> String -> [MatchText String]
matchAllText = forall regex source.
RegexLike regex source =>
regex -> source -> [MatchText source]
matchAllText forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regexp -> Regex
reCompiled
  matchOnceText :: Regexp -> String -> Maybe (String, MatchText String, String)
matchOnceText = forall regex source.
RegexLike regex source =>
regex -> source -> Maybe (source, MatchText source, source)
matchOnceText forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regexp -> Regex
reCompiled

instance RegexContext Regexp String String where
  match :: Regexp -> ShowS
match = forall regex source target.
RegexContext regex source target =>
regex -> source -> target
match forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regexp -> Regex
reCompiled
  matchM :: forall (m :: * -> *). MonadFail m => Regexp -> String -> m String
matchM = forall regex source target (m :: * -> *).
(RegexContext regex source target, MonadFail m) =>
regex -> source -> m target
matchM forall b c a. (b -> c) -> (a -> b) -> a -> c
. Regexp -> Regex
reCompiled

-- Convert a Regexp string to a compiled Regex, or return an error message.
toRegex :: Text -> Either RegexError Regexp
toRegex :: Text -> Either String Regexp
toRegex = forall a b. Ord a => (a -> b) -> a -> b
memo forall a b. (a -> b) -> a -> b
$ \Text
s -> forall a. Text -> Maybe a -> Either String a
mkRegexErr Text
s (Text -> Regex -> Regexp
Regexp Text
s forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall regex compOpt execOpt source (m :: * -> *).
(RegexMaker regex compOpt execOpt source, MonadFail m) =>
source -> m regex
makeRegexM (Text -> String
T.unpack Text
s))  -- Have to unpack here because Text instance in regex-tdfa only appears in 1.3.1

-- Like toRegex, but make a case-insensitive Regex.
toRegexCI :: Text -> Either RegexError Regexp
toRegexCI :: Text -> Either String Regexp
toRegexCI = forall a b. Ord a => (a -> b) -> a -> b
memo forall a b. (a -> b) -> a -> b
$ \Text
s -> forall a. Text -> Maybe a -> Either String a
mkRegexErr Text
s (Text -> Regex -> Regexp
RegexpCI Text
s forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall regex compOpt execOpt source (m :: * -> *).
(RegexMaker regex compOpt execOpt source, MonadFail m) =>
compOpt -> execOpt -> source -> m regex
makeRegexOptsM forall regex compOpt execOpt.
RegexOptions regex compOpt execOpt =>
compOpt
defaultCompOpt{caseSensitive :: Bool
caseSensitive=Bool
False} forall regex compOpt execOpt.
RegexOptions regex compOpt execOpt =>
execOpt
defaultExecOpt (Text -> String
T.unpack Text
s))  -- Have to unpack here because Text instance in regex-tdfa only appears in 1.3.1

-- | Make a nice error message for a regexp error.
mkRegexErr :: Text -> Maybe a -> Either RegexError a
mkRegexErr :: forall a. Text -> Maybe a -> Either String a
mkRegexErr Text
s = forall b a. b -> (a -> b) -> Maybe a -> b
maybe (forall a b. a -> Either a b
Left String
errmsg) forall a b. b -> Either a b
Right
  where errmsg :: String
errmsg = Text -> String
T.unpack forall a b. (a -> b) -> a -> b
$ Text
"This regular expression is malformed, please correct it:\n" forall a. Semigroup a => a -> a -> a
<> Text
s

-- Convert a Regexp string to a compiled Regex, throw an error
toRegex' :: Text -> Regexp
toRegex' :: Text -> Regexp
toRegex' = forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either forall a. String -> a
errorWithoutStackTrace forall a. a -> a
id forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Either String Regexp
toRegex

-- Like toRegex', but make a case-insensitive Regex.
toRegexCI' :: Text -> Regexp
toRegexCI' :: Text -> Regexp
toRegexCI' = forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either forall a. String -> a
errorWithoutStackTrace forall a. a -> a
id forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Either String Regexp
toRegexCI

-- | A replacement pattern. May include numeric backreferences (\N).
type Replacement = String

-- | An error message arising during a regular expression operation.
-- Eg: trying to compile a malformed regular expression, or
-- trying to apply a malformed replacement pattern.
type RegexError = String

-- helpers

-- | Test whether a Regexp matches a String. This is an alias for `matchTest` for consistent
-- naming.
regexMatch :: Regexp -> String -> Bool
regexMatch :: Regexp -> String -> Bool
regexMatch = forall regex source.
RegexLike regex source =>
regex -> source -> Bool
matchTest

-- | Tests whether a Regexp matches a Text.
--
-- This currently unpacks the Text to a String an works on that. This is due to
-- a performance bug in regex-tdfa (#9), which may or may not be relevant here.
regexMatchText :: Regexp -> Text -> Bool
regexMatchText :: Regexp -> Text -> Bool
regexMatchText Regexp
r = forall regex source.
RegexLike regex source =>
regex -> source -> Bool
matchTest Regexp
r forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
T.unpack

--------------------------------------------------------------------------------
-- new total functions

-- | A memoising version of regexReplace. Caches the result for each
-- search pattern, replacement pattern, target string tuple.
-- This won't generate a regular expression parsing error since that
-- is pre-compiled nowadays, but there can still be a runtime error 
-- from the replacement pattern, eg with a backreference referring 
-- to a nonexistent match group.
regexReplace :: Regexp -> Replacement -> String -> Either RegexError String
regexReplace :: Regexp -> String -> String -> Either String String
regexReplace Regexp
re String
repl = forall a b. Ord a => (a -> b) -> a -> b
memo forall a b. (a -> b) -> a -> b
$ Regexp -> String -> String -> Either String String
regexReplaceUnmemo Regexp
re String
repl

-- helpers:

-- Replace this regular expression with this replacement pattern in this
-- string, or return an error message. (There should be no regexp
-- parsing errors these days since Regexp's compiled form is used,
-- but there can still be a runtime error from the replacement
-- pattern, eg a backreference referring to a nonexistent match group.)
regexReplaceUnmemo :: Regexp -> Replacement -> String -> Either RegexError String
regexReplaceUnmemo :: Regexp -> String -> String -> Either String String
regexReplaceUnmemo Regexp
re String
repl String
str = forall (t :: * -> *) (m :: * -> *) b a.
(Foldable t, Monad m) =>
(b -> a -> m b) -> b -> t a -> m b
foldM (String -> String -> MatchText String -> Either String String
replaceMatch String
repl) String
str (forall a. [a] -> [a]
reverse forall a b. (a -> b) -> a -> b
$ forall regex source target.
RegexContext regex source target =>
regex -> source -> target
match (Regexp -> Regex
reCompiled Regexp
re) String
str :: [MatchText String])
  where
    -- Replace one match within the string with the replacement text
    -- appropriate for this match. Or return an error message.
    replaceMatch :: Replacement -> String -> MatchText String -> Either RegexError String
    replaceMatch :: String -> String -> MatchText String -> Either String String
replaceMatch String
replpat String
s MatchText String
matchgroups =
      case forall i e. Array i e -> [e]
elems MatchText String
matchgroups of 
        [] -> forall a b. b -> Either a b
Right String
s
        ((String
_,(Int
off,Int
len)):[(String, (Int, Int))]
_) ->   -- groups should have 0-based indexes, and there should always be at least one, since this is a match
          Either String String
erpl forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \String
rpl -> forall a b. b -> Either a b
Right forall a b. (a -> b) -> a -> b
$ String
pre forall a. [a] -> [a] -> [a]
++ String
rpl forall a. [a] -> [a] -> [a]
++ String
post
          where
            (String
pre, String
post') = forall a. Int -> [a] -> ([a], [a])
splitAt Int
off String
s
            post :: String
post = forall a. Int -> [a] -> [a]
drop Int
len String
post'
            -- The replacement text: the replacement pattern with all
            -- numeric backreferences replaced by the appropriate groups
            -- from this match. Or an error message.
            erpl :: Either String String
erpl = forall (m :: * -> *).
Monad m =>
Regexp -> (String -> m String) -> String -> m String
regexReplaceAllByM Regexp
backrefRegex (MatchText String -> String -> Either String String
lookupMatchGroup MatchText String
matchgroups) String
replpat
              where
                -- Given some match groups and a numeric backreference,
                -- return the referenced group text, or an error message.
                lookupMatchGroup :: MatchText String -> String -> Either RegexError String
                lookupMatchGroup :: MatchText String -> String -> Either String String
lookupMatchGroup MatchText String
grps (Char
'\\':s2 :: String
s2@(Char
_:String
_)) | forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all Char -> Bool
isDigit String
s2 =
                  case forall a. Read a => String -> a
read String
s2 of Int
n | Int
n forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` forall i e. Ix i => Array i e -> [i]
indices MatchText String
grps -> forall a b. b -> Either a b
Right forall a b. (a -> b) -> a -> b
$ forall a b. (a, b) -> a
fst (MatchText String
grps forall i e. Ix i => Array i e -> i -> e
! Int
n)  -- PARTIAL: should not fail, all digits
                                  Int
_                         -> forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ String
"no match group exists for backreference \"\\"forall a. [a] -> [a] -> [a]
++String
sforall a. [a] -> [a] -> [a]
++String
"\""
                lookupMatchGroup MatchText String
_ String
s2 = forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ String
"lookupMatchGroup called on non-numeric-backreference \""forall a. [a] -> [a] -> [a]
++String
s2forall a. [a] -> [a] -> [a]
++String
"\", shouldn't happen"
    backrefRegex :: Regexp
backrefRegex = Text -> Regexp
toRegex' Text
"\\\\[0-9]+"  -- PARTIAL: should not fail

-- regexReplace' :: Regexp -> Replacement -> String -> String
-- regexReplace' re repl s =
--     foldl (replaceMatch repl) s (reverse $ match (reCompiled re) s :: [MatchText String])
--   where
--     replaceMatch :: Replacement -> String -> MatchText String -> String
--     replaceMatch replpat s matchgroups = pre ++ repl ++ post
--       where
--         ((_,(off,len)):_) = elems matchgroups  -- groups should have 0-based indexes, and there should always be at least one, since this is a match
--         (pre, post') = splitAt off s
--         post = drop len post'
--         repl = regexReplaceAllBy backrefRegex (lookupMatchGroup matchgroups) replpat
--           where
--             lookupMatchGroup :: MatchText String -> String -> String
--             lookupMatchGroup grps ('\\':s@(_:_)) | all isDigit s =
--               case read s of n | n `elem` indices grps -> fst (grps ! n)
--               -- PARTIAL:
--                              _                         -> error' $ "no match group exists for backreference \"\\"++s++"\""
--             lookupMatchGroup _ s = error' $ "lookupMatchGroup called on non-numeric-backreference \""++s++"\", shouldn't happen"
--     backrefRegex = toRegex' "\\\\[0-9]+"  -- PARTIAL: should not fail


-- helpers

-- adapted from http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries:

-- Replace all occurrences of a regexp in a string, transforming each match
-- with the given pure function.
regexReplaceAllBy :: Regexp -> (String -> String) -> String -> String
regexReplaceAllBy :: Regexp -> ShowS -> ShowS
regexReplaceAllBy Regexp
re ShowS
transform String
s = ShowS
prependdone String
rest
  where
    (Int
_, String
rest, ShowS
prependdone) = forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (Int, String, ShowS) -> (Int, Int) -> (Int, String, ShowS)
go (Int
0, String
s, forall a. a -> a
id) [(Int, Int)]
matches
      where
        matches :: [(Int, Int)]
matches = forall (f :: * -> *) b. AllMatches f b -> f b
getAllMatches forall a b. (a -> b) -> a -> b
$ forall regex source target.
RegexContext regex source target =>
regex -> source -> target
match (Regexp -> Regex
reCompiled Regexp
re) String
s :: [(Int, Int)]  -- offset and length
        go :: (Int,String,String->String) -> (Int,Int) ->  (Int,String,String->String)
        go :: (Int, String, ShowS) -> (Int, Int) -> (Int, String, ShowS)
go (Int
pos,String
todo,ShowS
prepend) (Int
off,Int
len) =
          let (String
prematch, String
matchandrest) = forall a. Int -> [a] -> ([a], [a])
splitAt (Int
off forall a. Num a => a -> a -> a
- Int
pos) String
todo
              (String
matched, String
rest2) = forall a. Int -> [a] -> ([a], [a])
splitAt Int
len String
matchandrest
          in (Int
off forall a. Num a => a -> a -> a
+ Int
len, String
rest2, ShowS
prepend forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String
prematchforall a. [a] -> [a] -> [a]
++) forall b c a. (b -> c) -> (a -> b) -> a -> c
. (ShowS
transform String
matched forall a. [a] -> [a] -> [a]
++))

-- Replace all occurrences of a regexp in a string, transforming each match
-- with the given monadic function. Eg if the monad is Either, a Left result
-- from the transform function short-circuits and is returned as the overall
-- result.
regexReplaceAllByM :: forall m. Monad m => Regexp -> (String -> m String) -> String -> m String
regexReplaceAllByM :: forall (m :: * -> *).
Monad m =>
Regexp -> (String -> m String) -> String -> m String
regexReplaceAllByM Regexp
re String -> m String
transform String
s =
    forall (t :: * -> *) (m :: * -> *) b a.
(Foldable t, Monad m) =>
(b -> a -> m b) -> b -> t a -> m b
foldM (Int, String, ShowS) -> (Int, Int) -> m (Int, String, ShowS)
go (Int
0, String
s, forall a. a -> a
id) [(Int, Int)]
matches forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \(Int
_, String
rest, ShowS
prependdone) -> forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ ShowS
prependdone String
rest
  where
    matches :: [(Int, Int)]
matches = forall (f :: * -> *) b. AllMatches f b -> f b
getAllMatches forall a b. (a -> b) -> a -> b
$ forall regex source target.
RegexContext regex source target =>
regex -> source -> target
match (Regexp -> Regex
reCompiled Regexp
re) String
s :: [(Int, Int)]  -- offset and length
    go :: (Int,String,String->String) -> (Int,Int) -> m (Int,String,String->String)
    go :: (Int, String, ShowS) -> (Int, Int) -> m (Int, String, ShowS)
go (Int
pos,String
todo,ShowS
prepend) (Int
off,Int
len) =
      let (String
prematch, String
matchandrest) = forall a. Int -> [a] -> ([a], [a])
splitAt (Int
off forall a. Num a => a -> a -> a
- Int
pos) String
todo
          (String
matched, String
rest) = forall a. Int -> [a] -> ([a], [a])
splitAt Int
len String
matchandrest
      in String -> m String
transform String
matched forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \String
matched' -> forall (f :: * -> *) a. Applicative f => a -> f a
pure (Int
off forall a. Num a => a -> a -> a
+ Int
len, String
rest, ShowS
prepend forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String
prematchforall a. [a] -> [a] -> [a]
++) forall b c a. (b -> c) -> (a -> b) -> a -> c
. (String
matched' forall a. [a] -> [a] -> [a]
++))