{- | A module implementing regexp-based split and substitute.

>>> substituteCompile "(a+)" "lapin" "'\\1'"
Right "l'a'pin"

>>> splitCompile "\\d" "a1b2c3"
Right ["a","b","c"]
-}
module Text.Regex.PCRE.ByteString.Utils
    ( -- * Perl-like utility functions
      substitute
    , split
    , substituteCompile
    , splitCompile
    -- * Re-exports from "Text.Regex.PCRE.ByteString"
    , Regex
    , CompOption
    , ExecOption
    , compBlank
    , execBlank
    -- * Pure version of the functions, using 'unsafePerformIO'
    , compile'
    , execute'
    , substitute'
    , split'
    , substituteCompile'
    , splitCompile'
    ) where

import Text.Regex.PCRE.ByteString
import qualified Data.ByteString.Char8 as BS
import Control.Monad.Except
import Data.Attoparsec.ByteString.Char8 as A
import Control.Applicative
import Data.Char (digitToInt)
import System.IO.Unsafe
import qualified Data.Array as A

{-| Substitutes values matched by a `Regex`. References can be used.

It doesn't support anything else than global substitution for now ..

-}
substitute :: Regex             -- ^ The regular expression, taken from a call to `compile`
           -> BS.ByteString     -- ^ The source string
           -> BS.ByteString     -- ^ The replacement string
           -> IO (Either String BS.ByteString)
substitute regexp srcstring repla = runExceptT $ do
    parsedReplacement <- case parseOnly repparser repla of
                             Right y -> return y
                             Left rr -> throwError (rr ++ " when parsing the replacement string")
    matches <- getMatches regexp srcstring
    let applyReplacement :: RegexpSplit BS.ByteString -> BS.ByteString
        applyReplacement (Unmatched x) = x
        applyReplacement (Matched captured mcaptures) = applyCaptures captured parsedReplacement mcaptures
    return $! BS.concat $! map applyReplacement matches

-- Transforms the parsed replacement and the vector of captured stuff into
-- the destination ByteString.
applyCaptures :: BS.ByteString -> [Replacement] -> [BS.ByteString] -> BS.ByteString
applyCaptures firstmatch repl mcaptures = BS.concat (map applyCaptures' repl)
    where
        ncaptures = length mcaptures
        applyCaptures' :: Replacement -> BS.ByteString
        applyCaptures' WholeMatch = firstmatch
        applyCaptures' (RawReplacement r) = r
        applyCaptures' (IndexedReplacement idx)
            | idx > ncaptures || idx < 0 = ""
            | otherwise = mcaptures !! (idx - 1)

-- | Splits strings, using a `Regex` as delimiter.
split :: Regex  -- ^ The regular expression, taken from a call to `compile`
      -> BS.ByteString -- ^ The source string
      -> IO (Either String [BS.ByteString])
split regexp srcstring = fmap (removeEmptyLeft . regexpUnmatched) <$> runExceptT (getMatches regexp srcstring)
    where
        removeEmptyLeft = reverse . dropWhile BS.null . reverse

-- | A pure version of 'substitute', using unsafePerformIO.
substitute' :: Regex             -- ^ The regular expression, taken from a call to `compile`
            -> BS.ByteString     -- ^ The source string
            -> BS.ByteString     -- ^ The replacement string
            -> Either String BS.ByteString
substitute' regexp srcstring repla = unsafePerformIO (substitute regexp srcstring repla)

-- | A pure version of 'split', using unsafePerformIO.
split' :: Regex  -- ^ The regular expression, taken from a call to `compile`
       -> BS.ByteString -- ^ The source string
       -> Either String [BS.ByteString]
split' regexp srcstring = unsafePerformIO (split regexp srcstring)

data RegexpSplit a = Matched a [a]
                   | Unmatched a
                   deriving (Show, Eq, Ord)

instance Functor RegexpSplit where
    fmap f (Matched x y)   = Matched (f x) (map f y)
    fmap f (Unmatched x) = Unmatched (f x)

regexpAll :: [RegexpSplit a] -> [a]
regexpAll = map unreg
    where
        unreg ( Matched x _ ) = x
        unreg ( Unmatched x ) = x

isMatched :: RegexpSplit a -> Bool
isMatched (Matched _ _) = True
isMatched _ = False

regexpUnmatched :: [RegexpSplit a] -> [a]
regexpUnmatched = regexpAll . filter (not . isMatched)

getMatches :: Regex -> BS.ByteString -> ExceptT String IO [RegexpSplit BS.ByteString]
getMatches _ "" = return []
getMatches creg src = do
    x <- liftIO $ regexec creg src
    case x of
        Left (rcode, rerror) -> throwError ("Regexp application error: " ++ rerror ++ "(" ++ show rcode ++ ")")
        Right Nothing -> return [Unmatched src]

        -- Now this is a trick, I don't know exactly why this happens, but this happens with empty regexps. We are going to cheat here
        Right (Just ("","",rm,_)) -> return (map (Unmatched . BS.singleton) (BS.unpack rm))

        Right (Just (before,current,remaining,captures)) -> do
            remain <- getMatches creg remaining
            return (Unmatched before : Matched current captures : remain)


data Replacement = RawReplacement BS.ByteString
                 | IndexedReplacement Int
                 | WholeMatch
                 deriving (Show)

repparser :: Parser [Replacement]
repparser = many replacement <* endOfInput

replacement :: Parser Replacement
replacement = fmap RawReplacement rawData <|> escapedThing

rawData :: Parser BS.ByteString
rawData = takeWhile1 (/= '\\')

escapedThing :: Parser Replacement
escapedThing = do
    void (char '\\')
    let ac = do
            n <- anyChar
            r <- rawData
            return $ BS.cons n r
        toReplacement 0 = WholeMatch
        toReplacement n = IndexedReplacement n
    fmap (toReplacement . digitToInt) digit <|> fmap (RawReplacement . BS.cons '\\') ac

-- | Compiles the regular expression (using default options) and `substitute`s
substituteCompile :: BS.ByteString     -- ^ The regular expression
                  -> BS.ByteString     -- ^ The source string
                  -> BS.ByteString     -- ^ The replacement string
                  -> IO (Either String BS.ByteString)
substituteCompile regexp srcstring repla = do
    re <- compile compBlank execBlank regexp
    case re of
        Right cre -> substitute cre srcstring repla
        Left rr   -> return $ Left $ "Regexp compilation failed: " ++ show rr

-- | Compiles the regular expression (using default options) and `split`s.
splitCompile :: BS.ByteString -- ^ The regular expression
             -> BS.ByteString -- ^ The source string
             -> IO (Either String [BS.ByteString])
splitCompile regexp srcstring = do
    re <- compile compBlank execBlank regexp
    case re of
        Right cre -> split cre srcstring
        Left rr   -> return $ Left $ "Regexp compilation failed: " ++ show rr

-- | A pure version of 'compile', using unsafePerformIO.
compile' :: CompOption -> ExecOption -> BS.ByteString -> Either (MatchOffset, String) Regex
compile' co eo s = unsafePerformIO (compile co eo s)

-- | A pure version of 'execute', using unsafePerformIO.
execute' :: Regex -> BS.ByteString -> Either WrapError (Maybe (A.Array Int (MatchOffset, MatchLength)))
execute' r s = unsafePerformIO (execute r s)

-- | A pure version of 'substituteCompile', using unsafePerformIO.
substituteCompile' :: BS.ByteString     -- ^ The regular expression
                   -> BS.ByteString     -- ^ The source string
                   -> BS.ByteString     -- ^ The replacement string
                   -> Either String BS.ByteString
substituteCompile' regexp srcstring repla = unsafePerformIO (substituteCompile regexp srcstring repla)

-- | A pure version of 'splitCompile', using unsafePerformIO.
splitCompile' :: BS.ByteString -- ^ The regular expression
              -> BS.ByteString -- ^ The source string
              -> Either String [BS.ByteString]
splitCompile' regexp srcstring = unsafePerformIO (splitCompile regexp srcstring)