\begin{code}
{-# LANGUAGE RecordWildCards            #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE OverloadedStrings          #-}

module Text.RE.TestBench
  ( MacroID(..)
  , RegexType(..)
  , MacroEnv
  , WithCaptures(..)
  , MacroDescriptor(..)
  , TestResult(..)
  , RegexSource(..)
  , FunctionID(..)
  , mkMacros
  , testMacroEnv
  , badMacros
  , runTests
  , runTests'
  , formatMacroTable
  , dumpMacroTable
  , formatMacroSummary
  , formatMacroSources
  , formatMacroSource
  , testMacroDescriptors
-- , regexSource
  ) where

import           Data.Array
import           Control.Applicative
import qualified Data.HashMap.Lazy              as HML
import           Data.List
import           Data.Maybe
import           Data.Ord
import           Data.String
import           Text.Printf
import           Text.RE.Capture
import           Text.RE.Options
import           Text.RE.Replace
import qualified Text.Regex.PCRE                as PCRE
import qualified Text.Regex.TDFA                as TDFA
\end{code} Types ----- \begin{code}
-- | what kind of back end will be compiling the RE
data RegexType
  = TDFA    -- the TDFA back end
  | PCRE    -- the PCRE back end
  deriving (Bounded,Enum,Eq,Ord,Show)

-- | do we need the captures in the RE or whould they be stripped out
-- where possible
data WithCaptures
  = InclCaptures
  | ExclCaptures
  deriving (Eq,Ord,Show)

-- | each macro can reference others, the whole environment being
-- required for each macro, so we use a Lazy HashMap
type MacroEnv = HML.HashMap MacroID MacroDescriptor

-- | describes a macro, giving the text of the RE and a si=ummary
-- description
data MacroDescriptor =
  MacroDescriptor
    { _md_source          :: !RegexSource         -- ^ the RE
    , _md_samples         :: ![String]            -- ^ some sample matches
    , _md_counter_samples :: ![String]            -- ^ some sample non-matches
    , _md_test_results    :: ![TestResult]        -- ^ validation test results
    , _md_parser          :: !(Maybe FunctionID)  -- ^ WA, the parser function
    , _md_description     :: !String              -- ^ summary comment
    }
  deriving (Show)

-- | list of failures on a validation run
newtype TestResult =
  TestResult { _TestResult :: String }
  deriving (IsString,Show)

-- | a RE that should work for POSIX and PCRE with open brackets ('(')
-- represented as follows:
--    \(    mere symbol
--    (?:   used for grouping only, not for captures
--    (}:   used for captures only, not for grouping
--    (]:   used for captures and grouping
--    (     do not modify
newtype RegexSource =
    RegexSource { _RegexSource :: String }
  deriving (IsString,Show)

-- | name of the Haskell parser function for parsing the text matched
-- by a macro
newtype FunctionID =
    FunctionID { _FunctionID :: String }
  deriving (IsString,Show)

-- | we are only interested in the open parentheses used for
-- grouping and/or capturing; if neither grouping or capturing then
-- there is no initial '(' or '(?:', just the suffic text
data REToken =
  REToken
    { _ret_prefix    :: String  -- ^ following text optional ( or (?:
    , _ret_fixed     :: Bool    -- ^ a '(' that is not safe to modify
    , _ret_grouping  :: Bool    -- ^ is this a grouping group
    , _ret_capturing :: Bool    -- ^ is this a capturing group
    }
  deriving (Show)
\end{code} mkMacros -------- \begin{code}
mkMacros :: (Monad m,Functor m)
         => (String->m r)
         -> RegexType
         -> WithCaptures
         -> MacroEnv
         -> m (Macros r)
mkMacros prs rty wc env =
    HML.fromList <$> mapM (uncurry mk) (HML.toList env)
  where
    mk mid md = (,) mid <$> prs (mdRegexSource rty wc env md)
\end{code} testMacroEnv, badMacros ----------------------- \begin{code}
testMacroEnv :: String -> RegexType -> MacroEnv -> IO Bool
testMacroEnv lab rty m_env = case badMacros m_env of
  []    -> return True
  fails -> do
    putStrLn $ lab' ++ " has failing tests for these macros: "
    putStr   $ unlines $ [ "  "++_MacroID mid | mid<-fails ]
    putStrLn $ "The whole table:"
    putStrLn $ "========================================================"
    putStr   $ formatMacroTable rty m_env
    putStrLn $ "========================================================"
    return False
  where
    lab' = lab ++ " [" ++ show rty ++"]"

badMacros :: MacroEnv -> [MacroID]
badMacros m_env =
  [ mid
      | (mid,MacroDescriptor{..}) <- HML.toList m_env
      , not $ null _md_test_results
      ]

runTests :: (Eq a,Show a)
         => RegexType
         -> (String->Maybe a)
         -> [(String,a)]
         -> MacroEnv
         -> MacroID
         -> MacroDescriptor
         -> MacroDescriptor
runTests rty parser = runTests' rty parser'
  where
    parser' caps = fmap capturedText (matchCapture caps) >>= parser

runTests' :: (Eq a,Show a)
          => RegexType
          -> (Match String->Maybe a)
          -> [(String,a)]
          -> MacroEnv
          -> MacroID
          -> MacroDescriptor
          -> MacroDescriptor
runTests' rty parser vector env mid md@MacroDescriptor{..} =
    md { _md_test_results = test_results }
  where
    test_results = concat
      [ concat $ map test     vector
      , concat $ map test_neg _md_counter_samples
      ]

    test (src,x) = test'     mid rty parser x $ match_ src env md

    test_neg src = test_neg' mid rty parser   $ match_ src env md

    match_ = case rty of
      TDFA -> match_tdfa
      PCRE -> match_pcre
\end{code} dumpMacroTable, formatMacroTable, formatMacroSummary, formatMacroSources, formatMacroSource ------------------------------------------------------------------------------------------- \begin{code}
dumpMacroTable :: String -> RegexType -> MacroEnv -> IO ()
dumpMacroTable lab rty m_env = do
    writeFile fp_t $ formatMacroTable   rty              m_env
    writeFile fp_s $ formatMacroSources rty ExclCaptures m_env
  where
    fp_t = "tables/" ++ lab ++ "-" ++ show rty ++ ".md"
    fp_s = "tables/" ++ lab ++ "-" ++ show rty ++ ".txt"
\end{code} \begin{code}
formatMacroTable :: RegexType -> MacroEnv -> String
formatMacroTable rty env = unlines $
  format_table macro_table_hdr
    [ macro_table_row rty mid md
        | (mid,md) <- sortBy (comparing fst) $ HML.toList env
        ]
\end{code} \begin{code}
formatMacroSummary :: RegexType -> MacroEnv -> MacroID -> String
formatMacroSummary rty env mid = maybe oops prep $ HML.lookup mid env
  where
    prep :: MacroDescriptor -> String
    prep md = unlines $ concat $ map (fmt md) [minBound..maxBound]

    fmt :: MacroDescriptor -> Col -> [String]
    fmt md c =
        [ printf "%-15s : %s" (present_col c) ini
        ] ++ map ("      "++) lns
      where
        (ini,lns) = case macro_attribute rty mid md c of
          []   -> (,) "" []
          [ln] -> (,) ln []
          lns_ -> (,) "" lns_

    oops = error $ _MacroID mid ++ ": macro not defined in this environment"
\end{code} \begin{code}
formatMacroSources :: RegexType
                   -> WithCaptures
                   -> MacroEnv
                   -> String
formatMacroSources rty wc env = unlines $
    [ printf "%-20s : %s" (_MacroID mid) $ formatMacroSource rty wc env mid
        | mid <- sort $ HML.keys env
        ]
\end{code} \begin{code}
formatMacroSource :: RegexType
                  -> WithCaptures
                  -> MacroEnv
                  -> MacroID
                  -> String
formatMacroSource rty wc env mid =
    mdRegexSource rty wc env $ fromMaybe oops $ HML.lookup mid env
  where
    oops = error $ "formatMacroSource: not found: " ++ _MacroID mid
\end{code} testMacroDescriptors, regexSource --------------------------------- \begin{code}
testMacroDescriptors :: [MacroDescriptor] -> [TestResult]
testMacroDescriptors = concat . map _md_test_results

regexSource :: RegexType -> WithCaptures -> RegexSource -> String
regexSource rty wc = format_tokens rty wc . scan_re
\end{code} Formatting helpers ------------------ \begin{code}
type TableRow = Array Col [String]

data Col
  = C_name
  | C_caps
  | C_regex
  | C_examples
  | C_anti_examples
  | C_fails
  | C_parser
  | C_comment
  deriving (Ix,Bounded,Enum,Ord,Eq,Show)

present_col :: Col -> String
present_col = map tr . drop 2 . show
  where
    tr '_' = '-'
    tr c   = c

macro_table_hdr :: TableRow
macro_table_hdr = listArray (minBound,maxBound)
  [ [present_col c]
    | c<-[minBound..maxBound]
    ]

macro_table_row :: RegexType -> MacroID -> MacroDescriptor -> TableRow
macro_table_row rty mid md =
    listArray (minBound,maxBound) $
      map (macro_attribute rty mid md) [minBound..maxBound]

macro_attribute :: RegexType
                -> MacroID
                -> MacroDescriptor
                -> Col
                -> [String]
macro_attribute rty mid MacroDescriptor{..} c =
    case c of
      C_name          -> [_MacroID mid]
      C_caps          -> [show $ min_captures rty $ scan_re _md_source]
      C_regex         -> [regexSource rty ExclCaptures _md_source]
      C_examples      -> _md_samples
      C_anti_examples -> _md_counter_samples
      C_fails         -> map _TestResult _md_test_results
      C_parser        -> [maybe "-" _FunctionID _md_parser]
      C_comment       -> [_md_description]

format_table :: TableRow -> [TableRow] -> [String]
format_table hdr rows0 = concat
    [ format_row cws hdr'
    , format_row cws dsh
    , concat $ map (format_row cws) rows
    ]
  where
    dsh  = listArray (minBound,maxBound)
              [ [replicate n '-'] | n<-elems cws ]

    hdr' = hdr // [(,) C_regex $ [take n $ concat $ repeat "regex="] ]
      where
        n = min 29 $ cws!C_regex

    cws  = widths $ hdr : rows

    rows = map wrap_row rows0

field_width :: Int
field_width = 40

wrap_row :: TableRow -> TableRow
wrap_row = fmap $ concat . map f
  where
    f, g :: String -> [String]

    f cts = (ini ++ ['\\' | not (null rst)]) : g rst
      where
        (ini,rst) = splitAt (1+field_width) cts

    g ""  = []
    g cts = ('\\' : ini ++ ['\\' | not (null rst)]) : g rst
      where
        (ini,rst) = splitAt field_width cts


widths :: [TableRow] -> Array Col Int
widths rows = listArray (minBound,maxBound)
  [ maximum $ concat [ map length $ row!c | row<-rows ]
    | c<-[minBound..maxBound]
    ]

format_row :: Array Col Int -> TableRow -> [String]
format_row cw_arr row =
  [ ("|"++) $ intercalate "|"
      [ field cw_arr row c i | c<-[minBound..maxBound] ]
    | i <- [0..depth-1]
    ]
  where
    depth = maximum [ length $ row!c | c<-[minBound..maxBound] ]

field :: Array Col Int -> TableRow -> Col -> Int -> String
field cws row c i = ljust (cws!c) $ sel i $ row!c

sel :: Int -> [String] -> String
sel i ss = case drop i ss of
  []  -> ""
  s:_ -> s

ljust :: Int -> String -> String
ljust w s = s ++ replicate n ' '
  where
    n = max 0 $ w - length s

min_captures :: RegexType -> [REToken] -> Int
min_captures rty rets = length
  [ ()
    | REToken{..}<-rets
    , _ret_fixed || (_ret_grouping && rty==TDFA)
    ]
\end{code} Formatting tokens ----------------- \begin{code}
format_tokens :: RegexType -> WithCaptures -> [REToken] -> String
format_tokens rty wc = foldr f ""
  where
    f REToken{..} rst = _ret_prefix ++ bra ++ xket rst
      where
        bra = case _ret_fixed of
          True  -> "("
          False ->
            case (,) _ret_grouping (_ret_capturing && wc==InclCaptures) of
              (False,False) -> ""
              (True ,False) -> if rty==PCRE then "(?:" else "("
              (False,True ) -> "("
              (True ,True ) -> "("

        xket =
          case not _ret_grouping && _ret_capturing && wc==ExclCaptures of
            True  -> delete_ket 0
            False -> id

delete_ket :: Int -> String -> String
delete_ket _ "" = error "delete_ket: end of input"
delete_ket n (c:t) = case c of
  '\\' -> case t of
    ""    -> error "delete_ket: end of input"
    c':t' -> c : c' : delete_ket n t'
  ')'  -> case n of
    0  -> t
    _  -> c : delete_ket (n-1) t
  '('  -> c : delete_ket (n+1) t
  _    -> c : delete_ket  n    t
\end{code} scan_re ------- \begin{code}
scan_re :: RegexSource -> [REToken]
scan_re (RegexSource src0) = loop src0
  where
    loop ""  = []
    loop src =
        case rst of
          '\\':t -> case t of
              ""    -> REToken (ini++['\\'])    False False False : []
              c':t' -> REToken (ini++['\\',c']) False False False : loop t'
          '(' :t -> case t of
            c:':':t'
              | c=='?'  -> REToken  ini False True  False : loop t'
              | c=='}'  -> REToken  ini False False True  : loop t'
              | c==']'  -> REToken  ini False True  True  : loop t'
            _           -> REToken  ini True  True  True  : loop t
          _ -> [REToken src False False False]
      where
        (ini,rst) = break chk src

        chk '\\'  = True
        chk '('   = True
        chk _     = False
\end{code} scan_re ------- \begin{code}
match_tdfa :: String -> MacroEnv -> MacroDescriptor -> Matches String
match_tdfa txt env md = txt TDFA.=~ mdRegexSource TDFA ExclCaptures env md

match_pcre :: String -> MacroEnv -> MacroDescriptor -> Matches String
match_pcre txt env md = txt PCRE.=~ mdRegexSource PCRE ExclCaptures env md
\end{code} mdRegexSource ------------- \begin{code}
mdRegexSource :: RegexType
              -> WithCaptures
              -> MacroEnv
              -> MacroDescriptor
              -> String
mdRegexSource rty wc env md =
    expandMacros' lu $ regexSource rty wc $ _md_source md
  where
    lu  = fmap (regexSource rty wc . _md_source) .
            flip HML.lookup env
\end{code} test', test_neg' ---------------- \begin{code}
test' :: (Eq a,Show a)
      => MacroID
      -> RegexType
      -> (Match String->Maybe a)
      -> a
      -> Matches String
      -> [TestResult]
test' mid rty prs x Matches{..} = either (:[]) (const []) $ do
    cs <- case allMatches of
      [cs] -> return cs
      _    -> oops "RE failed to parse"
    mtx <- case matchCapture cs of
      Nothing -> oops $ "RE parse failure: " ++ show cs
      Just c  -> return $ capturedText c
    case mtx == matchesSource of
      True  -> return ()
      False -> oops "RE failed to match the whole text"
    x' <- case prs cs of
      Nothing -> oops "matched text failed to parse"
      Just x' -> return x'
    case x'==x of
      True  -> return ()
      False -> oops "parser failed to yield the expected result"
  where
    oops = Left . test_diagnostic mid False rty matchesSource

test_neg' :: MacroID
          -> RegexType
          -> (Match String->Maybe a)
          -> Matches String
          -> [TestResult]
test_neg' mid rty prs Matches{..} = either id (const []) $ do
    case allMatches of
      [] -> return ()
      cz -> case ms of
          [] -> return ()
          _  -> Left [oops "RE parse succeeded"]
        where
          ms =
            [ ()
              | cs     <- cz
              , Just c <- [matchCapture cs]
              , let t = capturedText c
              , t == matchesSource
              , isJust $ prs cs
              ]

  where
    oops = test_diagnostic mid True rty matchesSource

test_diagnostic :: MacroID
                -> Bool
                -> RegexType
                -> String
                -> String
                -> TestResult
test_diagnostic mid is_neg rty tst msg =
    TestResult $
      printf "%-20s [%s %s] : %s (%s)" mid_s neg_s rty_s msg tst
  where
    mid_s = _MacroID mid
    neg_s = if is_neg then "-ve" else "+ve" :: String
    rty_s = show rty
\end{code}