-- |
-- Module      : BenchGraph
-- Copyright   : (c) 2017 Composewell Technologies
--
-- License     : BSD3
-- Maintainer  : harendra.kumar@gmail.com
-- Stability   : experimental
-- Portability : GHC
--
-- BenchGraph generates a graph from benchmarking results (CSV) generated by
-- @gauge@ or @criterion@, specifically, it generates comparative graphs for
-- several groups of benchmarks that can be compared.  The groups could
-- correspond to different packages providing similar and comparable operations
-- or even different versions of the same package. This is a convenient tool to
-- compare performance results of alternative libraries or versions of the same
-- package after you make a change that may impact the performance benchmarks
-- of the package.
--
-- The input is the CSV file generated by @gauge --csv=results.csv@ or a
-- similar output generated by @criterion@. You need to invoke the 'bgraph'
-- function with an appropriate 'Config' to control various parameters of graph
-- generation.
-- Benchmark results found in the CSV file can be classified into several
-- groups using a classifier function and each group is displayed side by side
-- in the graph on the same scale for comparison.  The constituent benchmarks
-- in each benchmark group are placed together as a group and a legend is
-- displayed to mark who is who.
--
-- See the @test@ directory for an example of how to use it.
-- A sample output can be found in the @sample-charts@ directory.

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}

module BenchGraph
    ( ComparisonStyle(..)
    , Config(..)
    , defaultConfig
    , bgraph
    ) where

import Control.Arrow (second)
import Control.Monad (when)
import Control.Monad.Trans.State.Lazy (get, put)
import Data.Char (toUpper)
import Data.Function ((&), on)
import Data.List (nub, nubBy, transpose, findIndex, groupBy, (\\), group, sort)
import Data.Maybe (catMaybes, fromMaybe, maybe)
import Debug.Trace (trace)
import System.Directory (createDirectoryIfMissing)
import System.FilePath ((</>))
import Text.CSV (CSV, parseCSVFromFile)
import Text.Read (readMaybe)

import Graphics.Rendering.Chart.Easy
import Graphics.Rendering.Chart.Backend.Diagrams

-- Utilities
-- Find items that are repeated more than once
getRepeated :: Ord a => [a] -> [a]
getRepeated = map head . filter ((>1) . length) . group . sort

-- | How to show the comparisons among benchmark groups.
--
-- @since 0.1.0
data ComparisonStyle =
      CompareFull  -- ^ Show full results for all groups
    | CompareDelta -- ^ Show the first group with full results, show delta
                   -- from the first group for the subsequent groups.
    deriving Eq

-- | Configuration governing generation of chart.
--
-- @since 0.1.0
data Config = Config
    {
    -- | The directory where the output graph should be placed.
      outputDir   :: FilePath
    -- | The title to be embedded in the generated graph.
    , chartTitle  :: Maybe String
    -- | User supplied function that translates a benchmark name into a tuple
    -- @(groupname, benchname)@, where @groupname@ is the name of the group the
    -- benchmark should be placed in and @benchname@ is the translated
    -- benchmark name to be displayed on the graph.  If it returns 'Nothing'
    -- the benchmark is omitted from the results.
    , classifyBenchmark :: String -> Maybe (String, String)
    -- XXX need ability to sort in ascending order of bar heights for a
    -- particular group. For that we will need access to the full data.
    -- | User supplied function to sort or reorder the list of benchmark names
    -- generated by 'classifyBenchmark'. These are the benchmarks to be plotted
    -- for each benchmark group.
    , sortBenchmarks :: [String] -> [String]
    -- | User supplied function to sort or reorder the benchmark group names
    -- generated by 'classifyBenchmark'.
    , sortBenchGroups :: [String] -> [String]
    -- XXX ability to sepcify different units e.g. Milliseconds etc.
    -- | @(RangeMin, RangeMax, NumIntervals)@ of the plot on the @y@ (time)
    -- axis in microseconds.
    , setYScale :: Maybe (Double, Double, Int)
    -- | How to show the comparisons.
    , comparisonStyle :: ComparisonStyle
    }

-- | Default configuration. Use this as the base configuration and modify the
-- required fields. The defaults are:
--
-- @
--  outputDir         = "."
--  chartTitle        = Nothing
--  classifyBenchmark = \b -> Just ("default", b)
--  sortBenchmarks    = id
--  sortBenchGroups   = id
--  setYScale         = Nothing
--  comparisonStyle   = 'CompareFull'
-- @
--
-- @since 0.1.0
defaultConfig :: Config
defaultConfig = Config
    { outputDir         = "."
    , chartTitle        = Nothing
    , classifyBenchmark = \b -> Just ("default", b)
    , sortBenchmarks    = id
    , sortBenchGroups   = id
    , setYScale         = Nothing
    , comparisonStyle   = CompareFull
    }

-- "values" is [(benchGroupName, [benchResult])]
-- benchResult contains results for each benchmark in "benchNames" in exactly
-- the same order.
genGroupGraph
    :: FilePath
    -> String
    -> Maybe [Double]
    -> Config
    -> [String]
    -> [(String, [Maybe Double])]
    -> IO ()
genGroupGraph outputFile units yindexes Config{..} benchNames values = do
    toFile
        def ((outputDir </> outputFile) ++ ".svg") $ do
        case chartTitle of
            Just title -> do
                layout_title .= title
                layout_title_style . font_size .= 25
            Nothing -> return ()

        -- woraround for a bug that renders the plot badly when using a single
        -- cluster in the bar chart.
        let vals =
                if length values == 1
                then values ++ [([], [])]
                else if comparisonStyle == CompareDelta
                     then let (_, h) = head values
                              toDelta x1 x2 =
                                case x1 of
                                    Nothing -> Nothing
                                    Just v -> fmap (\v1 -> v1 - v) x2
                              convertTail (name, xs) = (name ++ "(-base)", zipWith toDelta h xs)
                              convertHead (name, xs) = (name ++ "(base)", xs)
                          in convertHead (head values) : map convertTail (tail values)
                     else values

        layout_x_axis . laxis_generate .= autoIndexAxis (map fst vals)
        layout_x_axis . laxis_style . axis_label_style . font_size .= 16
        layout_y_axis . laxis_style . axis_label_style . font_size .= 14

        layout <- get
        case _layout_legend layout of
            Nothing -> return ()
            Just style@LegendStyle{..} -> do
                let s = style { _legend_plot_size = 22
                              -- , _legend_margin = 40
                              -- This is not available in versions <= 1.8.2
                              -- , _legend_position = LegendBelow
                              , _legend_label_style = _legend_label_style
                                    { _font_size = 14 }
                              }
                put $ layout { _layout_legend = Just s }

        -- layout_y_axis . laxis_override .= axisGridAtTicks
        let modifyLabels ad = ad {
                _axis_labels = map (map (second (++ " " ++ units)))
                                   (_axis_labels ad)
            }
        layout_y_axis . laxis_override .= modifyLabels

        case yindexes of
            Nothing -> return ()
            Just indexes ->
                layout_y_axis . laxis_override .= \_ ->
                    makeAxis (let f = floor :: Double -> Int
                              in map ((++ " " ++ units) . show . f))
                             (indexes, [], [])

        -- XXX We are mapping a missing value to 0, can we label it missing
        -- instead?
        let modifyVal x = map (fromMaybe 0) (snd x)
        plot $ fmap plotBars $ bars benchNames (addIndexes (map modifyVal vals))

-- [[Double]] each list is multiple results for each benchmark
transposeLists :: Show a => String -> [[a]] -> Maybe [[Maybe a]]
transposeLists gname xs =
    -- If each benchmark does not have the same number of results then reject
    -- all because the results may not correspond with each other when zipped.
    case nub $ map length xs of
        [0] -> trace ("Warning! " ++ gname ++ ": No results") Nothing
        [n] -> -- all lists are of the same length 'n'
            let ys = map (convertToMaybe n) xs
            in Just $ transpose ys
        [0,n] -> -- some lists are empty others are of the same length 'n'
            -- some packages may have missing benchmarks
            -- fill the empty results with Nothing
            let ys = map (convertToMaybe n) xs
            in Just $ transpose ys
        _ -> trace ("Warning! " ++ gname
                ++ ": has multiple runs with different number of benchmarks\n"
                ++ show xs) Nothing
    where
        convertToMaybe n zs = case zs of
            [] -> replicate n Nothing
            x  -> map Just x

-- We return a list of lists as the same benchmark may appear more than once if
-- we ran benchmarks for the same package multiple times. This is helpful in
-- comparing the benchmarks for the same package after fixing something.
getResultsForBenchGroup
    :: CSV
    -> (String -> Maybe (String, String))
    -> String
    -> [String]
    -> Maybe [[Maybe Double]]
getResultsForBenchGroup csvData classify groupName bmnames  =
    -- XXX this can be quite inefficient, need to do it better
    -- the map results in a list of lists. Where each inner list consists of
    -- multiple values belonging to the same benchmark, each value is from a
    -- different benchmark run. We transpose the values to get all benchmark
    -- values from a single run together.
    transposeLists groupName $ map getBenchmarkValues bmnames

    where

    match name origName =
        case classify origName of
            Nothing -> False
            Just (g, n) -> g == groupName && n == name

    getBenchmarkValues :: String -> [Double]
    getBenchmarkValues bmname =
        -- Filter all the lines from the CSV data that belong to the given
        -- benchmark group and bmname and read the field value at index 1.
        -- Usually the resulting list will have a single element. However, when
        -- we have multiple runs with the same benchgroup name then we may have
        -- multiple elements in the resulting list.
        map (\(_, y) -> read y)
            $ map (\xs -> (xs !! 0, xs !! 1))
            $ filter (match bmname .  head) csvData

genGraph :: FilePath -> String -> Maybe [Double] -> Config -> CSV -> IO ()
genGraph outfile units yindexes cfg@Config{..} csvData = do
    let origNames = nub $ map head csvData
        bmTuples = catMaybes $ map classifyBenchmark origNames

    let origGroups = nub $ map fst bmTuples
        bmgroups = sortBenchGroups origGroups
    when (bmgroups == []) $ error
        "No benchmark groups to plot. Please check your benchmark \
        \classifier (classifyBenchmarks), group filter (sortBenchGroups) or \
        \the input data"

    let newGroups = bmgroups \\ origGroups
    when (newGroups /= []) $ error $
        "sortBenchGroups cannot add new groups to the original list. The\
        \following new groups were added: " ++ show newGroups

    let rep = getRepeated bmTuples
        z = zip origNames bmTuples
        zrep = filter (\(_, tup) -> tup `elem` rep) z
    when (zrep /= []) $ do
        error $
            "classifyBenchmark cannot map different benchmarks to the same \
            \name under the same group.\n"
            ++ unlines (map show zrep)

    let names = nub $ map snd bmTuples
        bmnames = sortBenchmarks names
    when (bmnames == []) $ error
        "No benchmark names to plot. Please check your benchmark \
        \classifier (classifyBenchmarks), filter (sortBenchmarks) or \
        \the input data"

    let newNames = bmnames \\ names
    when (newNames /= []) $ error $
        "sortBenchmarks cannot add new names to the original list. The\
        \following new names were added: " ++ show newNames

    mapM_ (checkBenchNameInGrps bmgroups bmTuples) bmnames

    -- this produces results for all groups
    -- Each group may have multiple results, in that case the group name is
    -- indexed e.g. streamly(1), streamly(2) etc.
    -- returns [(groupName, Maybe [Maybe Double])]
    let grpResults = concat $ map (grpGetResults bmnames) bmgroups
        filterJust (gName, res) =
            case res of
                Nothing -> do
                    putStrLn $ "Warning! no results found for benchmark group " ++ gName
                    return Nothing
                Just x -> return $ Just (gName, x)
    res0 <- mapM filterJust grpResults
    let res = catMaybes res0
    when (res == []) $ error
        "Each benchmark being plotted must have the same number of results \
        \in the CSV input"

    genGroupGraph outfile units yindexes cfg bmnames res

    where

    grpGetResults bmnames groupName =
        let res = getResultsForBenchGroup csvData classifyBenchmark
                                          groupName bmnames
        in case res of
            Nothing -> [(groupName, Nothing)]
            Just xs ->
                case length xs of
                    0 -> [(groupName, Nothing)]
                    1 -> map (groupName,) (map Just xs)
                    _ -> zipWith (withIndexes groupName) [(1::Int)..]
                                 (map Just xs)
    withIndexes groupName indx y = (groupName ++ "(" ++ show indx ++ ")", y)

    checkBenchNameInGrps bmgroups bmTuples nm =
        let appearsIn = nub $ map fst $ filter (\(_, n) -> nm == n) bmTuples
            xs = bmgroups \\ appearsIn
        in if not (null xs)
           then error $
            "Each benchmark name must appear in all benchmark groups.\n\
            \Benchmark " ++ nm ++ " does not appear in " ++ show xs ++
            "\nPlease check your benchmark classifier (classifyBenchmarks).\n"
           else return ()

eqCaseInsensitive :: String -> String -> Bool
eqCaseInsensitive a b = map toUpper a == map toUpper b

getFieldIndexInLine :: String -> (Int, [String]) -> Maybe (Int, Int)
getFieldIndexInLine fieldName (lineno, fields) =
    fmap (lineno,) $
        findIndex (\x -> eqCaseInsensitive x fieldName) fields

-- can return multiple indexes or empty list
getFieldIndexUnchecked :: String -> [(Int, [String])] -> [(Int, Int)]
getFieldIndexUnchecked fieldName csvlines =
    nubBy ((==) `on` snd) $
        catMaybes $ map (getFieldIndexInLine fieldName) csvlines

getFieldIndexChecked :: String -> [(Int, [String])] -> Int
getFieldIndexChecked fieldName csvlines =
    let idxs = getFieldIndexUnchecked fieldName csvlines
    in case idxs of
        [(_, x)] -> x
        [] -> error $ "Field name [" ++ fieldName
                      ++ "] does not occur in any line of "
                      ++ "the CSV input. Is the header line missing?"
        _ -> error $ "Field [" ++ fieldName
                     ++ "] found at different indexes [(line, column): " ++ show idxs
                     ++ "] in different lines of CSV input"

-- Find the indexes of benchmark name, iterations and the requested field to be
-- plotted, also remove the header lines from the csv input and return the
-- cleaned up lines.
-- Returns
-- ( index of the name field
-- , index of the iter field
-- , index of the field being plotted
-- , CSV lines without the header lines
-- )
--
findIndexes :: String
           -> String
           -> String
           -> [(Int, [String])]
           -> (Int, Maybe Int, Int, [(Int, [String])])
findIndexes benchmarkNameField iterationField fieldName csvlines =
    let fieldIdx = getFieldIndexChecked fieldName csvlines
        iterIdxs = getFieldIndexUnchecked iterationField csvlines
        nameIdxs = getFieldIndexUnchecked benchmarkNameField csvlines
        csvlines' = filter (\(_, xs) ->
            (not $ (xs !! fieldIdx) `eqCaseInsensitive` fieldName)) csvlines
    in case nameIdxs of
        [] -> case iterIdxs of
            [] -> -- just to show an error
                case getFieldIndexChecked benchmarkNameField csvlines of
                    _ -> error "not reached"
            _ ->
                -- Avoid a gauge/csvraw bug where the name field is not
                -- present
                ( 0
                , Just $ getFieldIndexChecked iterationField csvlines + 1
                , fieldIdx + 1
                , csvlines'
                )
        _ ->
            ( getFieldIndexChecked benchmarkNameField csvlines
            , case iterIdxs of
                [] -> Nothing
                _ -> Just $ getFieldIndexChecked iterationField csvlines
            , fieldIdx
            , csvlines'
            )

--  Keep the CSV columns corresponding to the provided indexes and remove the
--  unwanted columns.
extractIndexes :: (Int, Maybe Int, Int, [(Int, [String])]) -> [(Int, [String])]
extractIndexes (nameIdx, iterIdx, fieldIdx, csvlines) =
    let zipList = zipWith (\(l, xs) (_, ys) -> (l, xs ++ ys))
    in map (indexWithError nameIdx "name") csvlines
        `zipList`
            case iterIdx of
                Nothing  -> repeat (1, ["1"])
                Just idx -> map (indexWithError idx "iter") csvlines
        `zipList` map (indexWithError fieldIdx "requested") csvlines
    where

    indexWithError :: Int -> String -> (Int, [String]) -> (Int, [String])
    indexWithError idx colName (l, xs) =
        if (length xs < idx)
        then error $ "Line " ++ show l ++ " " ++ show colName
                     ++ " column at index " ++ show idx ++ " not present"
        else (l, [xs !! idx])

-- XXX display GHC version as well
-- XXX display the OS/arch
-- This data should be in the measurement data
--
-- TODO Specify the units of the field being plotted, this should be mentioned
-- in the CSV header.

-- | The first parameter is an input file containing CSV data as generated by
-- @gauge --csv=results.csv@ or a similar output generated by @criterion@.  The
-- second parameter is the name of the output file containing the graph SVG
-- image. The third parameter is the name of the field that should be plotted.
-- The field is matched with the fields in the header line of the CSV input
-- using a case insensitive match.  The last parameter is the configuration to
-- customize the graph, you can start with 'defaultConfig' as the base and set
-- any of the fields that you may want to change.
--
-- For example:
--
-- @
-- bgraph "results.csv" "Plot mean time" "mean" 'defaultConfig'
-- @
--
-- @since 0.1.0
bgraph :: FilePath -> FilePath -> String -> Config -> IO ()
bgraph inputFile outputFile fieldName cfg@Config{..} = do
    createDirectoryIfMissing True outputDir
    putStrLn $ "Creating chart "
        ++ maybe "" (\x -> "[" ++ x ++ "]") chartTitle
        ++ " at "
        ++ show (outputDir </> outputFile)

    -- We assume the dataset is not big and therefore take liberties to process
    -- in a non-streaming fashion.
    csvData <- parseCSVFromFile inputFile
    case csvData of
        Left e -> error $ show e
        Right csvlines -> do
            -- XXX make the multiplier and units configurable
            -- XXX Use a separate table for the defaults
            let isTimeField =
                    let x = map toUpper fieldName
                    in x == "TIME" || x == "MEAN"
            let isAllocationField =
                    let x = map toUpper fieldName
                    in x == "ALLOCATED" || x == "MAXRSS"
            -- By default the fields are considered "scaled" fields that is
            -- they scale by iterations. However in case of maxrss field it is
            -- a max value across the experiment and does not scale by
            -- iterations, in this case we just need to take a mean or max
            -- without scaling.
            let isMaxField =
                    let x = map toUpper fieldName
                    in x == "MAXRSS"

                (multiplier, units) =
                    case isTimeField of
                        -- XXX automatically use ns/us/ms/sec on the scale
                        -- get the max and convert it to appropriate unit
                        True -> (1000, "ms")
                        False -> case isAllocationField of
                            True -> (1/2^(20 :: Int), "MiB")
                            False -> (1, "")

                -- XXX need the ability to specify Units in the scale
                yindexes =
                    case setYScale of
                        Nothing -> Nothing
                        Just (rangeMin, rangeMax, nInterval) ->
                            let r = (rangeMax - rangeMin)/(fromIntegral nInterval)
                            in case isTimeField of
                                True ->
                                    let r'   = r/1000
                                        rmin = rangeMin/1000
                                    in Just $ take (nInterval + 1) [rmin, rmin + r'..]
                                False -> Just $ take (nInterval + 1) [rangeMin, rangeMin + r..]

                readWithError :: Read a => String -> String -> Int -> (Int, [String]) -> a
                readWithError fname typ idx (lno, xs) =
                    case readMaybe (xs !! idx) of
                        Nothing -> error $ "Cannot read " ++ show fname
                            ++ " field as a " ++ typ ++ " type at line number "
                            ++ show lno
                        Just n -> n

                -- xs is [(lineno, [iter, field])]
                foldToMean xs =
                    let iters  = map (readWithError "iter" "Double" 0) xs :: [Double]
                        values = map (readWithError "requested" "Double" 1) xs :: [Double]
                        mean = sum values / sum iters
                    in show $ mean * multiplier

                foldToMax xs =
                    let values = map (readWithError "requested" "Double" 1) xs :: [Double]
                    in show $ (maximum values) * multiplier

             in   -- Add line numbers for error reporting
                  zip [1..] csvlines
                  -- cleanup blank rows
                & filter (\(_,xs) -> xs /= [""])
                  -- An iteration field indicates that consecutive rows with
                  -- the same benchmark name have results from different
                  -- iterations of the same benchmark and the measurement
                  -- fields have to be scaled per iteration based on the number
                  -- of iterations in the iteration count field.
                & findIndexes "Name" "iters" fieldName
                & extractIndexes
                  -- from here on three elements are guaranteed in each row.
                  -- group successive iterations. If the iteration number does
                  -- not increase then it means it is another benchmark and not
                  -- the next iteration of the previous benchmark. This can
                  -- happen if there is only one benchmark in two separate
                  -- runs.
                & groupBy (\l1@(_, x:_) l2@(_, y:_) -> x == y
                    && ((readWithError "iter" "Int" 1 l2 :: Int) >
                        (readWithError "iter" "Int" 1 l1 :: Int)))
                  -- reduce grouped iterations to a single row with the mean.
                  -- xs below is a list of tuples [(lineno, [name, iter, field]]
                  -- we send [(lineno, [iter, field])] to foldToMean.
                & map (\xs -> [ head $ map (head . snd) xs
                              , if isMaxField
                                then foldToMax  $ map (\(l, ys) -> (l, tail ys)) xs
                                else foldToMean $ map (\(l, ys) -> (l, tail ys)) xs
                              ]
                      )
                 -- XXX send tuples [(String, Double)] instead of [[String]]
                 -- XXX determine the units based on the field name
                 -- We can pass here the units to be displayed by the chart
                & genGraph outputFile units yindexes cfg