-- |
-- Module      : BenchShow.Common
-- Copyright   : (c) 2018 Composewell Technologies
--
-- License     : BSD3
-- Maintainer  : harendra.kumar@gmail.com
-- Stability   : experimental
-- Portability : GHC
--

{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}

module BenchShow.Common
    ( Presentation(..)
    , GroupStyle(..)
    , FieldTick (..)
    , SortColumn (..)
    , RelativeUnit (..)
    , Estimator (..)
    , DiffStrategy (..)
    , Config(..)
    , defaultConfig

    , getFieldRange
    , getFieldTick

    , GroupMatrix(..)
    , prepareGroupMatrices

    , ReportColumn(..)
    , RawReport(..)
    , ReportType(..)
    , diffString
    , makeTitle
    , prepareToReport
    , reportComparingGroups
    , reportPerGroup
    ) where

import Control.Applicative (ZipList(..))
import Control.Arrow (second)
import Control.Exception (assert)
import Control.Monad (when, unless)
import Data.Char (toLower)
import Data.Foldable (foldl')
import Data.Function ((&), on)
import Data.List
       (transpose, groupBy, (\\), find, sortBy, elemIndex, intersect,
        intersectBy)
import Data.List.Split (linesBy)
import Data.Maybe (fromMaybe, mapMaybe)
import Data.Ord (comparing)
import Debug.Trace (trace)
import Statistics.Types (Estimate(..), ConfInt(..))
import System.Directory (createDirectoryIfMissing)
import System.FilePath ((</>))
import Text.CSV (CSV, parseCSVFromFile)
import Text.Read (readMaybe)

import BenchShow.Analysis

-------------------------------------------------------------------------------
-- Utilities
-------------------------------------------------------------------------------

filterSanity :: (Eq a, Show a) => String -> [a] -> [a] -> IO ()
filterSanity label old new = do
    let added = new \\ old

    when (null new) $ error $
        label ++ " must select at least one item from the list: "
        ++ show old

    unless (null added) $ error $
        label ++
        " must not add any new items to the original list. The \
        \following items were added: " ++ show added

-------------------------------------------------------------------------------

data ReportType = TextReport | GraphicalChart

-- | How to show the results for multiple benchmark groups presented in columns
-- or bar chart clusters.
--
-- @since 0.2.0
data GroupStyle =
      Absolute       -- ^ Show absolute field values for all groups
    | Diff        -- ^ Show baseline group values as usual and values for
                     -- the subsequent groups as differences from the baseline
    | Percent     -- ^ Show baseline group values as 100% and values for
                     -- subsequent groups as a percentage of the baseline
    | PercentDiff -- ^ Show baseline group values as usual and values for
                     -- subsequent groups as precentage difference from the
                     -- baseline
    deriving (Eq, Show)

-- | How to present the reports or graphs. Each report presents a number of
-- benchmarks as rows, it may have, (1) a single column presenting the values
-- for a single field, (2) multiple columns presenting values for different
-- fields, or (3) multiple columns presenting values of the same field for
-- different groups.
--
-- @since 0.2.0
data Presentation =
      Solo              -- ^ Reports are generated for each group and for
                        -- each field selected by the configuration. Each
                        -- report presents benchmarks in a single group with a
                        -- single column presenting a single field.  If there
                        -- are @m@ fields and @n@ groups selected by the
                        -- configuration then a total of @m x n@ reports are
                        -- generated.  Output files are named using
                        -- @-estimator-groupname-fieldname@ as suffix.
    | Groups GroupStyle -- ^ One report is generated for each field selected by
                        -- the configuration. Each report presents a field
                        -- with all the groups selected by the configuration as
                        -- columns or clusters. Output files are named using
                        -- @-estimator-fieldname@ as suffix.
    | Fields            -- ^ One report is generated for each group selected by
                        -- the configuration. Each report presents a group
                        -- with all the fields selected by the configuration as
                        -- columns or clusters. Output files are named using
                        -- @-estimator-groupname@ as suffix.
    deriving (Eq, Show)

-- | FieldTick is used only in visual charts to generate the major ticks on
-- the y-axis. You can specify either the size of a tick ('TickSize') or the
-- total number of ticks ('TickCount').
--
-- @since 0.2.0
data FieldTick =
      TickSize Int  -- ^ Size of a tick, the unit is microseconds for time
                     -- fields, and bytes for space fields.
    | TickCount Int -- ^ Total number of ticks in the range spread.

-- | When sorting and filtering the benchmarks using 'selectBenchmarks' we can
-- choose a column as a sort criterion.  'selectBenchmarks' is provided with
-- the data for the corresponding column which can be used for sorting the
-- benchmarks. The column could be a group or a field depending on the
-- 'Presentation'.
--
-- @since 0.2.0
data SortColumn =
      ColumnIndex Int -- ^ Specify the index of the sort column. Index 0
        -- corresponds to the first @value@ column. In a textual report, the
        -- very first column consists of benchmark names, therefore index 0
        -- addresses the second column of the report.
    | ColumnName (Either String (String, Int)) -- ^ Specify the column using
        -- the name of the group or the field it represents, and the @runId@.
        -- When just the name is enough to uniquely identify the sort column
        -- the 'Left' constructor can be used, otherwise the 'Right'
        -- constructor is used which can use the @runId@ to disambiguate.  In a
        -- 'Fields' presentation, just the field name is enough.  In a 'Groups'
        -- presentation, when there is a single benchmark run in the input
        -- file, just the group name is enough to identify the group, the
        -- @runId@ defaults to 0.  However, when there are multiple runs, a
        -- group needs to specify a @runId@ as well.

-- | Strategy to compute the difference between two groups of benchmarks being
-- compared.
--
-- @since 0.2.0
data DiffStrategy =
      SingleEstimator -- ^ Use a single estimator to compute the difference
                      -- between the baseline and the candidate. The estimator
                      -- that is provided in the 'Config' is used.
    | MinEstimator    -- ^ Use 'Mean', 'Median' and 'Regression' estimators for
                      -- both baseline and candidate, and report the estimator
                      -- that shows the minimum difference. This is more robust
                      -- against random variations.
    {-
    | WorstBest
    | BestBest
    -}

-- | Configuration governing generation of chart. See 'defaultConfig' for the
-- default values of these fields.
--
-- @since 0.2.0
data Config = Config
    {
    -- | Provide more details in the report, especially the standard deviation,
    -- outlier variance, R-square estimate and an annotation to indicate the
    -- actual method used when using 'MinEstimator' are reported.
      verbose :: Bool

    -- | The directory where the output graph or report file should be placed.
    , outputDir   :: Maybe FilePath

    -- | Report title, more information like the plotted field name or
    -- the presentation style may be added to it.
    , title  :: Maybe String

    -- | How to determine the layout of the report or the chart.
    , presentation :: Presentation

    -- | The estimator used for the report.
    , estimator    :: Estimator

    -- | The minimum percentage difference between two runs of a benchmark
    -- beyond which the benchmark is flagged to have regressed or improved.
    , threshold :: Word

    -- | Strategy to compare two runs or groups of benchmarks.
    , diffStrategy  :: DiffStrategy

    ---------------------------------------------------------------------------
    -- Fields (Columns)
    ---------------------------------------------------------------------------

    -- | Filter and reorder the benchmarking fields. It is invoked with a list
    -- of all available benchmarking fields. Only those fields present in the
    -- output of this function are plotted and in that order.
    , selectFields :: [String] -> [String]

    -- | The values in the tuple are @(fieldName, RangeMin, RangeMax)@.
    -- Specify the min and max range of benchmarking fields. If the field
    -- value is outside the range it is clipped to the range limit.
    -- For time fields, the range values are in microseconds, and for space
    -- fields they are in bytes. The minimum of the range is used to determine
    -- the unit for the field.
    , fieldRanges :: [(String, Double, Double)]

    -- | The values in the tuple are @(fieldName, tick)@.  Specify the
    -- tick size of the fields to be used for the graphical reports.
    , fieldTicks :: [(String, FieldTick)]

    ---------------------------------------------------------------------------
    -- Groups (Row Grouping)
    ---------------------------------------------------------------------------

    -- | Filter, group and translate benchmark names. This function is invoked
    -- once for all benchmark names found in the results. It produces a tuple
    -- @(groupname, benchname)@, where @groupname@ is the name of the group the
    -- benchmark should be placed in, and @benchname@ is the translated
    -- benchmark name to be used in the report.  If it returns 'Nothing' for a
    -- benchmark, that benchmark is omitted from the results.
    , classifyBenchmark :: String -> Maybe (String, String)

    -- | Filter and reorder the benchmark group names. A benchmark group may be
    -- assigned using 'classifyBenchmark'; when not assigned, all benchmarks
    -- are placed in the @default@ group. The input to this function is a list
    -- of tuples with benchmark group names and the @runId@s.  The output
    -- produced by this function is a filtered and reordered subset of the
    -- input.  Only those benchmark groups present in the output are rendered
    -- and are presented in that order.
    , selectGroups :: [(String, Int)] -> [(String, Int)]

    ---------------------------------------------------------------------------
    -- Benchmarks (Rows)
    ---------------------------------------------------------------------------

    -- | Filter and reorder benchmarks. 'selectBenchmarks' is provided with a
    -- function which is invoked with a sorting column name or index, the
    -- function produces the benchmark names and corresponding values for that
    -- column which can be used as a sorting criterion. The output of
    -- 'selectBenchmarks' is a list of benchmarks in the order in which they
    -- are to be rendered.
    , selectBenchmarks
        :: (SortColumn -> Either String [(String, Double)])
        -> [String]
    }

-- | Default configuration. Use this as the base configuration and modify the
-- required fields. The defaults are:
--
-- @
--  verbose           = False
--  title             = Nothing
--  outputDir         = Nothing
--  presentation      = Groups Absolute
--  estimator         = Median
--  threshold         = 3
--  diffStrategy      = MinEstimator
--  selectFields      = filter (flip elem ["time", "mean", "maxrss"] . map toLower)
--  fieldRanges       = []
--  fieldTicks        = []
--  classifyBenchmark = Just . ("default",)
--  selectGroups      = id
--  selectBenchmarks  = \f -> either error (map fst) $ f (ColumnIndex 0)
-- @
--
-- @since 0.2.0
defaultConfig :: Config
defaultConfig = Config
    { verbose           = False
    , title             = Nothing
    , outputDir         = Nothing
    , presentation      = Groups Absolute
    , estimator         = Median
    , threshold         = 3
    , diffStrategy      = MinEstimator
    , selectFields      = filter (flip elem ["time", "mean", "maxrss"] . map toLower)
    , fieldRanges       = []
    , fieldTicks        = []
    , classifyBenchmark = Just . ("default",)
    , selectGroups      = id
    , selectBenchmarks  = \f -> either error (map fst) $ f (ColumnIndex 0)
    }

-------------------------------------------------------------------------------
-- Benchmarking field types
-------------------------------------------------------------------------------

timeFields :: [String]
timeFields = map (map toLower)
    [ "time"
    , "mean"
    , "cpuTime"
    , "utime"
    , "stime"
    , "mutatorWallSeconds"
    , "mutatorCpuSeconds"
    , "gcWallSeconds"
    , "gcCpuSeconds"
    ]

isTimeField :: String -> Bool
isTimeField fieldName = map toLower fieldName `elem` timeFields

allocFields :: [String]
allocFields = map (map toLower) ["allocated", "bytesCopied", "maxrss"]

isAllocationField :: String -> Bool
isAllocationField fieldName = map toLower fieldName `elem` allocFields

predictorFields :: [String]
predictorFields = map (map toLower)
    [ "iters"
    -- , "minflt"
    -- , "majflt"
    -- , "nvcsw"
    -- , "nivcsw"
    ]

isPredictorField :: String -> Bool
isPredictorField fieldName = map toLower fieldName `elem` predictorFields

-------------------------------------------------------------------------------
-- Units
-------------------------------------------------------------------------------

-- | Describe a relative unit i.e. a unit in terms of another unit. A relative
-- unit has a label and a ratio which when multiplied with the unit gives us
-- the other unit. For example, if the known time unit is seconds, we can
-- describe a millisecond as @Unit "ms" (1/1000)@.
data RelativeUnit = RelativeUnit String Double deriving Show

getTimeUnit :: Double -> RelativeUnit
getTimeUnit k
    | k < 0      = getTimeUnit (-k)
    | k >= 1     = RelativeUnit "s" 1
    | k >= 1e-3  = RelativeUnit "ms" 1e-3
    | k >= 1e-6  = RelativeUnit "μs" 1e-6
    | otherwise  = RelativeUnit "ns" 1e-9

getSpaceUnit :: Double -> RelativeUnit
getSpaceUnit k
    | k < 0             = getSpaceUnit (-k)
    | k >= 2^(30 ::Int) = RelativeUnit "GiB" (2^(30 :: Int))
    | k >= 2^(20 ::Int) = RelativeUnit "MiB" (2^(20 :: Int))
    | k >= 2^(10 ::Int) = RelativeUnit "KiB" (2^(10 :: Int))
    | otherwise         = RelativeUnit "Bytes" 1

getUnitByFieldName :: String -> Double -> RelativeUnit
getUnitByFieldName fieldName fieldMin =
    case isTimeField fieldName of
        True -> getTimeUnit fieldMin
        False -> case isAllocationField fieldName of
            True -> getSpaceUnit fieldMin
            False -> RelativeUnit "" 1

-- returns (multiplier, units)
fieldUnits :: String -> Double -> GroupStyle -> RelativeUnit
fieldUnits fieldName fieldMin style =
    case style of
        Percent      -> RelativeUnit "%" 1
        PercentDiff  -> RelativeUnit "%" 1
        _ -> getUnitByFieldName fieldName fieldMin

-------------------------------------------------------------------------------
-- Comparison
-------------------------------------------------------------------------------

absoluteDiff :: Num a => a -> a -> a
absoluteDiff v1 v2 = v2 - v1

percentDiff :: (Fractional a, Num a) => a -> a -> a
percentDiff v1 v2 = ((v2 - v1) * 100) / v1

percent :: (Fractional a, Num a) => a -> a -> a
percent v1 v2 = (v2 * 100) / v1

cmpTransformColumns :: ReportType
                    -> GroupStyle
                    -> Estimator
                    -> DiffStrategy
                    -- XXX we do not really need the benchmark name here
                    -> [[(String, AnalyzedField)]]
                    -> (Maybe [[Estimator]], [[(String, Double)]])
cmpTransformColumns rtype style estimator diffStrategy cols =
    let cmpWith diff =
            let firstCol = head columns
                colTransform col =
                    let mkDiff (n1, v1) (n2,v2) =
                            verify (n1 == n2) (n2, diff v1 v2)
                    in zipWith mkDiff firstCol col
            in map colTransform (tail columns)

        cmpMinWith diff =
            let firstCol = head cols
                colTransform col = zipWith (mkMinDiff diff) firstCol col
            in map colTransform (tail cols)
    in case style of
            Absolute    -> (Nothing, columns)
            Percent     -> (Nothing, cmpWith percent)
            Diff        ->
                case diffStrategy of
                    MinEstimator ->
                        let (ests, vals) = unzip $ map unzip (cmpMinWith absoluteDiff)
                        in ( Just $ map (const estimator) (head cols) : ests
                           , head columns : vals
                           )
                    SingleEstimator ->
                        (Nothing, head columns : cmpWith absoluteDiff)
            PercentDiff ->
                -- In a comparative graphical chart we cannot show the absolute
                -- values in the baseline column as the units won't match for
                -- the baseline and the diff clusters.
                let baseCol =
                        case rtype of
                            TextReport -> head columns
                            GraphicalChart | length columns == 1 ->
                                head columns
                            GraphicalChart ->
                                map (\(n,_) -> (n,100)) (head columns)
                in case diffStrategy of
                    MinEstimator ->
                        let (ests, vals) = unzip $ map unzip (cmpMinWith percentDiff)
                        in ( Just $ map (const estimator) (head cols) : ests
                           , baseCol : vals
                           )
                    SingleEstimator ->
                       (Nothing, baseCol : cmpWith percentDiff)
    where
        verify a b = if a then b else error "bug: benchmark names mismatch"
        transformVals = map (map (second (getAnalyzedValue estimator)))
        columns = transformVals cols

        -- Find which estimator gives us the minimum diff
        mkMinDiff diff (n1, v1) (n2,v2) = verify (n1 == n2) $
            let meanDiff = diff (getAnalyzedValue Mean v1)
                                (getAnalyzedValue Mean v2)
                medDiff = diff (getAnalyzedValue Median v1)
                               (getAnalyzedValue Median v2)
                regDiff = diff (getAnalyzedValue Regression v1)
                               (getAnalyzedValue Regression v2)
            in if abs medDiff <= abs meanDiff
               then if abs medDiff <= abs regDiff
                    then (Median, (n2, medDiff))
                    else (Regression, (n2, regDiff))
                else if abs meanDiff <= abs regDiff
                     then (Mean, (n2, meanDiff))
                     else (Regression, (n2, regDiff))

transformColumnNames :: GroupStyle -> [ReportColumn] -> [ReportColumn]
transformColumnNames _ [] = []
transformColumnNames style columns@(h:t) =
    let withDiff = colSuffix baseName h : map (colSuffix diffName) t
    in case style of
            Diff        | length columns > 1 -> withDiff
            PercentDiff | length columns > 1 -> withDiff
            _           -> columns

    where
    colSuffix xl col = col { colName = xl (colName col) }
    baseName        = (++ "(base)")
    diffName        = (++ "(-base)")

-- Represents the data for a single benchmark run
data GroupMatrix = GroupMatrix
    { groupIndex :: Int
    , groupName   :: String
    , groupBenches :: [(String, String)] -- (newname, origname)
    , groupMatrix :: BenchmarkMatrix
    } deriving Show

-- Each run may be split into multiple groups of benchmarks.  Benchmarks can be
-- renamed by the classifier. Sanity checks:
-- Two original benchmarks cannot map to the same target benchmark
--
-- When using a comparative style report, after filtering and sorting:
-- Same original benchmark cannot belong to multiple groups
-- All groups must have exactly the same benchmark names
splitGroup :: (String -> Maybe (String, String))
           -> (Int, BenchmarkMatrix)
           -> [GroupMatrix]
splitGroup classify (serial, matrix@BenchmarkMatrix{..}) =
      mapMaybe (\x -> fmap (,x) $ classify x) (map fst rowValues)
    & sortBy (comparing (fst . fst))
    & groupBy ((==) `on` (fst . fst))
    & map (foldr foldGroup ("",[]))
    & map sanityCheckGroup
    & map (\(name, benches) ->
        GroupMatrix
        { groupIndex  = serial
        , groupName    = name
        , groupBenches = benches
        , groupMatrix  = matrix
        })

    where

    foldGroup ((grp, bench), srcBench) (_, tuples) =
        (grp, (bench, srcBench) : tuples)

    sanityCheckGroup orig@(grp, tuples) =
        let duplicated =
                  sortBy (comparing fst) tuples
                & groupBy ((==) `on` fst)
                & filter ((>1) . length)
        in if not $ null duplicated
           then
            let msg = unlines (map show duplicated)
            in error $ "Two benchmarks must not map to the same target \
               \benchmark. Please check your 'classifyBenchmark' operation. \
               \In group " ++ show grp ++ ", the following target benchmarks \
               \are mapped to more than one source benchmarks:\n" ++ msg
            else orig

-------------------------------------------------------------------------------
-- sort the benchmark groups
-------------------------------------------------------------------------------

findGroup :: [GroupMatrix] -> (String, Int) -> Maybe GroupMatrix
findGroup matrices (name, i) =
    find (\x -> groupName x == name && groupIndex x == i) matrices

sortGroups :: Config -> [GroupMatrix] -> IO [GroupMatrix]
sortGroups Config{..} matrices = do
    let origGroups = map (\x -> (groupName x, groupIndex x)) matrices
        newGroups = selectGroups origGroups

    filterSanity "selectGroups" origGroups newGroups
    return $ mapMaybe (findGroup matrices) newGroups

-------------------------------------------------------------------------------
-- sort the benchmarks
-------------------------------------------------------------------------------

extractColumn :: String -> GroupMatrix -> [(String, AnalyzedField)]
extractColumn field GroupMatrix{..} =
    let idx = elemIndex field (colNames groupMatrix)
        vals = case idx of
            Just i -> map (!! i) (map snd (rowValues groupMatrix))
            Nothing -> error $ "Field [" ++ field
                ++ "] does not exist in group ["
                ++ groupName ++ "] and run id [" ++ show groupIndex ++ "]"
    in zip (map fst groupBenches) vals

extractColumnValue :: String -> GroupMatrix -> Estimator -> [(String, Double)]
extractColumnValue field matrix estimator =
    map (second (getAnalyzedValue estimator)) $ extractColumn field matrix

benchmarkCompareSanity :: [String] -> GroupMatrix -> [String]
benchmarkCompareSanity benchmarks GroupMatrix{..} = do
    let benches = map fst groupBenches
    let absent = benchmarks \\ benches
    let msg =
            "selectBenchmarks: Group [" ++ groupName ++ "] run id ["
            ++ show groupIndex
            ++ "] does not contain the following selected benchmarks; \
            \ignoring them: " ++ show absent
            ++ "\nAvailable benchmarks in this group are: "
            ++ show benches

    if (null absent)
    then benchmarks
    else trace msg (benchmarks \\ absent)

selectBenchmarksByField :: Config
                        -> [GroupMatrix]
                        -> [[(String, Double)]]
                        -> [String]
selectBenchmarksByField Config{..} matrices columns =
    let bmnames = selectBenchmarks extractGroup
    in if (null bmnames)
       then error $ "selectBenchmarks must select at least one benchmark"
       else
           -- XXX instead of matrices we can just use columns here
           let xs = foldl benchmarkCompareSanity bmnames matrices
           in if (null xs)
              then error $ "selectBenchmarks: none of the selected benchmarks "
                    ++ show bmnames
                    ++ " is common to all the benchmark groups "
                    ++ show grpNames
              else xs

    where

    grpNames =
        let getName x = (groupName x, groupIndex x)
        in map getName matrices

    -- columns are benchmark groups in this case
    extractGroup (ColumnName (Left name)) =
            let len = length columns
            in if len <= 1
               then extractGroup $ ColumnName (Right (name, 0))
               else Left $ "selectBenchmarks: there are " ++ show len
                    ++ " runs in the input data, please specify the run \
                    \index [0-" ++ show (len - 1)
                    ++ "] along with the group name."
    extractGroup (ColumnName (Right (name, runId))) =
            extractColumnByGroupName name runId
    extractGroup (ColumnIndex n) = extractColumnByGroupIndex n

    -- The benchmark field is constant.  Extract all benchmark values for the
    -- given field and for the given group.
    findColumnIndex mxs (name, runId) =
        let foldFunc res@(idx, found) grp =
                case found of
                    False ->
                        if groupName grp == name && groupIndex grp == runId
                        then (idx, True)
                        else (idx + 1, found)
                    True -> res
        in foldl foldFunc (0, False) mxs

    extractColumnByGroupName name runId =
            case findColumnIndex matrices (name, runId) of
                (_, False) -> Left $ "Benchmark group name [" ++ name
                            ++ "] and index [" ++ show runId
                            ++ "] not found. Available groups are: "
                            ++ show grpNames
                (i, True) -> extractGroup (ColumnIndex i)

    extractColumnByGroupIndex idx =
        let len = length columns
        in if idx >= len
           then Left $ "Column index must be in the range [0-"
                ++ show (len - 1) ++ "]"
           else Right $ columns !! idx

selectBenchmarksByGroup :: Config -> GroupMatrix -> [String]
selectBenchmarksByGroup Config{..} grp@GroupMatrix{..} =
    -- XXX this is common to ByField and ByGroup
    let bmnames = selectBenchmarks extractField
    in if (null bmnames)
       then error $ "selectBenchmarks must select at least one benchmark"
       else bmnames

    where

    -- columns are benchmark fields in this case
    extractField (ColumnName (Left name)) = extractColumnByFieldName name
    extractField (ColumnName (Right (name, _))) =
        -- XXX runId does not make sense for fields
        extractColumnByFieldName name
    extractField (ColumnIndex n) = extractColumnByFieldIndex n

    -- The benchmark field is constant.  Extract all benchmark values for the
    -- given field and for the given group.
    extractColumnByFieldName name =
        let fields = colNames groupMatrix
        in case elem name fields of
            False -> Left $ "Benchmark field name [" ++ name
                        ++ "] not found in group ["
                        ++ groupName ++ "]. Available fields are: "
                        ++ show fields
            True -> Right $ extractColumnValue name grp estimator

    extractColumnByFieldIndex idx =
        let fields = colNames groupMatrix
            len = length fields
        in if idx >= len
           then Left $ "Column index must be in the range [0-"
                ++ show (len - 1) ++ "]"
           else Right $ extractColumnValue (fields !! idx) grp estimator

type NumberedLines = [(Int, [String])]

sanityCheckCSV :: CSV -> NumberedLines
sanityCheckCSV csvlines | null csvlines = error $ "The input file is empty"
sanityCheckCSV csvlines =
    let headRow = head csvlines
        rowLen = length headRow
    in  if not $ "name" `elem` map (map toLower) headRow
        then error "No 'Name' column found in the CSV header line"
        else
          -- Add line numbers for error reporting
          zip [1..] csvlines
          -- cleanup blank rows
        & filter (\(_,xs) -> xs /= [""])

          -- make sure all lines are of the same size, So that we can transpose
          -- back and forth without losing information.
        & map (\x@(i,xs) ->
               if length xs == rowLen
               then x
               else error $ "Line number " ++ show i
                        ++ " in the input file is not of the same length as\
                            \ the header line"
              )

-- An iteration field indicates that consecutive rows with the same benchmark
-- name have results from different iterations of the same benchmark and the
-- measurement fields have to be scaled per iteration based on the number of
-- iterations in the iteration count field.
--
-- Make sure that "iters" and "name" are the first and second columns
ensureIterField :: ([String], [NumberedLines])  -> ([String], [NumberedLines])
ensureIterField (header, groups) =
    ( "iters" : "name" : filter isNotNameIter header
    , map reorderNameIter groups
    )

    where

    isNotNameIter x =
           map toLower x /= "name"
        && map toLower x /= "iters"

    notNameIters [] = True
    notNameIters (x:_) = isNotNameIter x

    nameNotFound = error "Name field is required in the csv file"

    reorderNameIter csvlines =
          unzip csvlines
        & second (header :)
        & second transpose
        & second reorder
        & second transpose
        & uncurry zip

        where

        reorder xs =
            let findField x = find (\(y:_) -> map toLower y == x)
                iterCol = replicate (length (head xs) - 1) "1"
            in   fromMaybe iterCol (fmap tail $ findField "iters" xs)
               : fromMaybe nameNotFound (fmap tail $ findField "name" xs)
               : map tail (filter notNameIters xs)

-- Only keep those fields that are passed to this function
-- Also, preserve any predictor fields for regression analysis
filterFields :: [String] -> BenchmarkIterMatrix -> BenchmarkIterMatrix
filterFields fieldNames BenchmarkIterMatrix{..} =
    BenchmarkIterMatrix
        { iterPredColNames = ["iters"] ++ filter isPredictorField iterRespColNames
        , iterRespColNames = filter isRequestedField iterRespColNames
        , iterRowValues = transform iterRowValues
        }

    where

    transform :: [(String, [([Double], [Double])])] -> [(String, [([Double], [Double])])]
    transform = map (\(name, tuples) ->
        let (ys, zs) = unzip tuples
            pcols = transpose (map Left iterPredColNames : map (map Right) ys)
            rcols = transpose (map Left iterRespColNames : map (map Right) zs)
            pcols' = pcols ++ filter isPredictor rcols
            rcols' = filter requested rcols
            pcols'' = map (map fromRt) $ tail $ transpose pcols'
            rcols'' = map (map fromRt) $ tail $ transpose rcols'
        in (name, zip pcols'' rcols''))

    fromRt (Right x) = x
    fromRt _ = error "bug"

    isRequestedField = (`elem` fieldNames)

    requested [] = True
    requested (Left x:_) = isRequestedField x
    requested _ = error "bug"

    isPredictor [] = True
    isPredictor (Left x:_) = isPredictorField x
    isPredictor _ = error "bug"

-- Split the file into different runs
-- return the header fields and list of runs without the header
splitRuns :: NumberedLines -> ([String], [NumberedLines])
splitRuns csvlines =
    let header = snd $ head csvlines
        ls = linesBy (\x -> snd x == header) (tail csvlines)
    in (header, ls)

readWithError :: Read a => Int -> String -> (String, String) -> a
readWithError lno typ (fname, fval) =
    case readMaybe fval of
        Nothing -> error $ "Cannot read " ++ show fname
            ++ " field [" ++ show fval ++ "] as "
            ++ typ ++ " type at line number "
            ++ show lno
        Just n -> n

-- An iteration field indicates that consecutive rows with the same benchmark
-- name have results from different iterations of the same benchmark and the
-- measurement fields have to be scaled per iteration based on the number of
-- iterations in the iteration count field.
--
-- If the first column is iteration then fold all iterations and remove the
-- iteration column.
readIterations :: [String] -> NumberedLines -> BenchmarkIterMatrix
readIterations header csvlines =
    let tuples =
            map (parseNumericFields header) csvlines
            -- we now have a list of triples [(iter, name, (fieldName, [Double])]
          & groupBy successiveIters
          & map (foldl' addIters ("",[]))
    in BenchmarkIterMatrix
        { iterPredColNames = ["iters"]
        , iterRespColNames = drop 2 header
        , iterRowValues = tuples
        }

    where

    -- The first column is iters and the second is the name
    -- We zip the header for error reporting
    parseNumericFields hdr (lno, vals) = parseNumericTuples lno $ zip hdr vals

    parseNumericTuples lno (iter:(_,name):xs) =
          (readWithError lno "Int" iter :: Int
          , name
          , map (\x@(n,_) -> (n, readWithError lno "Double" x)) xs
                :: [(String, Double)]
          )
    parseNumericTuples _ _ = error "iters and name fields are needed"

    successiveIters (i1,name1,_) (i2,name2,_) = name2 == name1 && i2 > i1

    addIters (_,siters) (iter,name,vals) =
        (name, ([fromIntegral iter], map snd vals) : siters)

getFieldRange :: String -> Config -> Maybe (Double, Double)
getFieldRange fieldName Config{..} =
    let res = find (\(x, _, _) -> x == fieldName) fieldRanges
    in case res of
        Nothing -> Nothing
        Just (_, x, y) -> Just (x, y)

getFieldTick :: String -> Config -> Maybe FieldTick
getFieldTick fieldName Config{..} =
    fmap snd $ find (\x -> fst x == fieldName) fieldTicks

getReportExtension :: ReportType -> String
getReportExtension rtype =
    case rtype of
        TextReport -> ".txt"
        GraphicalChart -> ".svg"

prepareOutputFile :: FilePath -> ReportType -> FilePath -> Estimator -> String -> IO FilePath
prepareOutputFile dir rtype file est field = do
    let estStr = case est of
            Mean -> "mean"
            Median -> "median"
            Regression -> "coeff"
    let path = dir </> (file ++ "-" ++ estStr ++ "-" ++ field
                             ++ getReportExtension rtype)
    return path

prepareToReport :: FilePath -> Config -> IO (CSV, [String])
prepareToReport inputFile Config{..} = do
    case outputDir of
        Nothing -> return ()
        Just dir -> createDirectoryIfMissing True dir
    -- We assume the dataset is not big and therefore take liberties to process
    -- in a non-streaming fashion.
    csvData <- parseCSVFromFile inputFile
    case csvData of
        Left e -> error $ show e
        Right csvlines -> do
            when (null csvlines) $ error $ "The input file ["
                ++ show inputFile ++ "] is empty"
            let allFields = head csvlines
                fields = selectFields allFields
            filterSanity "selectFields" allFields fields
            let filt x = notElem (map toLower x) ["name", "iters"]
            return (csvlines, filter filt fields)

-- Keep only those benchmarks that belong to the group.
filterGroupBenchmarks :: [GroupMatrix] -> IO [GroupMatrix]
filterGroupBenchmarks matrices = return $ map filterMatrix matrices
    where
    filterMatrix matrix =
        -- XXX make sure there are no duplicates
        let m = groupMatrix matrix
            vals = map (\(new,old) ->
                (new, fromMaybe (error "bug") $ lookup old (rowValues m)))
                (groupBenches matrix)
        in matrix {groupMatrix = m {rowValues = vals}}

_filterCommonSubsets :: [BenchmarkIterMatrix] -> [BenchmarkIterMatrix]
_filterCommonSubsets matrices =
    let commonPreds =
            let initPreds = matrixPreds $ head matrices
            in foldl' intersectPreds initPreds (tail matrices)
    in map (isectCommonPreds commonPreds) matrices

    where

    pcols = iterPredColNames $ head matrices

    cmpPred name v1 v2 =
            case map toLower name of
                "iters" -> v1 == v2
                "nivcsw" -> v1 == v2
                _ -> v1 == v2

    isectBench (name1, preds1) (name2, preds2) =
        let isect row1 row2 = all id $ zipWith3 cmpPred pcols row1 row2
        in assert (name1 == name2) $ (name1, intersectBy isect preds1 preds2)

    matrixPreds = map (second (map fst)) . iterRowValues

    intersectPreds preds matrix = zipWith isectBench preds (matrixPreds matrix)

    isectRows (name1, preds1) (name2, xs) =
        let isect row1 = find (\(x,_) -> all id
                                $ zipWith3 cmpPred pcols row1 x) xs
        in assert (name1 == name2) $ (name1, mapMaybe isect preds1)

    isectCommonPreds preds matrix@BenchmarkIterMatrix{..} =
        matrix
            { iterRowValues = zipWith isectRows preds iterRowValues
            }

-- when comparing make sure all groups have same benchmarks and sort the other
-- ones based on the first column so that they are all in the same order.
selectCommon :: [GroupMatrix] -> IO [GroupMatrix]
selectCommon matrices =
    let commonBenches =
            let initBenches = map fst $ groupBenches $ head matrices
            in foldl' intersectBenches initBenches (tail matrices)
    in mapM (isectCommonBenches commonBenches) matrices

    where

    intersectBenches benches matrix =
        intersect benches (map fst $ groupBenches matrix)

    isectCommonBenches benches matrix@GroupMatrix{..} = do
        let absent = map fst groupBenches \\ benches
            msg =
                "Removing benchmarks " ++ show absent
                ++ " from column [" ++ groupName
                ++ "] run id [" ++ show groupIndex ++ "]"
            lookupBench x = lookup x groupBenches
            findBench x = (x, fromMaybe undefined (lookupBench x))
            newBenches = map findBench benches

        unless (null absent) $ putStrLn msg
        return matrix { groupBenches = newBenches }

prepareGroupMatrices :: Config -> CSV -> [String] -> IO (Int, [GroupMatrix])
prepareGroupMatrices cfg@Config{..} csvlines fields = do
    let (hdr, runs) =
              sanityCheckCSV csvlines
            & splitRuns
            & ensureIterField
    xs <- sequence $ map (readIterations hdr) runs
            & map (filterFields fields)
            -- & _filterCommonSubsets
            -- & map filterSamples
            & map foldBenchmark

    zip [0..] xs
        & map (splitGroup classifyBenchmark)
        & concat
        & sortGroups cfg
        >>= selectCommon
        >>= filterGroupBenchmarks
        >>= return . (length runs,)

-- XXX display GHC version as well
-- XXX display the OS/arch
-- XXX display compiler/RTS options as well e.g. -threaded and -N
-- This data should be in the measurement data

data ReportColumn = ReportColumn
    { colName   :: String
    , colUnit   :: RelativeUnit
    , colValues :: [Double]
    } deriving Show

-- XXX put reportAnalyzed in reportColumns
data RawReport = RawReport
    { reportOutputFile :: Maybe FilePath
    , reportIdentifier :: String
    , reportRowIds     :: [String]
    , reportColumns    :: [ReportColumn]
    , reportAnalyzed   :: [[AnalyzedField]]
    , reportEstimators :: Maybe [[Estimator]]
    } deriving Show

getFieldMin :: Config -> Double -> String -> Double
getFieldMin cfg minval fieldName =
    case getFieldRange fieldName cfg of
        Nothing -> minval
        Just (minr, _) -> minr

scaleAnalyzedField :: RelativeUnit -> AnalyzedField -> AnalyzedField
scaleAnalyzedField (RelativeUnit _ mult) AnalyzedField{..} =
    AnalyzedField
    { analyzedMean = analyzedMean / mult
    , analyzedStdDev = analyzedStdDev / mult

    , analyzedMedian = analyzedMedian / mult
    , analyzedOutliers = analyzedOutliers
    , analyzedOutlierVar = analyzedOutlierVar
    , analyzedKDE = analyzedKDE
    , analyzedRegCoeff = case
        analyzedRegCoeff of
            Nothing -> Nothing
            Just Estimate{..} ->
                let ConfInt{..} = estError
                in Just $ Estimate
                    { estPoint = estPoint / mult
                    , estError = ConfInt
                        { confIntLDX = confIntLDX / mult
                        , confIntUDX = confIntUDX / mult
                        , confIntCL = confIntCL
                        }
                    }
    , analyzedRegRSq = analyzedRegRSq
    }

prepareGroupsReport :: Config
                    -> GroupStyle
                    -> Maybe FilePath
                    -> ReportType
                    -> Int
                    -> String
                    -> [GroupMatrix]
                    -> RawReport
prepareGroupsReport cfg@Config{..} style outfile rtype runs field matrices =
    -- XXX Determine the unit based the whole range of values across all columns
    let sortValues :: [String] -> [(String, a)] -> [a]
        sortValues bmarks vals =
            map (\name -> fromMaybe (error "bug") (lookup name vals)) bmarks

        unsortedCols = map (extractColumn field) matrices

        (estimators, transformedCols) =
            cmpTransformColumns rtype style estimator diffStrategy unsortedCols
        benchmarks = selectBenchmarksByField cfg matrices transformedCols
        sortedCols = map (sortValues benchmarks) transformedCols
        origSortedCols = map (sortValues benchmarks) unsortedCols

        mkColUnits :: [RelativeUnit]
        mkColUnits =
            let cols =
                    if style == Diff || style == PercentDiff
                    -- if we consider diff values as well here then the
                    -- units will change to potentially very small.
                    then [head sortedCols]
                    else sortedCols
                minVal = getFieldMin cfg (minimum $ concat cols) field
            in case (rtype, style) of
                -- In case of percentDiff in TextReport we use absolute
                -- values in the baseline column, so the unit is different.
                (TextReport, PercentDiff) ->
                    let unit = fieldUnits field minVal Absolute
                        punit = fieldUnits field 1 style -- % unit
                    in unit : replicate (length matrices - 1) punit
                (GraphicalChart, PercentDiff) | length matrices == 1 ->
                    [fieldUnits field minVal Absolute]
                _ -> let unit = fieldUnits field minVal style
                     in replicate (length matrices) unit

        mkColValues :: [[Double]]
        mkColValues =
            let applyUnit col (RelativeUnit _ multiplier) =
                    map (/multiplier) col
            in zipWith applyUnit sortedCols mkColUnits

        mkColNames :: [String]
        mkColNames =
                let withSuffix x =
                        groupName x ++
                            if runs > 1
                            then "(" ++ show (groupIndex x) ++ ")"
                            else ""
                    applyUnit name (RelativeUnit label _) =
                        name ++ inParens label
                in zipWith applyUnit (map withSuffix matrices) mkColUnits

        columns = getZipList $ ReportColumn
                    <$> ZipList mkColNames
                    <*> ZipList mkColUnits
                    <*> ZipList mkColValues

    in RawReport
            { reportOutputFile = outfile
            , reportIdentifier = field
            , reportRowIds     = benchmarks
            , reportColumns    = transformColumnNames style columns
            , reportAnalyzed   = zipWith (\x y -> map (scaleAnalyzedField x) y)
                                         mkColUnits origSortedCols
            , reportEstimators = estimators
            }

showStatusMessage :: Show a => Config -> String -> Maybe a -> IO ()
showStatusMessage cfg field outfile =
    let atitle = makeTitle field (diffString (presentation cfg)
                                 (diffStrategy cfg)) cfg
    in case outfile of
        Just path ->
            putStrLn $ "Creating chart "
                ++ "[" ++ atitle ++ "]"
                ++ " at "
                ++ show path
        Nothing -> return ()

reportComparingGroups
    :: GroupStyle
    -> FilePath
    -> Maybe FilePath
    -> ReportType
    -> Int
    -> Config
    -> (RawReport -> Config -> IO ())
    -> [GroupMatrix]
    -> String
    -> IO ()
reportComparingGroups style dir outputFile rtype runs cfg@Config{..} mkReport matrices field = do
    outfile <- case outputFile of
        Just file -> fmap Just $ prepareOutputFile dir rtype file
                                        estimator field
        Nothing -> return Nothing

    let rawReport = prepareGroupsReport cfg style outfile rtype runs field matrices
    showStatusMessage cfg field outfile
    mkReport rawReport cfg

-- Prepare report for a given group, the report would consist of multiple
-- field columns.
prepareFieldsReport :: Config
                 -> Maybe FilePath
                 -> GroupMatrix
                 -> RawReport
prepareFieldsReport cfg@Config{..} outfile group =
    let mkColNames :: [String]
        mkColNames = colNames $ groupMatrix group

        benchmarks = selectBenchmarksByGroup cfg group

        getBenchValues name =
              fromMaybe (error "bug") $
                lookup name (rowValues $ groupMatrix group)

        sortedCols = transpose $ map getBenchValues benchmarks
        minColValues = map (minimum . map (getAnalyzedValue estimator))
                           sortedCols

        mkColUnits :: [RelativeUnit]
        mkColUnits = map (\(x, v) -> getUnitByFieldName x (getFieldMin cfg v x))
                         (zip mkColNames minColValues)

        mkColValues :: [[Double]]
        mkColValues =
            let scaleCol (RelativeUnit _ multiplier) = map (/ multiplier)
            in  zipWith scaleCol mkColUnits
                    (map (map (getAnalyzedValue estimator)) sortedCols)

        addUnitLabel name (RelativeUnit label _) =
            if label /= []
            then name ++ inParens label
            else name
        withUnits xs = zipWith addUnitLabel xs mkColUnits

        columns = getZipList $ ReportColumn
                <$> ZipList (withUnits mkColNames)
                <*> ZipList mkColUnits
                <*> ZipList mkColValues

    in RawReport
            { reportOutputFile = outfile
            , reportIdentifier = groupName group
            , reportRowIds     = benchmarks
            , reportColumns    = columns
            , reportAnalyzed   = sortedCols
            , reportEstimators = Nothing
            }

reportPerGroup
    :: FilePath
    -> Maybe FilePath
    -> ReportType
    -> Config
    -> (RawReport -> Config -> IO ())
    -> GroupMatrix
    -> IO ()
reportPerGroup dir outputFile rtype cfg@Config{..} mkReport group = do
    outfile <- case outputFile of
        Just file -> fmap Just $ prepareOutputFile dir rtype file
                                        estimator (groupName group)
        Nothing -> return Nothing

    let rawReport = prepareFieldsReport cfg outfile group
    showStatusMessage cfg (groupName group) outfile
    mkReport rawReport cfg

-------------------------------------------------------------------------------
-- Utility functions
-------------------------------------------------------------------------------

showDiffStrategy :: DiffStrategy -> String
showDiffStrategy s =
    case s of
        SingleEstimator -> ""
        MinEstimator -> "using min estimator"

diffString :: Presentation -> DiffStrategy -> Maybe String
diffString style s =
    case style of
        Groups Diff        -> Just $ "Diff " ++ showDiffStrategy s
        Groups PercentDiff -> Just $ "Diff " ++ showDiffStrategy s
        _ -> Nothing

inParens :: String -> String
inParens str = "(" ++ str ++ ")"

showEstimator :: Estimator -> String
showEstimator est =
    case est of
        Mean       -> "Mean"
        Median     -> "Median"
        Regression -> "Regression Coeff."

makeTitle :: String -> Maybe String -> Config -> String
makeTitle field diff Config{..} =
       fromMaybe "" title
    ++ inParens field
    ++ inParens (showEstimator estimator)
    ++ case diff of
            Nothing -> ""
            Just str -> inParens str