{-# LANGUAGE CPP #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE ScopedTypeVariables #-} module TastyBench where import Prelude hiding (Int, Integer) import qualified Prelude import Control.DeepSeq (NFData, force) import Control.Exception (evaluate) import Data.Data (Typeable) import Data.Word (Word64) #if !MIN_VERSION_base(4,10,0) import Data.Int ( Int64 ) #endif #if MIN_VERSION_base(4,6,0) import GHC.Stats #endif import System.CPUTime import System.IO import System.IO.Unsafe import System.Mem import Text.Printf (printf) data Timeout = Timeout Prelude.Integer -- ^ number of microseconds (e. g., 200000) String -- ^ textual representation (e. g., @"0.2s"@) | NoTimeout deriving (Show) -- | In addition to @--stdev@ command-line option, -- one can adjust target relative standard deviation -- for individual benchmarks and groups of benchmarks -- using 'adjustOption' and 'localOption'. -- -- E. g., set target relative standard deviation to 2% as follows: -- -- > import Test.Tasty (localOption) -- > localOption (RelStDev 0.02) (bgroup [...]) -- -- If you set 'RelStDev' to infinity, -- a benchmark will be executed -- only once and its standard deviation will be recorded as zero. -- This is rather a blunt approach, but it might be a necessary evil -- for extremely long benchmarks. If you wish to run all benchmarks -- only once, use command-line option @--stdev@ @Infinity@. -- -- @since 0.2 newtype RelStDev = RelStDev Double deriving (Show, Read, Typeable) -- | Whether to measure CPU time or wall-clock time. -- Normally 'CpuTime' is a better option (and default), -- but consider switching to 'WallTime' -- to measure multithreaded algorithms or time spent in external processes. -- -- One can switch the default measurement mode globally -- using @--time-mode@ command-line option, -- but it is usually better to adjust the mode locally: -- -- > import Test.Tasty (localOption) -- > localOption WallTime (bgroup [...]) -- -- @since 0.3.2 data TimeMode = CpuTime -- ^ Measure CPU time. deriving (Typeable) -- | Something that can be benchmarked, produced by 'nf', 'whnf', 'nfIO', 'whnfIO', -- 'nfAppIO', 'whnfAppIO' below. -- -- Drop-in replacement for @Criterion.@'Criterion.Benchmarkable' and -- @Gauge.@'Gauge.Benchmarkable'. -- -- @since 0.1 newtype Benchmarkable = -- | @since 0.3 Benchmarkable { unBenchmarkable :: Word64 -> IO () -- ^ Run benchmark given number of times. } deriving (Typeable) data Measurement = Measurement { measTime :: !Word64 -- ^ time in picoseconds , measAllocs :: !Word64 -- ^ allocations in bytes , measCopied :: !Word64 -- ^ copied bytes , measMaxMem :: !Word64 -- ^ max memory in use } deriving (Show, Read) data Estimate = Estimate { estMean :: !Measurement , estStdev :: !Word64 -- ^ stdev in picoseconds } deriving (Show, Read) predict :: Measurement -- ^ time for one run -> Measurement -- ^ time for two runs -> Estimate predict (Measurement t1 a1 c1 m1) (Measurement t2 a2 c2 m2) = Estimate { estMean = Measurement t (fit a1 a2) (fit c1 c2) (max m1 m2) , estStdev = truncate (sqrt d :: Double) } where fit x1 x2 = x1 `quot` 5 + 2 * (x2 `quot` 5) t = fit t1 t2 sqr x = x * x d = sqr (word64ToDouble t1 - word64ToDouble t) + sqr (word64ToDouble t2 - 2 * word64ToDouble t) predictPerturbed :: Measurement -> Measurement -> Estimate predictPerturbed t1 t2 = Estimate { estMean = estMean (predict t1 t2) , estStdev = max (estStdev (predict (lo t1) (hi t2))) (estStdev (predict (hi t1) (lo t2))) } where prec = max (fromInteger cpuTimePrecision) 1000000000 -- 1 ms hi meas = meas { measTime = measTime meas + prec } lo meas = meas { measTime = measTime meas - prec } hasGCStats :: Bool #if MIN_VERSION_base(4,10,0) hasGCStats = unsafePerformIO getRTSStatsEnabled #elif MIN_VERSION_base(4,6,0) hasGCStats = unsafePerformIO getGCStatsEnabled #else hasGCStats = False #endif getAllocsAndCopied :: IO (Word64, Word64, Word64) getAllocsAndCopied = do if not hasGCStats then pure (0, 0, 0) else #if MIN_VERSION_base(4,10,0) (\s -> (allocated_bytes s, copied_bytes s, max_mem_in_use_bytes s)) <$> getRTSStats #elif MIN_VERSION_base(4,6,0) (\s -> (int64ToWord64 $ bytesAllocated s, int64ToWord64 $ bytesCopied s, int64ToWord64 $ peakMegabytesAllocated s * 1024 * 1024)) <$> getGCStats #else pure (0, 0, 0) #endif getTimePicoSecs :: TimeMode -> IO Word64 getTimePicoSecs timeMode = case timeMode of CpuTime -> fromInteger <$> getCPUTime measure :: TimeMode -> Word64 -> Benchmarkable -> IO Measurement measure timeMode n (Benchmarkable act) = do let getTimePicoSecs' = getTimePicoSecs timeMode performGC startTime <- getTimePicoSecs' (startAllocs, startCopied, startMaxMemInUse) <- getAllocsAndCopied act n endTime <- getTimePicoSecs' (endAllocs, endCopied, endMaxMemInUse) <- getAllocsAndCopied let meas = Measurement { measTime = endTime - startTime , measAllocs = endAllocs - startAllocs , measCopied = endCopied - startCopied , measMaxMem = max endMaxMemInUse startMaxMemInUse } pure meas measureUntil :: TimeMode -> Bool -> Timeout -> RelStDev -> Benchmarkable -> IO Estimate measureUntil timeMode _ _ (RelStDev targetRelStDev) b | isInfinite targetRelStDev, targetRelStDev > 0 = do t1 <- measure timeMode 1 b pure $ Estimate { estMean = t1, estStdev = 0 } measureUntil timeMode warnIfNoTimeout timeout (RelStDev targetRelStDev) b = do t1 <- measure' 1 b go 1 t1 0 where measure' = measure timeMode go :: Word64 -> Measurement -> Word64 -> IO Estimate go n t1 sumOfTs = do t2 <- measure' (2 * n) b let Estimate (Measurement meanN allocN copiedN maxMemN) stdevN = predictPerturbed t1 t2 isTimeoutSoon = case timeout of NoTimeout -> False -- multiplying by 12/10 helps to avoid accidental timeouts Timeout micros _ -> (sumOfTs' + 3 * measTime t2) `quot` (1000000 * 10 `quot` 12) >= fromInteger micros isStDevInTargetRange = stdevN < truncate (max 0 targetRelStDev * word64ToDouble meanN) scale = (`quot` n) sumOfTs' = sumOfTs + measTime t1 case timeout of NoTimeout | warnIfNoTimeout, sumOfTs' + measTime t2 > 100 * 1000000000000 -> hPutStrLn stderr "This benchmark takes more than 100 seconds. Consider setting --timeout, if this is unexpected (or to silence this warning)." _ -> pure () if isStDevInTargetRange || isTimeoutSoon then pure $ Estimate { estMean = Measurement (scale meanN) (scale allocN) (scale copiedN) maxMemN , estStdev = scale stdevN } else go (2 * n) t2 sumOfTs' -- | An internal routine to measure CPU execution time in seconds -- for a given timeout (put 'NoTimeout', or 'mkTimeout' 100000000 for 100 seconds) -- and a target relative standard deviation -- (put 'RelStDev' 0.05 for 5% or 'RelStDev' (1/0) to run only one iteration). -- -- 'Timeout' takes soft priority over 'RelStDev': this function prefers -- to finish in time even if at cost of precision. However, timeout is guidance -- not guarantee: 'measureCpuTime' can take longer, if there is not enough time -- to run at least thrice or an iteration takes unusually long. -- -- @since 0.3 measureCpuTime :: Timeout -> RelStDev -> Benchmarkable -> IO Double measureCpuTime = ((fmap ((/ 1e12) . word64ToDouble . measTime . estMean) .) .) . measureUntil CpuTime False funcToBench :: (b -> c) -> (a -> b) -> a -> Benchmarkable funcToBench frc = (Benchmarkable .) . benchLoop where -- Here we rely on the fact that GHC (unless spurred by -- -fstatic-argument-transformation) is not smart enough: -- it does not notice that `f` and `x` arguments are loop invariant -- and could be floated, and the whole `f x` expression shared. -- If we create a closure with `f` and `x` bound in the environment, -- then GHC is smart enough to share computation of `f x`. -- -- For perspective, gauge and criterion < 1.4 mark similar functions as INLINE, -- while criterion >= 1.4 switches to NOINLINE. -- If we mark `benchLoop` NOINLINE then benchmark results are slightly larger -- (noticeable in bench-fibo), because the loop body is slightly bigger, -- since GHC does not unbox numbers or inline `Eq @Word64` dictionary. -- -- This function is called `benchLoop` instead of, say, `go`, -- so it is easier to spot in Core dumps. benchLoop f x n | n == 0 = pure () | otherwise = do _ <- evaluate (frc (f x)) benchLoop f x (n - 1) {-# INLINE funcToBench #-} -- | 'nf' @f@ @x@ measures time to compute -- a normal form (by means of 'force') of an application of @f@ to @x@. -- This does not include time to evaluate @f@ or @x@ themselves. -- Ideally @x@ should be a primitive data type like 'Data.Int.Int'. -- -- The same thunk of @x@ is shared by multiple calls of @f@. We cannot evaluate -- @x@ beforehand: there is no 'NFData' @a@ constraint, and potentially @x@ may -- be an infinite structure. Thus @x@ will be evaluated in course of the first -- application of @f@. This noisy measurement is to be discarded soon, -- but if @x@ is not a primitive data type, consider forcing its evaluation -- separately, e. g., via 'env' or 'withResource'. -- -- Here is a textbook anti-pattern: 'nf' 'sum' @[1..1000000]@. -- Since an input list is shared by multiple invocations of 'sum', -- it will be allocated in memory in full, putting immense pressure -- on garbage collector. Also no list fusion will happen. -- A better approach is 'nf' (@\\n@ @->@ 'sum' @[1..n]@) @1000000@. -- -- If you are measuring an inlinable function, -- it is prudent to ensure that its invocation is fully saturated, -- otherwise inlining will not happen. That's why one can often -- see 'nf' (@\\n@ @->@ @f@ @n@) @x@ instead of 'nf' @f@ @x@. -- Same applies to rewrite rules. -- -- While @tasty-bench@ is capable to perform micro- and even nanobenchmarks, -- such measurements are noisy and involve an overhead. Results are more reliable -- when @f@ @x@ takes at least several milliseconds. -- -- Note that forcing a normal form requires an additional -- traverse of the structure. In certain scenarios (imagine benchmarking 'tail'), -- especially when 'NFData' instance is badly written, -- this traversal may take non-negligible time and affect results. -- -- Drop-in replacement for @Criterion.@'Criterion.nf' and -- @Gauge.@'Gauge.nf'. -- -- @since 0.1 nf :: NFData b => (a -> b) -> a -> Benchmarkable nf = funcToBench force {-# INLINE nf #-} -- | 'whnf' @f@ @x@ measures time to compute -- a weak head normal form of an application of @f@ to @x@. -- This does not include time to evaluate @f@ or @x@ themselves. -- Ideally @x@ should be a primitive data type like 'Data.Int.Int'. -- -- The same thunk of @x@ is shared by multiple calls of @f@. We cannot evaluate -- @x@ beforehand: there is no 'NFData' @a@ constraint, and potentially @x@ may -- be an infinite structure. Thus @x@ will be evaluated in course of the first -- application of @f@. This noisy measurement is to be discarded soon, -- but if @x@ is not a primitive data type, consider forcing its evaluation -- separately, e. g., via 'env' or 'withResource'. -- -- Computing only a weak head normal form is -- rarely what intuitively is meant by "evaluation". -- Beware that many educational materials contain examples with 'whnf': -- this is a wrong default. -- Unless you understand precisely, what is measured, -- it is recommended to use 'nf' instead. -- -- Here is a textbook anti-pattern: 'whnf' ('Data.List.replicate' @1000000@) @1@. -- This will succeed in a matter of nanoseconds, because weak head -- normal form forces only the first element of the list. -- -- Drop-in replacement for @Criterion.@'Criterion.whnf' and @Gauge.@'Gauge.whnf'. -- -- @since 0.1 whnf :: (a -> b) -> a -> Benchmarkable whnf = funcToBench id {-# INLINE whnf #-} ioToBench :: (b -> c) -> IO b -> Benchmarkable ioToBench frc act = Benchmarkable go where go n | n == 0 = pure () | otherwise = do val <- act _ <- evaluate (frc val) go (n - 1) {-# INLINE ioToBench #-} -- | 'nfIO' @x@ measures time to evaluate side-effects of @x@ -- and compute its normal form (by means of 'force'). -- -- Pure subexpression of an effectful computation @x@ -- may be evaluated only once and get cached. -- To avoid surprising results it is usually preferable -- to use 'nfAppIO' instead. -- -- Note that forcing a normal form requires an additional -- traverse of the structure. In certain scenarios, -- especially when 'NFData' instance is badly written, -- this traversal may take non-negligible time and affect results. -- -- A typical use case is 'nfIO' ('readFile' @"foo.txt"@). -- However, if your goal is not to benchmark I\/O per se, -- but just read input data from a file, it is cleaner to -- use 'env' or 'withResource'. -- -- Drop-in replacement for @Criterion.@'Criterion.nfIO' and @Gauge.@'Gauge.nfIO'. -- -- @since 0.1 nfIO :: NFData a => IO a -> Benchmarkable nfIO = ioToBench force {-# INLINE nfIO #-} -- | 'whnfIO' @x@ measures time to evaluate side-effects of @x@ -- and compute its weak head normal form. -- -- Pure subexpression of an effectful computation @x@ -- may be evaluated only once and get cached. -- To avoid surprising results it is usually preferable -- to use 'whnfAppIO' instead. -- -- Computing only a weak head normal form is -- rarely what intuitively is meant by "evaluation". -- Unless you understand precisely, what is measured, -- it is recommended to use 'nfIO' instead. -- -- Lazy I\/O is treacherous. -- If your goal is not to benchmark I\/O per se, -- but just read input data from a file, it is cleaner to -- use 'env' or 'withResource'. -- -- Drop-in replacement for @Criterion.@'Criterion.whnfIO' and @Gauge.@'Gauge.whnfIO'. -- -- @since 0.1 whnfIO :: IO a -> Benchmarkable whnfIO = ioToBench id {-# INLINE whnfIO #-} ioFuncToBench :: (b -> c) -> (a -> IO b) -> a -> Benchmarkable ioFuncToBench frc = (Benchmarkable .) . go where go f x n | n == 0 = pure () | otherwise = do val <- f x _ <- evaluate (frc val) go f x (n - 1) {-# INLINE ioFuncToBench #-} -- | 'nfAppIO' @f@ @x@ measures time to evaluate side-effects of -- an application of @f@ to @x@. -- and compute its normal form (by means of 'force'). -- This does not include time to evaluate @f@ or @x@ themselves. -- Ideally @x@ should be a primitive data type like 'Data.Int.Int'. -- -- The same thunk of @x@ is shared by multiple calls of @f@. We cannot evaluate -- @x@ beforehand: there is no 'NFData' @a@ constraint, and potentially @x@ may -- be an infinite structure. Thus @x@ will be evaluated in course of the first -- application of @f@. This noisy measurement is to be discarded soon, -- but if @x@ is not a primitive data type, consider forcing its evaluation -- separately, e. g., via 'env' or 'withResource'. -- -- Note that forcing a normal form requires an additional -- traverse of the structure. In certain scenarios, -- especially when 'NFData' instance is badly written, -- this traversal may take non-negligible time and affect results. -- -- A typical use case is 'nfAppIO' 'readFile' @"foo.txt"@. -- However, if your goal is not to benchmark I\/O per se, -- but just read input data from a file, it is cleaner to -- use 'env' or 'withResource'. -- -- Drop-in replacement for @Criterion.@'Criterion.nfAppIO' and @Gauge.@'Gauge.nfAppIO'. -- -- @since 0.1 nfAppIO :: NFData b => (a -> IO b) -> a -> Benchmarkable nfAppIO = ioFuncToBench force {-# INLINE nfAppIO #-} -- | 'whnfAppIO' @f@ @x@ measures time to evaluate side-effects of -- an application of @f@ to @x@. -- and compute its weak head normal form. -- This does not include time to evaluate @f@ or @x@ themselves. -- Ideally @x@ should be a primitive data type like 'Data.Int.Int'. -- -- The same thunk of @x@ is shared by multiple calls of @f@. We cannot evaluate -- @x@ beforehand: there is no 'NFData' @a@ constraint, and potentially @x@ may -- be an infinite structure. Thus @x@ will be evaluated in course of the first -- application of @f@. This noisy measurement is to be discarded soon, -- but if @x@ is not a primitive data type, consider forcing its evaluation -- separately, e. g., via 'env' or 'withResource'. -- -- Computing only a weak head normal form is -- rarely what intuitively is meant by "evaluation". -- Unless you understand precisely, what is measured, -- it is recommended to use 'nfAppIO' instead. -- -- Lazy I\/O is treacherous. -- If your goal is not to benchmark I\/O per se, -- but just read input data from a file, it is cleaner to -- use 'env' or 'withResource'. -- -- Drop-in replacement for @Criterion.@'Criterion.whnfAppIO' and @Gauge.@'Gauge.whnfAppIO'. -- -- @since 0.1 whnfAppIO :: (a -> IO b) -> a -> Benchmarkable whnfAppIO = ioFuncToBench id {-# INLINE whnfAppIO #-} word64ToDouble :: Word64 -> Double word64ToDouble = fromIntegral #if !MIN_VERSION_base(4,10,0) && MIN_VERSION_base(4,6,0) int64ToWord64 :: Int64 -> Word64 int64ToWord64 = fromIntegral #endif prettyEstimate :: Estimate -> String prettyEstimate (Estimate m stdev) = showPicos4 (measTime m) ++ (if stdev == 0 then " " else " ± " ++ showPicos3 (2 * stdev)) -- | Show picoseconds, fitting number in 4 characters. showPicos4 :: Word64 -> String showPicos4 i | t < 995 = printf "%3.0f ps" t | t < 995e1 = printf "%4.2f ns" (t / 1e3) | t < 995e2 = printf "%4.1f ns" (t / 1e3) | t < 995e3 = printf "%3.0f ns" (t / 1e3) | t < 995e4 = printf "%4.2f μs" (t / 1e6) | t < 995e5 = printf "%4.1f μs" (t / 1e6) | t < 995e6 = printf "%3.0f μs" (t / 1e6) | t < 995e7 = printf "%4.2f ms" (t / 1e9) | t < 995e8 = printf "%4.1f ms" (t / 1e9) | t < 995e9 = printf "%3.0f ms" (t / 1e9) | otherwise = printf "%4.3f s" (t / 1e12) where t = word64ToDouble i -- | Show picoseconds, fitting number in 3 characters. showPicos3 :: Word64 -> String showPicos3 i | t < 995 = printf "%3.0f ps" t | t < 995e1 = printf "%3.1f ns" (t / 1e3) | t < 995e3 = printf "%3.0f ns" (t / 1e3) | t < 995e4 = printf "%3.1f μs" (t / 1e6) | t < 995e6 = printf "%3.0f μs" (t / 1e6) | t < 995e7 = printf "%3.1f ms" (t / 1e9) | t < 995e9 = printf "%3.0f ms" (t / 1e9) | otherwise = printf "%4.2f s" (t / 1e12) where t = word64ToDouble i prettyEstimateWithGC :: Estimate -> String prettyEstimateWithGC (Estimate m stdev) = showPicos4 (measTime m) ++ (if stdev == 0 then ", " else " ± " ++ showPicos3 (2 * stdev) ++ ", ") ++ showBytes (measAllocs m) ++ " allocated, " ++ showBytes (measCopied m) ++ " copied, " ++ showBytes (measMaxMem m) ++ " peak memory" csvEstimate :: Estimate -> String csvEstimate (Estimate m stdev) = show (measTime m) ++ "," ++ show (2 * stdev) csvEstimateWithGC :: Estimate -> String csvEstimateWithGC (Estimate m stdev) = show (measTime m) ++ "," ++ show (2 * stdev) ++ "," ++ show (measAllocs m) ++ "," ++ show (measCopied m) ++ "," ++ show (measMaxMem m) showBytes :: Word64 -> String showBytes i | t < 1000 = printf "%3.0f B " t | t < 10189 = printf "%3.1f KB" (t / 1024) | t < 1023488 = printf "%3.0f KB" (t / 1024) | t < 10433332 = printf "%3.1f MB" (t / 1048576) | t < 1048051712 = printf "%3.0f MB" (t / 1048576) | t < 10683731149 = printf "%3.1f GB" (t / 1073741824) | t < 1073204953088 = printf "%3.0f GB" (t / 1073741824) | t < 10940140696372 = printf "%3.1f TB" (t / 1099511627776) | t < 1098961871962112 = printf "%3.0f TB" (t / 1099511627776) | t < 11202704073084108 = printf "%3.1f PB" (t / 1125899906842624) | t < 1125336956889202624 = printf "%3.0f PB" (t / 1125899906842624) | t < 11471568970838126592 = printf "%3.1f EB" (t / 1152921504606846976) | otherwise = printf "%3.0f EB" (t / 1152921504606846976) where t = word64ToDouble i