-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Low-level run time measurement. -- -- A set of tools to accurately measure time performance of Haskell -- programs. perf aims to be lightweight by having minimal dependencies -- on standard libraries. See the Perf module for an example and full API -- documentation. @package perf @version 0.7.0 -- | tick uses the rdtsc chipset to measure time performance of a -- computation. -- -- The measurement unit - a Cycle - is one oscillation of the chip -- crystal as measured by the rdtsc instruction which inspects the -- TSC register. -- -- For reference, a computer with a frequency of 2 GHz means that one -- cycle is equivalent to 0.5 nanoseconds. module Perf.Cycle -- | an unwrapped Word64 type Cycle = Word64 -- | tick_ measures the number of cycles it takes to read the rdtsc chip -- twice: the difference is then how long it took to read the clock the -- second time. -- -- Below are indicative measurements using tick_: -- --
-- >>> onetick <- tick_ -- -- >>> ticks' <- replicateM 10 tick_ -- -- >>> manyticks <- replicateM 1000000 tick_ -- -- >>> let average = L.fold ((/) <$> L.sum <*> L.genericLength) -- -- >>> let avticks = average (fromIntegral <$> manyticks) ---- --
-- one tick_: 78 cycles -- next 10: [20,18,20,20,20,20,18,16,20,20] -- average over 1m: 20.08 cycles -- 99.999% perc: 7,986 -- 99.9% perc: 50.97 -- 99th perc: 24.99 -- 40th perc: 18.37 -- [min, 10th, 20th, .. 90th, max]: -- 12.00 16.60 17.39 17.88 18.37 18.86 19.46 20.11 20.75 23.04 5.447e5 ---- -- The distribution of tick_ measurements is highly skewed, with the -- maximum being around 50k cycles, which is of the order of a GC. The -- important point on the distribution is around the 30th to 50th -- percentile, where you get a clean measure, usually free of GC activity -- and cache miss-fires tick_ :: IO Cycle -- | Warm up the register, to avoid a high first measurement. Without a -- warmup, one or more larger values can occur at the start of a -- measurement spree, and often are in the zone of an L2 miss. -- --
-- >>> t <- tick_ -- first measure can be very high -- -- >>> _ <- warmup 100 -- -- >>> t <- tick_ -- should be around 20 (3k for ghci) --warmup :: Int -> IO Double -- | `tick f a` strictly evaluates f and a, then deeply evaluates f a, -- returning a (Cycle, f a) -- --
-- >>> _ <- warmup 100 -- -- >>> (cs, _) <- tick f a ---- -- Note that feeding the same computation through tick twice may kick off -- sharing (aka memoization aka let floating). Given the importance of -- sharing to GHC optimisations this is the intended behaviour. If you -- want to turn this off then see -fno-full-laziness (and maybe -- -fno-cse). tick :: NFData b => (a -> b) -> a -> IO (Cycle, b) -- | tick where the arguments are lazy, so measurement may include -- evaluation of thunks that may constitute f and/or a tick' :: NFData b => (a -> b) -> a -> IO (Cycle, b) -- | measures and deeply evaluates an `IO a` -- --
-- >>> (cs, _) <- tickIO (pure (f a)) --tickIO :: NFData a => IO a -> IO (Cycle, a) tickNoinline :: NFData b => (a -> b) -> a -> IO (Cycle, b) -- | n measurements of a tick -- -- returns a list of Cycles and the last evaluated f a -- -- GHC is very good at finding ways to share computation, and anything -- measuring a computation multiple times is a prime candidate for -- aggressive ghc treatment. Internally, ticks uses a noinline pragma and -- a noinline version of tick to help reduce the chances of memoization, but -- this is an inexact science in the hands of the author, at least, so -- interpret with caution. The use of noinline interposes an extra -- function call, which can highly skew very fast computations. -- --
-- >>> let n = 1000 -- -- >>> (cs, fa) <- ticks n f a ---- -- Baseline speed can be highly sensitive to the nature of the function -- trimmings. Polymorphic functions can tend to be slightly slower, and -- functions with lambda expressions can experience dramatic slowdowns. -- --
-- fMono :: Int -> Int -- fMono x = foldl' (+) 0 [1 .. x] -- fPoly :: (Enum b, Num b, Additive b) => b -> b -- fPoly x = foldl' (+) 0 [1 .. x] -- fLambda :: Int -> Int -- fLambda = \x -> foldl' (+) 0 [1 .. x] --ticks :: NFData b => Int -> (a -> b) -> a -> IO ([Cycle], b) -- | n measurements of a tickIO -- -- returns an IO tuple; list of Cycles and the last evaluated f a -- --
-- >>> (cs, fa) <- ticksIO n (pure $ f a) --ticksIO :: NFData a => Int -> IO a -> IO ([Cycle], a) -- | make a series of measurements on a list of a's to be applied to f, for -- a tick function. -- -- Tends to be fragile to sharing issues, but very useful to determine -- computation Order -- --
-- ns ticks n f [1,10,100,1000] --ns :: (a -> IO ([Cycle], b)) -> [a] -> IO ([[Cycle]], [b]) -- | WHNF version tickWHNF :: (a -> b) -> a -> IO (Cycle, b) -- | WHNF version tickWHNF' :: (a -> b) -> a -> IO (Cycle, b) -- | WHNF version tickWHNFIO :: IO a -> IO (Cycle, a) -- | WHNF version ticksWHNF :: Int -> (a -> b) -> a -> IO ([Cycle], b) -- | WHNF version ticksWHNFIO :: Int -> IO a -> IO ([Cycle], a) -- | Specification of a performance measurement type suitable for the -- PerfT monad transformer. module Perf.Measure -- | A Measure consists of a monadic effect prior to measuring, a monadic -- effect to finalise the measurement, and the value measured -- -- For example, the measure specified below will return 1 every time -- measurement is requested, thus forming the base of a simple counter -- for loopy code. -- --
-- >>> let count = Measure 0 (pure ()) (pure 1) --data Measure m b Measure :: b -> m a -> (a -> m b) -> Measure m b [measure] :: Measure m b -> b [prestep] :: Measure m b -> m a [poststep] :: Measure m b -> a -> m b -- | Measure a single effect. -- --
-- >>> r <- runMeasure count (pure "joy") -- -- >>> r -- (1,"joy") --runMeasure :: Monad m => Measure m b -> m a -> m (b, a) -- | Measure once, but run an effect multiple times. -- --
-- >>> r <- runMeasureN 1000 count (pure "joys") -- -- >>> r -- (1,"joys") --runMeasureN :: Monad m => Int -> Measure m b -> m a -> m (b, a) -- | cost of a measurement in terms of the Measure's own units -- --
-- >>> r <- cost count -- -- >>> r -- 1 --cost :: Monad m => Measure m b -> m b -- | a measure using getCPUTime from System.CPUTime (unit is -- picoseconds) -- --
-- >>> r <- runMeasure cputime (pure $ foldl' (+) 0 [0..1000]) ---- --
-- (34000000,500500) --cputime :: Measure IO Integer -- | a measure using getCurrentTime (unit is NominalDiffTime -- which prints as seconds) -- --
-- >>> r <- runMeasure realtime (pure $ foldl' (+) 0 [0..1000]) ---- --
-- (0.000046s,500500) --realtime :: Measure IO NominalDiffTime -- | a Measure used to count iterations -- --
-- >>> r <- runMeasure count (pure ()) -- -- >>> r -- (1,()) --count :: Measure IO Int -- | a Measure using the rdtsc CPU register (units are in -- cycles) -- --
-- >>> r <- runMeasureN 1000 cycles (pure ()) ---- --
-- (120540,()) -- ghci-level -- (18673,()) -- compiled with -O2 --cycles :: Measure IO Cycle -- | Lightweight Additive class. class Num a => Additive a add :: Additive a => a -> a -> a zero :: Additive a => a instance Perf.Measure.Additive GHC.Types.Int instance Perf.Measure.Additive GHC.Integer.Type.Integer instance Perf.Measure.Additive GHC.Word.Word64 instance Perf.Measure.Additive Data.Time.Clock.Internal.NominalDiffTime.NominalDiffTime -- |
-- result <- do -- txt <- readFile "examples/examples.hs" -- let n = Text.length txt -- let x = foldl' (+) 0 [1..n] -- putStrLn $ "sum of one to number of characters is: " <> -- (show x :: Text) -- pure (n, x) ---- -- The same code, instrumented with perf : -- --
-- (result', ms) <- runPerfT $ do -- txt <- perf "file read" cycles $ readFile "examples/examples.hs" -- n <- perf "length" cycles $ pure (Text.length txt) -- x <- perf "sum" cycles $ pure (foldl' (+) 0 [1..n]) -- perf "print to screen" cycles $ -- putStrLn $ "sum of one to number of characters is: " <> -- (show x :: Text) -- pure (n, x) ---- -- Running the code produces a tuple of the original computation results, -- and a Map of performance measurements that were specified. Indicative -- results: -- --
-- file read 4.92e5 cycles -- length 1.60e6 cycles -- print to screen 1.06e5 cycles -- sum 8.12e3 cycles ---- --
-- >>> :set -XOverloadedStrings -- -- >>> (cs, result) <- runPerfT $ perf "sum" cycles (pure $ foldl' (+) 0 [0..10000]) ---- --
-- (50005000,fromList [("sum",562028)])
--
runPerfT :: PerfT m b a -> m (a, Map Text b)
-- | Consume the PerfT layer and return the original monadic result.
-- Fingers crossed, PerfT structure should be completely compiled away.
--
-- -- >>> result <- evalPerfT $ perf "sum" cycles (pure $ foldl' (+) 0 [0..10000]) ---- --
-- 50005000 --evalPerfT :: Monad m => PerfT m b a -> m a -- | Consume a PerfT layer and return the measurement. -- --
-- >>> cs <- execPerfT $ perf "sum" cycles (pure $ foldl' (+) 0 [0..10000]) ---- --
-- fromList [("sum",562028)]
--
execPerfT :: Monad m => PerfT m b a -> m (Map Text b)
instance GHC.Base.Monad m => GHC.Base.Monad (Perf.PerfT m b)
instance GHC.Base.Monad m => GHC.Base.Applicative (Perf.PerfT m b)
instance GHC.Base.Functor m => GHC.Base.Functor (Perf.PerfT m b)
instance Control.Monad.IO.Class.MonadIO m => Control.Monad.IO.Class.MonadIO (Perf.PerfT m b)