-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Low-level run time measurement.
--   
--   A set of tools to accurately measure time performance of Haskell
--   programs. perf aims to be lightweight by having minimal dependencies
--   on standard libraries. See the Perf module for an example and full API
--   documentation.
@package perf
@version 0.6.0


-- | <a>tick</a> uses the <a>rdtsc</a> instruction to measure the time
--   performance of a computation.
--   
--   The measurement unit - a <a>Cycle</a> - is one oscillation of the chip
--   crystal as measured by the <a>rdtsc</a> instruction which inspects the
--   TSC register.
--   
--   For reference, a computer with a frequency of 2 GHz means that one
--   cycle is equivalent to 0.5 nanoseconds.
module Perf.Cycle

-- | an unwrapped Word64
type Cycle = Word64

-- | tick_ measures the number of cycles it takes to read the TSC register
--   twice: the difference is then how long it took to read the clock the
--   second time.
--   
--   Below are indicative measurements using tick_:
--   
--   <pre>
--   &gt;&gt;&gt; onetick &lt;- tick_
--   
--   &gt;&gt;&gt; ticks' &lt;- replicateM 10 tick_
--   
--   &gt;&gt;&gt; manyticks &lt;- replicateM 1000000 tick_
--   
--   &gt;&gt;&gt; let average = L.fold ((/) &lt;$&gt; L.sum &lt;*&gt; L.genericLength)
--   
--   &gt;&gt;&gt; let avticks = average (fromIntegral &lt;$&gt; manyticks)
--   </pre>
--   
--   <pre>
--   one tick_: 78 cycles
--   next 10: [20,18,20,20,20,20,18,16,20,20]
--   average over 1m: 20.08 cycles
--   99.999% perc: 7,986
--   99.9% perc: 50.97
--   99th perc:  24.99
--   40th perc:  18.37
--   [min, 10th, 20th, .. 90th, max]:
--   12.00 16.60 17.39 17.88 18.37 18.86 19.46 20.11 20.75 23.04 5.447e5
--   </pre>
--   
--   The distribution of tick_ measurements is highly skewed, with the
--   maximum (about 545k cycles in the sample above) being of the order of
--   a GC. The important point on the distribution is around the 30th to
--   50th percentile, where you get a clean measure, usually free of GC
--   activity and cache misses.
tick_ :: IO Cycle

-- | Warm up the register, to avoid a high first measurement. Without a
--   warmup, one or more larger values can occur at the start of a
--   measurement spree, and often are in the zone of an L2 miss.
--   
--   <pre>
--   &gt;&gt;&gt; t &lt;- tick_ -- first measure can be very high
--   
--   &gt;&gt;&gt; _ &lt;- warmup 100
--   
--   &gt;&gt;&gt; t &lt;- tick_ -- should be around 20 (3k for ghci)
--   </pre>
warmup :: Int -> IO Double

-- | `tick f a` strictly evaluates f and a, then deeply evaluates f a,
--   returning a (Cycle, f a)
--   
--   <pre>
--   &gt;&gt;&gt; _ &lt;- warmup 100
--   
--   &gt;&gt;&gt; (cs, _) &lt;- tick f a
--   </pre>
--   
--   Note that feeding the same computation through tick twice may kick off
--   sharing (aka memoization aka let floating). Given the importance of
--   sharing to GHC optimisations this is the intended behaviour. If you
--   want to turn this off then see -fno-full-laziness (and maybe
--   -fno-cse).
tick :: NFData b => (a -> b) -> a -> IO (Cycle, b)

-- | tick where the arguments are lazy, so measurement may include
--   evaluation of thunks that may constitute f and/or a
tick' :: NFData b => (a -> b) -> a -> IO (Cycle, b)

-- | measures and deeply evaluates an `IO a`
--   
--   <pre>
--   &gt;&gt;&gt; (cs, _) &lt;- tickIO (pure (f a))
--   </pre>
tickIO :: NFData a => IO a -> IO (Cycle, a)
tickNoinline :: NFData b => (a -> b) -> a -> IO (Cycle, b)

-- | n measurements of a tick
--   
--   returns a list of Cycles and the last evaluated f a
--   
--   GHC is very good at finding ways to share computation, and anything
--   measuring a computation multiple times is a prime candidate for
--   aggressive GHC treatment. Internally, ticks uses a noinline pragma and
--   a noinline version of <a>tick</a> to help reduce the chances of
--   memoization, but this is an inexact science in the hands of the
--   author, at least, so interpret with caution. The use of noinline
--   interposes an extra function call, which can highly skew very fast
--   computations.
--   
--   <pre>
--   &gt;&gt;&gt; let n = 1000
--   
--   &gt;&gt;&gt; (cs, fa) &lt;- ticks n f a
--   </pre>
--   
--   Baseline speed can be highly sensitive to the nature of the function
--   trimmings. Polymorphic functions can tend to be slightly slower, and
--   functions with lambda expressions can experience dramatic slowdowns.
--   
--   <pre>
--   fMono :: Int -&gt; Int
--   fMono x = foldl' (+) 0 [1 .. x]
--   fPoly :: (Enum b, Num b, Additive b) =&gt; b -&gt; b
--   fPoly x = foldl' (+) 0 [1 .. x]
--   fLambda :: Int -&gt; Int
--   fLambda = \x -&gt; foldl' (+) 0 [1 .. x]
--   </pre>
ticks :: NFData b => Int -> (a -> b) -> a -> IO ([Cycle], b)

-- | n measurements of a tickIO
--   
--   returns an IO tuple; list of Cycles and the last evaluated f a
--   
--   <pre>
--   &gt;&gt;&gt; (cs, fa) &lt;- ticksIO n (pure $ f a)
--   </pre>
ticksIO :: NFData a => Int -> IO a -> IO ([Cycle], a)

-- | make a series of measurements on a list of a's to be applied to f, for
--   a tick function.
--   
--   Tends to be fragile to sharing issues, but very useful to determine
--   computation Order
--   
--   <pre>
--   ns (ticks n f) [1,10,100,1000]
--   </pre>
ns :: (a -> IO ([Cycle], b)) -> [a] -> IO ([[Cycle]], [b])

-- | WHNF versions
tickWHNF :: (a -> b) -> a -> IO (Cycle, b)

-- | WHNF version
tickWHNF' :: (a -> b) -> a -> IO (Cycle, b)

-- | WHNF version
tickWHNFIO :: IO a -> IO (Cycle, a)

-- | WHNF version
ticksWHNF :: Int -> (a -> b) -> a -> IO ([Cycle], b)

-- | WHNF version
ticksWHNFIO :: Int -> IO a -> IO ([Cycle], a)

-- | average of an Integral foldable
--   
--   <pre>
--   cAv &lt;- average . fst &lt;$&gt; ticks n f a
--   </pre>
average :: (Integral a, Foldable f) => f a -> Double


-- | Specification of a performance measurement type suitable for the
--   <tt>PerfT</tt> monad transformer.
module Perf.Measure

-- | A Measure consists of a monadic effect prior to measuring, a monadic
--   effect to finalise the measurement, and the value measured
--   
--   For example, the measure specified below will return 1 every time
--   measurement is requested, thus forming the base of a simple counter
--   for loopy code.
--   
--   <pre>
--   &gt;&gt;&gt; let count = Measure 0 (pure ()) (const (pure 1))
--   </pre>
data Measure m b
Measure :: b -> m a -> (a -> m b) -> Measure m b
[measure] :: Measure m b -> b
[prestep] :: Measure m b -> m a
[poststep] :: Measure m b -> a -> m b

-- | Measure a single effect.
--   
--   <pre>
--   &gt;&gt;&gt; r &lt;- runMeasure count (pure "joy")
--   
--   &gt;&gt;&gt; r
--   (1,"joy")
--   </pre>
runMeasure :: Monad m => Measure m b -> m a -> m (b, a)

-- | Measure once, but run an effect multiple times.
--   
--   <pre>
--   &gt;&gt;&gt; r &lt;- runMeasureN 1000 count (pure "joys")
--   
--   &gt;&gt;&gt; r
--   (1,"joys")
--   </pre>
runMeasureN :: Monad m => Int -> Measure m b -> m a -> m (b, a)

-- | cost of a measurement in terms of the Measure's own units
--   
--   <pre>
--   &gt;&gt;&gt; r &lt;- cost count
--   
--   &gt;&gt;&gt; r
--   1
--   </pre>
cost :: Monad m => Measure m b -> m b

-- | a measure using <a>getCPUTime</a> from System.CPUTime (unit is
--   picoseconds)
--   
--   <pre>
--   &gt;&gt;&gt; r &lt;- runMeasure cputime (pure $ foldl' (+) 0 [0..1000])
--   </pre>
--   
--   <pre>
--   (34000000,500500)
--   </pre>
cputime :: Measure IO Integer

-- | a measure using <a>getCurrentTime</a> (unit is <a>NominalDiffTime</a>
--   which prints as seconds)
--   
--   <pre>
--   &gt;&gt;&gt; r &lt;- runMeasure realtime (pure $ foldl' (+) 0 [0..1000])
--   </pre>
--   
--   <pre>
--   (0.000046s,500500)
--   </pre>
realtime :: Measure IO NominalDiffTime

-- | a <a>Measure</a> used to count iterations
--   
--   <pre>
--   &gt;&gt;&gt; r &lt;- runMeasure count (pure ())
--   
--   &gt;&gt;&gt; r
--   (1,())
--   </pre>
count :: Measure IO Int

-- | a <a>Measure</a> using the <a>rdtsc</a> CPU register (units are in
--   cycles)
--   
--   <pre>
--   &gt;&gt;&gt; r &lt;- runMeasureN 1000 cycles (pure ())
--   </pre>
--   
--   <pre>
--   (120540,()) -- ghci-level
--   (18673,())  -- compiled with -O2
--   </pre>
cycles :: Measure IO Cycle

-- | Lightweight <a>Additive</a> class.
class Num a => Additive a
add :: Additive a => a -> a -> a
zero :: Additive a => a
instance Perf.Measure.Additive GHC.Types.Int
instance Perf.Measure.Additive GHC.Integer.Type.Integer
instance Perf.Measure.Additive GHC.Word.Word64
instance Perf.Measure.Additive Data.Time.Clock.Internal.NominalDiffTime.NominalDiffTime


-- | <h2>Introduction</h2>
--   
--   <a>perf</a> provides high-resolution measurements of the runtime of
--   Haskell functions. It does so by reading the TSC register with the
--   RDTSC instruction (TSC stands for "time stamp counter"), which is
--   present on all x86 CPUs since the Pentium architecture.
--   
--   With <a>perf</a> the user may measure both pure and effectful
--   functions, as shown in the Example below. Every piece of code the user
--   may want to profile is passed as an argument to the <a>perf</a>
--   function, along with a text label (that will be displayed in the final
--   summary) and the measurement function (e.g. <a>cycles</a>,
--   <a>cputime</a> or <a>realtime</a>).
--   
--   <a>PerfT</a> is a monad transformer designed to collect performance
--   information. The transformer can be used to add performance measurement
--   to existing code using <a>Measure</a>s.
--   
--   <h2>Example :</h2>
--   
--   Code block to be profiled :
--   
--   <pre>
--   result &lt;- do
--       txt &lt;- readFile "examples/examples.hs"
--       let n = Text.length txt
--       let x = foldl' (+) 0 [1..n]
--       putStrLn $ "sum of one to number of characters is: " &lt;&gt;
--           (show x :: Text)
--       pure (n, x)
--   </pre>
--   
--   The same code, instrumented with <a>perf</a> :
--   
--   <pre>
--   (result', ms) &lt;- runPerfT $ do
--           txt &lt;- perf "file read" cycles $ readFile "examples/examples.hs"
--           n &lt;- perf "length" cycles $ pure (Text.length txt)
--           x &lt;- perf "sum" cycles $ pure (foldl' (+) 0 [1..n])
--           perf "print to screen" cycles $
--               putStrLn $ "sum of one to number of characters is: " &lt;&gt;
--               (show x :: Text)
--           pure (n, x)
--   </pre>
--   
--   Running the code produces a tuple of the original computation results,
--   and a Map of performance measurements that were specified. Indicative
--   results:
--   
--   <pre>
--   file read                               4.92e5 cycles
--   length                                  1.60e6 cycles
--   print to screen                         1.06e5 cycles
--   sum                                     8.12e3 cycles
--   </pre>
--   
--   <h2>Note on RDTSC</h2>
--   
--   Measuring program runtime with RDTSC comes with a set of caveats, such
--   as portability issues, internal timer consistency in the case of
--   multiprocessor architectures, and fluctuations due to power
--   throttling. For more details, see :
--   <a>https://en.wikipedia.org/wiki/Time_Stamp_Counter</a>
module Perf

-- | PerfT is polymorphic in the type of measurement being performed. The
--   monad stores and produces a Map of labelled measurement values
data PerfT m b a

-- | The obligatory transformer over Identity
type Perf b a = PerfT Identity b a

-- | Lift a monadic computation to a PerfT m, providing a label and a
--   <a>Measure</a>.
perf :: (MonadIO m, Additive b) => Text -> Measure m b -> m a -> PerfT m b a

-- | Lift a monadic computation to a PerfT m, and carry out the computation
--   multiple times.
perfN :: (MonadIO m, Monoid b) => Int -> Text -> Measure m b -> m a -> PerfT m b a

-- | Consume the PerfT layer and return a (result, measurement).
--   
--   <pre>
--   &gt;&gt;&gt; :set -XOverloadedStrings
--   
--   &gt;&gt;&gt; (result, cs) &lt;- runPerfT $ perf "sum" cycles (pure $ foldl' (+) 0 [0..10000])
--   </pre>
--   
--   <pre>
--   (50005000,fromList [("sum",562028)])
--   </pre>
runPerfT :: PerfT m b a -> m (a, Map Text b)

-- | Consume the PerfT layer and return the original monadic result.
--   Fingers crossed, PerfT structure should be completely compiled away.
--   
--   <pre>
--   &gt;&gt;&gt; result &lt;- evalPerfT $ perf "sum" cycles (pure $ foldl' (+) 0 [0..10000])
--   </pre>
--   
--   <pre>
--   50005000
--   </pre>
evalPerfT :: Monad m => PerfT m b a -> m a

-- | Consume a PerfT layer and return the measurement.
--   
--   <pre>
--   &gt;&gt;&gt; cs &lt;- execPerfT $ perf "sum" cycles (pure $ foldl' (+) 0 [0..10000])
--   </pre>
--   
--   <pre>
--   fromList [("sum",562028)]
--   </pre>
execPerfT :: Monad m => PerfT m b a -> m (Map Text b)
instance GHC.Base.Monad m => GHC.Base.Monad (Perf.PerfT m b)
instance GHC.Base.Monad m => GHC.Base.Applicative (Perf.PerfT m b)
instance GHC.Base.Functor m => GHC.Base.Functor (Perf.PerfT m b)
instance Control.Monad.IO.Class.MonadIO m => Control.Monad.IO.Class.MonadIO (Perf.PerfT m b)