-- |
-- Module      : Data.Array.Accelerate.LLVM.PTX.Debug
-- Copyright   : [2014..2017] Trevor L. McDonell
--               [2014..2014] Vinod Grover (NVIDIA Corporation)
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.PTX.Debug (

  module Data.Array.Accelerate.Debug,
  module Data.Array.Accelerate.LLVM.PTX.Debug,

) where

import Data.Array.Accelerate.Debug                      hiding ( timed, elapsed )

import Foreign.CUDA.Driver.Stream                       ( Stream )
import qualified Foreign.CUDA.Driver.Event              as Event

import Control.Monad.Trans
import Control.Concurrent
import Data.Time.Clock
import System.CPUTime
import Text.Printf

import GHC.Float


-- | Execute an action and time the results. The second argument specifies how
-- to format the output string given elapsed GPU and CPU time respectively
--
timed
    :: Flag
    -> (Double -> Double -> Double -> String)
    -> Maybe Stream
    -> IO ()
    -> IO ()
{-# INLINE timed #-}
timed f msg =
  monitorProcTime (getFlag f) (\t1 t2 t3 -> traceIO f (msg t1 t2 t3))

monitorProcTime
    :: MonadIO m
    => IO Bool
    -> (Double -> Double -> Double -> IO ())
    -> Maybe Stream
    -> m a
    -> m a
{-# INLINE monitorProcTime #-}
monitorProcTime enabled display stream action = do
  yes <- if debuggingIsEnabled then liftIO enabled else return False
  if yes
    then do
      gpuBegin  <- liftIO $ Event.create []
      gpuEnd    <- liftIO $ Event.create []
      wallBegin <- liftIO $ getCurrentTime
      cpuBegin  <- liftIO $ getCPUTime
      _         <- liftIO $ Event.record gpuBegin stream
      result    <- action
      _         <- liftIO $ Event.record gpuEnd stream
      cpuEnd    <- liftIO $ getCPUTime
      wallEnd   <- liftIO $ getCurrentTime

      -- Wait for the GPU to finish executing then display the timing execution
      -- message. Do this in a separate thread so that the remaining kernels can
      -- be queued asynchronously.
      --
      _         <- liftIO . forkIO $ do
        Event.block gpuEnd
        diff    <- Event.elapsedTime gpuBegin gpuEnd
        let gpuTime  = float2Double $ diff * 1E-3                   -- milliseconds
            cpuTime  = fromIntegral (cpuEnd - cpuBegin) * 1E-12     -- picoseconds
            wallTime = realToFrac (diffUTCTime wallEnd wallBegin)

        Event.destroy gpuBegin
        Event.destroy gpuEnd
        --
        display wallTime cpuTime gpuTime
      --
      return result

    else
      action


{-# INLINE elapsed #-}
elapsed :: Double -> Double -> Double -> String
elapsed wallTime cpuTime gpuTime =
  printf "%s (wall), %s (cpu), %s (gpu)"
    (showFFloatSIBase (Just 3) 1000 wallTime "s")
    (showFFloatSIBase (Just 3) 1000 cpuTime "s")
    (showFFloatSIBase (Just 3) 1000 gpuTime "s")