{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE CPP                      #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE GADTs                    #-}
{-# LANGUAGE TemplateHaskell          #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Runtime.Exec
-- Copyright : [2009..2018] Trevor L. McDonell
-- License   : BSD
--
-- Kernel execution control for C-for-CUDA runtime interface
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Runtime.Exec (

  -- * Kernel Execution
  Fun, FunAttributes(..), FunParam(..), CacheConfig(..),
  attributes, setCacheConfig, launchKernel,

) where

#include "cbits/stubs.h"
{# context lib="cudart" #}

-- Friends
import Foreign.CUDA.Runtime.Stream                      ( Stream(..), defaultStream )
import Foreign.CUDA.Runtime.Error
import Foreign.CUDA.Internal.C2HS

-- System
import Foreign
import Foreign.C
import Control.Monad
import Data.Maybe

#c
typedef struct cudaFuncAttributes cudaFuncAttributes;
#endc


--------------------------------------------------------------------------------
-- Data Types
--------------------------------------------------------------------------------

-- |
-- A @__global__@ device function.
--
-- Note that the use of a string naming a function was deprecated in CUDA 4.1
-- and removed in CUDA 5.0.
--
#if CUDART_VERSION >= 5000
type Fun = FunPtr ()
#else
type Fun = String
#endif


--
-- Function Attributes
--
{# pointer *cudaFuncAttributes as ^ foreign -> FunAttributes nocode #}

data FunAttributes = FunAttributes
  {
    constSizeBytes           :: !Int64,
    localSizeBytes           :: !Int64,
    sharedSizeBytes          :: !Int64,
    maxKernelThreadsPerBlock :: !Int,   -- ^ maximum block size that can be successively launched (based on register usage)
    numRegs                  :: !Int    -- ^ number of registers required for each thread
  }
  deriving (Show)

instance Storable FunAttributes where
  sizeOf _    = {# sizeof cudaFuncAttributes #}
  alignment _ = alignment (undefined :: Ptr ())

  poke _ _    = error "Can not poke Foreign.CUDA.Runtime.FunAttributes"
  peek p      = do
    cs <- cIntConv `fmap` {#get cudaFuncAttributes.constSizeBytes#} p
    ls <- cIntConv `fmap` {#get cudaFuncAttributes.localSizeBytes#} p
    ss <- cIntConv `fmap` {#get cudaFuncAttributes.sharedSizeBytes#} p
    tb <- cIntConv `fmap` {#get cudaFuncAttributes.maxThreadsPerBlock#} p
    nr <- cIntConv `fmap` {#get cudaFuncAttributes.numRegs#} p

    return FunAttributes
      {
        constSizeBytes           = cs,
        localSizeBytes           = ls,
        sharedSizeBytes          = ss,
        maxKernelThreadsPerBlock = tb,
        numRegs                  = nr
      }

#if CUDART_VERSION < 3000
data CacheConfig
#else
-- |
-- Cache configuration preference
--
{# enum cudaFuncCache as CacheConfig
    { }
    with prefix="cudaFuncCachePrefer" deriving (Eq, Show) #}
#endif

-- |
-- Kernel function parameters. Doubles will be converted to an internal float
-- representation on devices that do not support doubles natively.
--
data FunParam where
  IArg :: !Int             -> FunParam
  FArg :: !Float           -> FunParam
  DArg :: !Double          -> FunParam
  VArg :: Storable a => !a -> FunParam


--------------------------------------------------------------------------------
-- Execution Control
--------------------------------------------------------------------------------

-- |
-- Obtain the attributes of the named @__global__@ device function. This
-- itemises the requirements to successfully launch the given kernel.
--
{-# INLINEABLE attributes #-}
attributes :: Fun -> IO FunAttributes
attributes !fn = resultIfOk =<< cudaFuncGetAttributes fn

{-# INLINE cudaFuncGetAttributes #-}
{# fun unsafe cudaFuncGetAttributes
  { alloca-  `FunAttributes' peek*
  , withFun* `Fun'                 } -> `Status' cToEnum #}


-- |
-- On devices where the L1 cache and shared memory use the same hardware
-- resources, this sets the preferred cache configuration for the given device
-- function. This is only a preference; the driver is free to choose a different
-- configuration as required to execute the function.
--
-- Switching between configuration modes may insert a device-side
-- synchronisation point for streamed kernel launches
--
{-# INLINEABLE setCacheConfig #-}
setCacheConfig :: Fun -> CacheConfig -> IO ()
#if CUDART_VERSION < 3000
setCacheConfig _ _       = requireSDK 'setCacheConfig 3.0
#else
setCacheConfig !fn !pref = nothingIfOk =<< cudaFuncSetCacheConfig fn pref

{-# INLINE cudaFuncSetCacheConfig #-}
{# fun unsafe cudaFuncSetCacheConfig
  { withFun*  `Fun'
  , cFromEnum `CacheConfig' } -> `Status' cToEnum #}
#endif


-- |
-- Invoke a kernel on a @(gx * gy)@ grid of blocks, where each block contains
-- @(tx * ty * tz)@ threads and has access to a given number of bytes of shared
-- memory. The launch may also be associated with a specific 'Stream'.
--
{-# INLINEABLE launchKernel #-}
launchKernel
    :: Fun              -- ^ Device function symbol
    -> (Int,Int)        -- ^ grid dimensions
    -> (Int,Int,Int)    -- ^ thread block shape
    -> Int64            -- ^ shared memory per block (bytes)
    -> Maybe Stream     -- ^ (optional) execution stream
    -> [FunParam]
    -> IO ()
#if CUDART_VERSION >= 7000
launchKernel !fn (!gx,!gy) (!bx,!by,!bz) !sm !mst !args
  = (=<<) nothingIfOk
  $ withMany withFP args
  $ \pa -> withArray pa
  $ \pp -> cudaLaunchKernel_simple fn gx gy 1 bx by bz pp sm (fromMaybe defaultStream mst)
  where
    withFP :: FunParam -> (Ptr FunParam -> IO b) -> IO b
    withFP p f = case p of
      IArg v -> with' v (f . castPtr)
      FArg v -> with' v (f . castPtr)
      DArg v -> with' v (f . castPtr)
      VArg v -> with' v (f . castPtr)

    with' :: Storable a => a -> (Ptr a -> IO b) -> IO b
    with' !val !f =
      allocaBytes (sizeOf val) $ \ptr -> do
        poke ptr val
        f ptr

{-# INLINE cudaLaunchKernel_simple #-}
{# fun unsafe cudaLaunchKernel_simple
  { withFun*  `Fun'
  ,           `Int', `Int', `Int'
  ,           `Int', `Int', `Int'
  , castPtr   `Ptr (Ptr FunParam)'
  ,           `Int64'
  , useStream `Stream'
  }
  -> `Status' cToEnum #}

#else
launchKernel !fn (!gx,!gy) (!bx,!by,!bz) !sm !mst !args = do
  setConfig gx gy bx by bz sm (fromMaybe defaultStream mst)
  setParams args
  launch fn


-- Invoke the @__global__@ kernel function on the device. This must be preceded
-- by a call to 'setConfig' and (if appropriate) 'setParams'.
--
{-# INLINE launch #-}
{# fun unsafe cudaLaunch as launch
  { withFun* `Fun' } -> `()' checkStatus*- #}


-- Specify the grid and block dimensions for a device call. Used in conjunction
-- with 'setParams', this pushes data onto the execution stack that will be
-- popped when a function is 'launch'ed.
--
-- The FFI does not support passing deferenced structures to C functions, as
-- this is highly platform/compiler dependent. Wrap our own function stub
-- accepting plain integers.
--
{-# INLINE setConfig #-}
{# fun unsafe cudaConfigureCall_simple as setConfig
  {           `Int', `Int'
  ,           `Int', `Int', `Int'
  , cIntConv  `Int64'
  , useStream `Stream'
  }
  -> `()' checkStatus*- #}

-- |
-- Set the argument parameters that will be passed to the next kernel
-- invocation. This is used in conjunction with 'setConfig' to control kernel
-- execution.
--
{-# INLINEABLE setParams #-}
setParams :: [FunParam] -> IO ()
setParams = foldM_ k 0
  where
    k !offset !arg = do
      let s = size arg
      set arg s offset
      return (offset + s)

    size (IArg _) = sizeOf (undefined :: Int)
    size (FArg _) = sizeOf (undefined :: Float)
    size (DArg _) = sizeOf (undefined :: Double)
    size (VArg a) = sizeOf a

    set (IArg v) s o = cudaSetupArgument v s o
    set (FArg v) s o = cudaSetupArgument v s o
    set (VArg v) s o = cudaSetupArgument v s o
    set (DArg v) s o = do
      d <- cudaSetDoubleForDevice v
      cudaSetupArgument d s o

{-# INLINE cudaSetupArgument #-}
{# fun unsafe cudaSetupArgument
  `Storable a' =>
  { with'* `a'
  ,        `Int'
  ,        `Int'
  }
  -> `()' checkStatus*- #}
  where
    with' v a = with v $ \p -> a (castPtr p)

{-# INLINE cudaSetDoubleForDevice #-}
{# fun unsafe cudaSetDoubleForDevice
  { with'* `Double' peek'* } -> `()' checkStatus*- #}
  where
    with' v a = with v $ \p -> a (castPtr p)
    peek'     = peek . castPtr
#endif

--------------------------------------------------------------------------------
-- Internals
--------------------------------------------------------------------------------

-- CUDA 5.0 changed the type of a kernel function from char* to void*
--
#if CUDART_VERSION >= 5000
withFun :: Fun -> (Ptr a -> IO b) -> IO b
withFun fn action = action (castFunPtrToPtr fn)
#else
withFun :: Fun -> (Ptr CChar -> IO a) -> IO a
withFun           = withCString
#endif