-- GENERATED by C->Haskell Compiler, version 0.28.6 Switcheroo, 25 November 2017 (Haskell)
-- Edit the ORIGNAL .chs file instead!


{-# LINE 1 "src/Foreign/CUDA/Runtime/Exec.chs" #-}
{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE CPP                      #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE GADTs                    #-}
{-# LANGUAGE TemplateHaskell          #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Runtime.Exec
-- Copyright : [2009..2018] Trevor L. McDonell
-- License   : BSD
--
-- Kernel execution control for C-for-CUDA runtime interface
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Runtime.Exec (

  -- * Kernel Execution
  Fun, FunAttributes(..), FunParam(..), CacheConfig(..),
  attributes, setCacheConfig, launchKernel,

) where
import qualified Foreign.C.Types as C2HSImp
import qualified Foreign.ForeignPtr as C2HSImp
import qualified Foreign.Ptr as C2HSImp
import qualified Foreign.Storable as C2HSImp





{-# LINE 25 "src/Foreign/CUDA/Runtime/Exec.chs" #-}


-- Friends
import Foreign.CUDA.Runtime.Stream                      ( Stream(..), defaultStream )
import Foreign.CUDA.Runtime.Error
import Foreign.CUDA.Internal.C2HS

-- System
import Foreign
import Foreign.C
import Control.Monad
import Data.Maybe

--------------------------------------------------------------------------------
-- Data Types
--------------------------------------------------------------------------------

-- |
-- A @__global__@ device function.
--
-- Note that the use of a string naming a function was deprecated in CUDA 4.1
-- and removed in CUDA 5.0.
--
type Fun = FunPtr ()


--
-- Function Attributes
--

{-# LINE 62 "src/Foreign/CUDA/Runtime/Exec.chs" #-}


data FunAttributes = FunAttributes
  {
    constSizeBytes           :: !Int64,
    localSizeBytes           :: !Int64,
    sharedSizeBytes          :: !Int64,
    maxKernelThreadsPerBlock :: !Int,   -- ^ maximum block size that can be successively launched (based on register usage)
    numRegs                  :: !Int    -- ^ number of registers required for each thread
  }
  deriving (Show)

instance Storable FunAttributes where
  sizeOf _    = 56
{-# LINE 75 "src/Foreign/CUDA/Runtime/Exec.chs" #-}

  alignment _ = alignment (undefined :: Ptr ())

  poke _ _    = error "Can not poke Foreign.CUDA.Runtime.FunAttributes"
  peek p      = do
    cs <- cIntConv `fmap` (\ptr -> do {C2HSImp.peekByteOff ptr 8 :: IO C2HSImp.CULong}) p
    ls <- cIntConv `fmap` (\ptr -> do {C2HSImp.peekByteOff ptr 16 :: IO C2HSImp.CULong}) p
    ss <- cIntConv `fmap` (\ptr -> do {C2HSImp.peekByteOff ptr 0 :: IO C2HSImp.CULong}) p
    tb <- cIntConv `fmap` (\ptr -> do {C2HSImp.peekByteOff ptr 24 :: IO C2HSImp.CInt}) p
    nr <- cIntConv `fmap` (\ptr -> do {C2HSImp.peekByteOff ptr 28 :: IO C2HSImp.CInt}) p

    return FunAttributes
      {
        constSizeBytes           = cs,
        localSizeBytes           = ls,
        sharedSizeBytes          = ss,
        maxKernelThreadsPerBlock = tb,
        numRegs                  = nr
      }

-- |
-- Cache configuration preference
--
data CacheConfig = None
                 | Shared
                 | L1
                 | Equal
  deriving (Eq,Show)
instance Enum CacheConfig where
  succ None = Shared
  succ Shared = L1
  succ L1 = Equal
  succ Equal = error "CacheConfig.succ: Equal has no successor"

  pred Shared = None
  pred L1 = Shared
  pred Equal = L1
  pred None = error "CacheConfig.pred: None has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from Equal

  fromEnum None = 0
  fromEnum Shared = 1
  fromEnum L1 = 2
  fromEnum Equal = 3

  toEnum 0 = None
  toEnum 1 = Shared
  toEnum 2 = L1
  toEnum 3 = Equal
  toEnum unmatched = error ("CacheConfig.toEnum: Cannot match " ++ show unmatched)

{-# LINE 104 "src/Foreign/CUDA/Runtime/Exec.chs" #-}


-- |
-- Kernel function parameters. Doubles will be converted to an internal float
-- representation on devices that do not support doubles natively.
--
data FunParam where
  IArg :: !Int             -> FunParam
  FArg :: !Float           -> FunParam
  DArg :: !Double          -> FunParam
  VArg :: Storable a => !a -> FunParam


--------------------------------------------------------------------------------
-- Execution Control
--------------------------------------------------------------------------------

-- |
-- Obtain the attributes of the named @__global__@ device function. This
-- itemises the requirements to successfully launch the given kernel.
--
{-# INLINEABLE attributes #-}
attributes :: Fun -> IO FunAttributes
attributes !fn = resultIfOk =<< cudaFuncGetAttributes fn

{-# INLINE cudaFuncGetAttributes #-}
cudaFuncGetAttributes :: (Fun) -> IO ((Status), (FunAttributes))
cudaFuncGetAttributes a2 =
  alloca $ \a1' ->
  withFun a2 $ \a2' ->
  cudaFuncGetAttributes'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  peek  a1'>>= \a1'' ->
  return (res', a1'')

{-# LINE 132 "src/Foreign/CUDA/Runtime/Exec.chs" #-}



-- |
-- On devices where the L1 cache and shared memory use the same hardware
-- resources, this sets the preferred cache configuration for the given device
-- function. This is only a preference; the driver is free to choose a different
-- configuration as required to execute the function.
--
-- Switching between configuration modes may insert a device-side
-- synchronisation point for streamed kernel launches
--
{-# INLINEABLE setCacheConfig #-}
setCacheConfig :: Fun -> CacheConfig -> IO ()
setCacheConfig !fn !pref = nothingIfOk =<< cudaFuncSetCacheConfig fn pref

{-# INLINE cudaFuncSetCacheConfig #-}
cudaFuncSetCacheConfig :: (Fun) -> (CacheConfig) -> IO ((Status))
cudaFuncSetCacheConfig a1 a2 =
  withFun a1 $ \a1' ->
  let {a2' = cFromEnum a2} in
  cudaFuncSetCacheConfig'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 155 "src/Foreign/CUDA/Runtime/Exec.chs" #-}



-- |
-- Invoke a kernel on a @(gx * gy)@ grid of blocks, where each block contains
-- @(tx * ty * tz)@ threads and has access to a given number of bytes of shared
-- memory. The launch may also be associated with a specific 'Stream'.
--
{-# INLINEABLE launchKernel #-}
launchKernel
    :: Fun              -- ^ Device function symbol
    -> (Int,Int)        -- ^ grid dimensions
    -> (Int,Int,Int)    -- ^ thread block shape
    -> Int64            -- ^ shared memory per block (bytes)
    -> Maybe Stream     -- ^ (optional) execution stream
    -> [FunParam]
    -> IO ()
launchKernel !fn (!gx,!gy) (!bx,!by,!bz) !sm !mst !args
  = (=<<) nothingIfOk
  $ withMany withFP args
  $ \pa -> withArray pa
  $ \pp -> cudaLaunchKernel_simple fn gx gy 1 bx by bz pp sm (fromMaybe defaultStream mst)
  where
    withFP :: FunParam -> (Ptr FunParam -> IO b) -> IO b
    withFP p f = case p of
      IArg v -> with' v (f . castPtr)
      FArg v -> with' v (f . castPtr)
      DArg v -> with' v (f . castPtr)
      VArg v -> with' v (f . castPtr)

    with' :: Storable a => a -> (Ptr a -> IO b) -> IO b
    with' !val !f =
      allocaBytes (sizeOf val) $ \ptr -> do
        poke ptr val
        f ptr

{-# INLINE cudaLaunchKernel_simple #-}
cudaLaunchKernel_simple :: (Fun) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Ptr (Ptr FunParam)) -> (Int64) -> (Stream) -> IO ((Status))
cudaLaunchKernel_simple a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 =
  withFun a1 $ \a1' ->
  let {a2' = fromIntegral a2} in
  let {a3' = fromIntegral a3} in
  let {a4' = fromIntegral a4} in
  let {a5' = fromIntegral a5} in
  let {a6' = fromIntegral a6} in
  let {a7' = fromIntegral a7} in
  let {a8' = castPtr a8} in
  let {a9' = fromIntegral a9} in
  let {a10' = useStream a10} in
  cudaLaunchKernel_simple'_ a1' a2' a3' a4' a5' a6' a7' a8' a9' a10' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 201 "src/Foreign/CUDA/Runtime/Exec.chs" #-}



--------------------------------------------------------------------------------
-- Internals
--------------------------------------------------------------------------------

-- CUDA 5.0 changed the type of a kernel function from char* to void*
--
withFun :: Fun -> (Ptr a -> IO b) -> IO b
withFun fn action = action (castFunPtrToPtr fn)


foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaFuncGetAttributes"
  cudaFuncGetAttributes'_ :: ((C2HSImp.Ptr (FunAttributes)) -> ((C2HSImp.Ptr ()) -> (IO C2HSImp.CInt)))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaFuncSetCacheConfig"
  cudaFuncSetCacheConfig'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CInt -> (IO C2HSImp.CInt)))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaLaunchKernel_simple"
  cudaLaunchKernel_simple'_ :: ((C2HSImp.Ptr ()) -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> (C2HSImp.CUInt -> ((C2HSImp.Ptr (C2HSImp.Ptr ())) -> (C2HSImp.CULong -> ((C2HSImp.Ptr ()) -> (IO C2HSImp.CInt)))))))))))