-- GENERATED by C->Haskell Compiler, version 0.25.2 Snowboundest, 31 Oct 2014 (Haskell)
-- Edit the ORIGNAL .chs file instead!


{-# LINE 1 "./Foreign/CUDA/Runtime/Exec.chs" #-}
{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE CPP                      #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE GADTs                    #-}
{-# LANGUAGE TemplateHaskell          #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Runtime.Exec
-- Copyright : [2009..2014] Trevor L. McDonell
-- License   : BSD
--
-- Kernel execution control for C-for-CUDA runtime interface
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Runtime.Exec (

  -- * Kernel Execution
  Fun, FunAttributes(..), FunParam(..), CacheConfig(..),
  attributes, setConfig, setParams, setCacheConfig, launch, launchKernel,

) where



{-# LINE 25 "./Foreign/CUDA/Runtime/Exec.chs" #-}


-- Friends
import Foreign.CUDA.Runtime.Stream                      ( Stream(..), defaultStream )
import Foreign.CUDA.Runtime.Error
import Foreign.CUDA.Internal.C2HS

-- System
import Foreign
import Foreign.C
import Control.Monad

--------------------------------------------------------------------------------
-- Data Types
--------------------------------------------------------------------------------

-- |
-- A @__global__@ device function.
--
-- Note that the use of a string naming a function was deprecated in CUDA 4.1
-- and removed in CUDA 5.0.
--
type Fun = FunPtr ()


--
-- Function Attributes
--

{-# LINE 61 "./Foreign/CUDA/Runtime/Exec.chs" #-}


data FunAttributes = FunAttributes
  {
    constSizeBytes           :: !Int64,
    localSizeBytes           :: !Int64,
    sharedSizeBytes          :: !Int64,
    maxKernelThreadsPerBlock :: !Int,   -- ^ maximum block size that can be successively launched (based on register usage)
    numRegs                  :: !Int    -- ^ number of registers required for each thread
  }
  deriving (Show)

instance Storable FunAttributes where
  sizeOf _    = 44
{-# LINE 74 "./Foreign/CUDA/Runtime/Exec.chs" #-}

  alignment _ = alignment (undefined :: Ptr ())

  poke _ _    = error "Can not poke Foreign.CUDA.Runtime.FunAttributes"
  peek p      = do
    cs <- cIntConv `fmap` (\ptr -> do {peekByteOff ptr 8 :: IO CULong}) p
    ls <- cIntConv `fmap` (\ptr -> do {peekByteOff ptr 16 :: IO CULong}) p
    ss <- cIntConv `fmap` (\ptr -> do {peekByteOff ptr 0 :: IO CULong}) p
    tb <- cIntConv `fmap` (\ptr -> do {peekByteOff ptr 24 :: IO CInt}) p
    nr <- cIntConv `fmap` (\ptr -> do {peekByteOff ptr 28 :: IO CInt}) p

    return FunAttributes
      {
        constSizeBytes           = cs,
        localSizeBytes           = ls,
        sharedSizeBytes          = ss,
        maxKernelThreadsPerBlock = tb,
        numRegs                  = nr
      }

-- |
-- Cache configuration preference
--
data CacheConfig = None
                 | Shared
                 | L1
                 | Equal
  deriving (Eq,Show)
instance Enum CacheConfig where
  succ None = Shared
  succ Shared = L1
  succ L1 = Equal
  succ Equal = error "CacheConfig.succ: Equal has no successor"

  pred Shared = None
  pred L1 = Shared
  pred Equal = L1
  pred None = error "CacheConfig.pred: None has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from Equal

  fromEnum None = 0
  fromEnum Shared = 1
  fromEnum L1 = 2
  fromEnum Equal = 3

  toEnum 0 = None
  toEnum 1 = Shared
  toEnum 2 = L1
  toEnum 3 = Equal
  toEnum unmatched = error ("CacheConfig.toEnum: Cannot match " ++ show unmatched)

{-# LINE 103 "./Foreign/CUDA/Runtime/Exec.chs" #-}


-- |
-- Kernel function parameters. Doubles will be converted to an internal float
-- representation on devices that do not support doubles natively.
--
data FunParam where
  IArg :: !Int             -> FunParam
  FArg :: !Float           -> FunParam
  DArg :: !Double          -> FunParam
  VArg :: Storable a => !a -> FunParam


--------------------------------------------------------------------------------
-- Execution Control
--------------------------------------------------------------------------------

-- |
-- Obtain the attributes of the named @__global__@ device function. This
-- itemises the requirements to successfully launch the given kernel.
--
{-# INLINEABLE attributes #-}
attributes :: Fun -> IO FunAttributes
attributes !fn = resultIfOk =<< cudaFuncGetAttributes fn

{-# INLINE cudaFuncGetAttributes #-}
cudaFuncGetAttributes :: (Fun) -> IO ((Status), (FunAttributes))
cudaFuncGetAttributes a2 =
  alloca $ \a1' -> 
  withFun a2 $ \a2' -> 
  cudaFuncGetAttributes'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  peek  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 131 "./Foreign/CUDA/Runtime/Exec.chs" #-}



-- |
-- Specify the grid and block dimensions for a device call. Used in conjunction
-- with 'setParams', this pushes data onto the execution stack that will be
-- popped when a function is 'launch'ed.
--
{-# INLINEABLE setConfig #-}
setConfig :: (Int,Int)          -- ^ grid dimensions
          -> (Int,Int,Int)      -- ^ block dimensions
          -> Int64              -- ^ shared memory per block (bytes)
          -> Maybe Stream       -- ^ associated processing stream
          -> IO ()
setConfig (!gx,!gy) (!bx,!by,!bz) !sharedMem !mst =
  nothingIfOk =<<
    cudaConfigureCallSimple gx gy bx by bz sharedMem (maybe defaultStream id mst)


--
-- The FFI does not support passing deferenced structures to C functions, as
-- this is highly platform/compiler dependent. Wrap our own function stub
-- accepting plain integers.
--
{-# INLINE cudaConfigureCallSimple #-}
cudaConfigureCallSimple :: (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int64) -> (Stream) -> IO ((Status))
cudaConfigureCallSimple a1 a2 a3 a4 a5 a6 a7 =
  let {a1' = fromIntegral a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = fromIntegral a5} in 
  let {a6' = cIntConv a6} in 
  let {a7' = useStream a7} in 
  cudaConfigureCallSimple'_ a1' a2' a3' a4' a5' a6' a7' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 160 "./Foreign/CUDA/Runtime/Exec.chs" #-}



-- |
-- Set the argument parameters that will be passed to the next kernel
-- invocation. This is used in conjunction with 'setConfig' to control kernel
-- execution.
--
{-# INLINEABLE setParams #-}
setParams :: [FunParam] -> IO ()
setParams = foldM_ k 0
  where
    k !offset !arg = do
      let s = size arg
      set arg s offset >>= nothingIfOk
      return (offset + s)

    size (IArg _) = sizeOf (undefined :: Int)
    size (FArg _) = sizeOf (undefined :: Float)
    size (DArg _) = sizeOf (undefined :: Double)
    size (VArg a) = sizeOf a

    set (IArg v) s o = cudaSetupArgument v s o
    set (FArg v) s o = cudaSetupArgument v s o
    set (VArg v) s o = cudaSetupArgument v s o
    set (DArg v) s o =
      cudaSetDoubleForDevice v >>= resultIfOk >>= \d ->
      cudaSetupArgument d s o


{-# INLINE cudaSetupArgument #-}
cudaSetupArgument :: Storable a => (a) -> (Int) -> (Int) -> IO ((Status))
cudaSetupArgument a1 a2 a3 =
  with' a1 $ \a1' -> 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  cudaSetupArgument'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 195 "./Foreign/CUDA/Runtime/Exec.chs" #-}

  where
    with' v a = with v $ \p -> a (castPtr p)

{-# INLINE cudaSetDoubleForDevice #-}
cudaSetDoubleForDevice :: (Double) -> IO ((Status), (Double))
cudaSetDoubleForDevice a1 =
  with' a1 $ \a1' -> 
  cudaSetDoubleForDevice'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  peek'  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 201 "./Foreign/CUDA/Runtime/Exec.chs" #-}

  where
    with' v a = with v $ \p -> a (castPtr p)
    peek'     = peek . castPtr


-- |
-- On devices where the L1 cache and shared memory use the same hardware
-- resources, this sets the preferred cache configuration for the given device
-- function. This is only a preference; the driver is free to choose a different
-- configuration as required to execute the function.
--
-- Switching between configuration modes may insert a device-side
-- synchronisation point for streamed kernel launches
--
{-# INLINEABLE setCacheConfig #-}
setCacheConfig :: Fun -> CacheConfig -> IO ()
setCacheConfig !fn !pref = nothingIfOk =<< cudaFuncSetCacheConfig fn pref

{-# INLINE cudaFuncSetCacheConfig #-}
cudaFuncSetCacheConfig :: (Fun) -> (CacheConfig) -> IO ((Status))
cudaFuncSetCacheConfig a1 a2 =
  withFun a1 $ \a1' -> 
  let {a2' = cFromEnum a2} in 
  cudaFuncSetCacheConfig'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 227 "./Foreign/CUDA/Runtime/Exec.chs" #-}



-- |
-- Invoke the @__global__@ kernel function on the device. This must be preceded
-- by a call to 'setConfig' and (if appropriate) 'setParams'.
--
{-# INLINEABLE launch #-}
launch :: Fun -> IO ()
launch !fn = nothingIfOk =<< cudaLaunch fn

{-# INLINE cudaLaunch #-}
cudaLaunch :: (Fun) -> IO ((Status))
cudaLaunch a1 =
  withFun a1 $ \a1' -> 
  cudaLaunch'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 240 "./Foreign/CUDA/Runtime/Exec.chs" #-}



-- |
-- Invoke a kernel on a @(gx * gy)@ grid of blocks, where each block contains
-- @(tx * ty * tz)@ threads and has access to a given number of bytes of shared
-- memory. The launch may also be associated with a specific 'Stream'.
--
{-# INLINEABLE launchKernel #-}
launchKernel
    :: Fun              -- ^ Device function symbol
    -> (Int,Int)        -- ^ grid dimensions
    -> (Int,Int,Int)    -- ^ thread block shape
    -> Int64            -- ^ shared memory per block (bytes)
    -> Maybe Stream     -- ^ (optional) execution stream
    -> [FunParam]
    -> IO ()
launchKernel !fn !grid !block !sm !mst !args = do
  setConfig grid block sm mst
  setParams args
  launch fn

--------------------------------------------------------------------------------
-- Internals
--------------------------------------------------------------------------------

-- CUDA 5.0 changed the type of a kernel function from char* to void*
--
withFun :: Fun -> (Ptr a -> IO b) -> IO b
withFun fn action = action (castFunPtrToPtr fn)


foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaFuncGetAttributes"
  cudaFuncGetAttributes'_ :: ((Ptr (FunAttributes)) -> ((Ptr ()) -> (IO CInt)))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaConfigureCallSimple"
  cudaConfigureCallSimple'_ :: (CInt -> (CInt -> (CInt -> (CInt -> (CInt -> (CULong -> ((Ptr ()) -> (IO CInt))))))))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaSetupArgument"
  cudaSetupArgument'_ :: ((Ptr ()) -> (CULong -> (CULong -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaSetDoubleForDevice"
  cudaSetDoubleForDevice'_ :: ((Ptr CDouble) -> (IO CInt))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaFuncSetCacheConfig"
  cudaFuncSetCacheConfig'_ :: ((Ptr ()) -> (CInt -> (IO CInt)))

foreign import ccall unsafe "Foreign/CUDA/Runtime/Exec.chs.h cudaLaunch"
  cudaLaunch'_ :: ((Ptr ()) -> (IO CInt))