{-# LANGUAGE GADTs, ForeignFunctionInterface #-} -------------------------------------------------------------------------------- -- | -- Module : Foreign.CUDA.Runtime.Exec -- Copyright : (c) [2009..2010] Trevor L. McDonell -- License : BSD -- -- Kernel execution control for C-for-CUDA runtime interface -- -------------------------------------------------------------------------------- #include "cbits/stubs.h" #include {# context lib="cudart" #} module Foreign.CUDA.Runtime.Exec ( FunAttributes(..), FunParam(..), #if CUDART_VERSION >= 3000 CacheConfig(..), #endif attributes, setConfig, setParams, #if CUDART_VERSION >= 3000 setCacheConfig, #endif launch ) where -- Friends import Foreign.CUDA.Runtime.Stream import Foreign.CUDA.Runtime.Error import Foreign.CUDA.Internal.C2HS -- System import Foreign import Foreign.C import Control.Monad #c typedef struct cudaFuncAttributes cudaFuncAttributes; #endc -------------------------------------------------------------------------------- -- Data Types -------------------------------------------------------------------------------- -- -- Function Attributes -- {# pointer *cudaFuncAttributes as ^ foreign -> FunAttributes nocode #} data FunAttributes = FunAttributes { constSizeBytes :: Int64, localSizeBytes :: Int64, sharedSizeBytes :: Int64, maxKernelThreadsPerBlock :: Int, -- ^ maximum block size that can be successively launched (based on register usage) numRegs :: Int -- ^ number of registers required for each thread } deriving (Show) instance Storable FunAttributes where sizeOf _ = {# sizeof cudaFuncAttributes #} alignment _ = alignment (undefined :: Ptr ()) peek p = do cs <- cIntConv `fmap` {#get cudaFuncAttributes.constSizeBytes#} p ls <- cIntConv `fmap` {#get cudaFuncAttributes.localSizeBytes#} p ss <- cIntConv `fmap` {#get cudaFuncAttributes.sharedSizeBytes#} p tb <- cIntConv `fmap` {#get cudaFuncAttributes.maxThreadsPerBlock#} p nr <- cIntConv `fmap` {#get cudaFuncAttributes.numRegs#} p return FunAttributes { constSizeBytes = cs, localSizeBytes = ls, sharedSizeBytes = ss, maxKernelThreadsPerBlock = tb, numRegs = nr } #if CUDART_VERSION >= 3000 -- | -- Cache configuration preference -- {# enum cudaFuncCache as CacheConfig { } with prefix="cudaFuncCachePrefer" deriving (Eq, Show) #} #endif -- | -- Kernel function parameters. Doubles will be converted to an internal float -- representation on devices that do not support doubles natively. -- data FunParam where IArg :: Int -> FunParam FArg :: Float -> FunParam DArg :: Double -> FunParam VArg :: Storable a => a -> FunParam -------------------------------------------------------------------------------- -- Execution Control -------------------------------------------------------------------------------- -- | -- Obtain the attributes of the named @__global__@ device function. This -- itemises the requirements to successfully launch the given kernel. -- attributes :: String -> IO FunAttributes attributes fn = resultIfOk =<< cudaFuncGetAttributes fn {# fun unsafe cudaFuncGetAttributes { alloca- `FunAttributes' peek* , withCString* `String' } -> `Status' cToEnum #} -- | -- Specify the grid and block dimensions for a device call. Used in conjunction -- with 'setParams', this pushes data onto the execution stack that will be -- popped when a function is 'launch'ed. -- setConfig :: (Int,Int) -- ^ grid dimensions -> (Int,Int,Int) -- ^ block dimensions -> Int64 -- ^ shared memory per block (bytes) -> Maybe Stream -- ^ associated processing stream -> IO () setConfig (gx,gy) (bx,by,bz) sharedMem mst = nothingIfOk =<< cudaConfigureCallSimple gx gy bx by bz sharedMem (maybe defaultStream id mst) -- -- The FFI does not support passing deferenced structures to C functions, as -- this is highly platform/compiler dependent. Wrap our own function stub -- accepting plain integers. -- {# fun unsafe cudaConfigureCallSimple { `Int', `Int' , `Int', `Int', `Int' , cIntConv `Int64' , useStream `Stream' } -> `Status' cToEnum #} -- | -- Set the argument parameters that will be passed to the next kernel -- invocation. This is used in conjunction with 'setConfig' to control kernel -- execution. setParams :: [FunParam] -> IO () setParams = foldM_ k 0 where k offset arg = do let s = size arg set arg s offset >>= nothingIfOk return (offset + s) size (IArg _) = sizeOf (undefined :: Int) size (FArg _) = sizeOf (undefined :: Float) size (DArg _) = sizeOf (undefined :: Double) size (VArg a) = sizeOf a set (IArg v) s o = cudaSetupArgument v s o set (FArg v) s o = cudaSetupArgument v s o set (VArg v) s o = cudaSetupArgument v s o set (DArg v) s o = cudaSetDoubleForDevice v >>= resultIfOk >>= \d -> cudaSetupArgument d s o {# fun unsafe cudaSetupArgument `Storable a' => { with'* `a' , `Int' , `Int' } -> `Status' cToEnum #} where with' v a = with v $ \p -> a (castPtr p) {# fun unsafe cudaSetDoubleForDevice { with'* `Double' peek'* } -> `Status' cToEnum #} where with' v a = with v $ \p -> a (castPtr p) peek' = peek . castPtr #if CUDART_VERSION >= 3000 -- | -- On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the given device -- function. This is only a preference; the driver is free to choose a different -- configuration as required to execute the function. -- -- Switching between configuration modes may insert a device-side -- synchronisation point for streamed kernel launches -- setCacheConfig :: String -> CacheConfig -> IO () setCacheConfig fn pref = nothingIfOk =<< cudaFuncSetCacheConfig fn pref {# fun unsafe cudaFuncSetCacheConfig { withCString* `String' , cFromEnum `CacheConfig' } -> `Status' cToEnum #} #endif -- | -- Invoke the named kernel on the device, which must have been declared -- @__global__@. This must be preceded by a call to 'setConfig' and (if -- appropriate) 'setParams'. -- launch :: String -> IO () launch fn = nothingIfOk =<< cudaLaunch fn {# fun unsafe cudaLaunch { withCString* `String' } -> `Status' cToEnum #}