{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE EmptyDataDecls #-} {-# LANGUAGE ForeignFunctionInterface #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE TemplateHaskell #-} -------------------------------------------------------------------------------- -- | -- Module : Foreign.CUDA.Driver.Exec -- Copyright : [2009..2017] Trevor L. McDonell -- License : BSD -- -- Kernel execution control for low-level driver interface -- -------------------------------------------------------------------------------- module Foreign.CUDA.Driver.Exec ( -- * Kernel Execution Fun(Fun), FunParam(..), FunAttribute(..), SharedMem(..), requires, setCacheConfigFun, setSharedMemConfigFun, launchKernel, launchKernel', launchKernelCooperative, -- Deprecated since CUDA-4.0 setBlockShape, setSharedSize, setParams, launch, ) where #include "cbits/stubs.h" {# context lib="cuda" #} -- Friends import Foreign.CUDA.Internal.C2HS import Foreign.CUDA.Driver.Error import Foreign.CUDA.Driver.Context ( Cache(..), SharedMem(..) ) import Foreign.CUDA.Driver.Stream ( Stream(..), defaultStream ) -- System import Foreign import Foreign.C import Data.Maybe import Control.Monad ( zipWithM_ ) #if CUDA_VERSION >= 4000 {-# DEPRECATED setBlockShape, setSharedSize, setParams, launch "use launchKernel instead" #-} #endif -------------------------------------------------------------------------------- -- Data Types -------------------------------------------------------------------------------- -- | -- A @\_\_global\_\_@ device function -- newtype Fun = Fun { useFun :: {# type CUfunction #}} -- | -- Function attributes -- {# enum CUfunction_attribute as FunAttribute { underscoreToCase , MAX_THREADS_PER_BLOCK as MaxKernelThreadsPerBlock , MAX as CU_FUNC_ATTRIBUTE_MAX } -- ignore with prefix="CU_FUNC_ATTRIBUTE" deriving (Eq, Show) #} -- | -- Kernel function parameters -- data FunParam where IArg :: !Int32 -> FunParam FArg :: !Float -> FunParam VArg :: Storable a => !a -> FunParam instance Storable FunParam where sizeOf (IArg _) = sizeOf (undefined :: CUInt) sizeOf (FArg _) = sizeOf (undefined :: CFloat) sizeOf (VArg v) = sizeOf v alignment (IArg _) = alignment (undefined :: CUInt) alignment (FArg _) = alignment (undefined :: CFloat) alignment (VArg v) = alignment v poke p (IArg i) = poke (castPtr p) i poke p (FArg f) = poke (castPtr p) f poke p (VArg v) = poke (castPtr p) v peek _ = error "Can not peek Foreign.CUDA.Driver.FunParam" -------------------------------------------------------------------------------- -- Execution Control -------------------------------------------------------------------------------- -- | -- Returns the value of the selected attribute requirement for the given kernel. -- -- -- {-# INLINEABLE requires #-} requires :: Fun -> FunAttribute -> IO Int requires !fn !att = resultIfOk =<< cuFuncGetAttribute att fn {-# INLINE cuFuncGetAttribute #-} {# fun unsafe cuFuncGetAttribute { alloca- `Int' peekIntConv* , cFromEnum `FunAttribute' , useFun `Fun' } -> `Status' cToEnum #} -- | -- On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the given device -- function. This is only a preference; the driver is free to choose a different -- configuration as required to execute the function. -- -- Switching between configuration modes may insert a device-side -- synchronisation point for streamed kernel launches. -- -- Requires CUDA-3.0. -- -- -- {-# INLINEABLE setCacheConfigFun #-} setCacheConfigFun :: Fun -> Cache -> IO () #if CUDA_VERSION < 3000 setCacheConfigFun _ _ = requireSDK 'setCacheConfigFun 3.0 #else setCacheConfigFun !fn !pref = nothingIfOk =<< cuFuncSetCacheConfig fn pref {-# INLINE cuFuncSetCacheConfig #-} {# fun unsafe cuFuncSetCacheConfig { useFun `Fun' , cFromEnum `Cache' } -> `Status' cToEnum #} #endif -- | -- Set the shared memory configuration of a device function. -- -- On devices with configurable shared memory banks, this will force all -- subsequent launches of the given device function to use the specified -- shared memory bank size configuration. On launch of the function, the -- shared memory configuration of the device will be temporarily changed if -- needed to suit the function configuration. Changes in shared memory -- configuration may introduction a device side synchronisation between -- kernel launches. -- -- Any per-function configuration specified by 'setSharedMemConfig' will -- override the context-wide configuration set with -- 'Foreign.CUDA.Driver.Context.Config.setSharedMem'. -- -- Changing the shared memory bank size will not increase shared memory -- usage or affect occupancy of kernels, but may have major effects on -- performance. Larger bank sizes will allow for greater potential -- bandwidth to shared memory, but will change what kinds of accesses to -- shared memory will result in bank conflicts. -- -- This function will do nothing on devices with fixed shared memory bank -- size. -- -- Requires CUDA-5.0. -- -- -- {-# INLINEABLE setSharedMemConfigFun #-} setSharedMemConfigFun :: Fun -> SharedMem -> IO () #if CUDA_VERSION < 5000 setSharedMemConfigFun _ _ = requireSDK 'setSharedMemConfigFun 5.0 #else setSharedMemConfigFun !fun !pref = nothingIfOk =<< cuFuncSetSharedMemConfig fun pref {-# INLINE cuFuncSetSharedMemConfig #-} {# fun unsafe cuFuncSetSharedMemConfig { useFun `Fun' , cFromEnum `SharedMem' } -> `Status' cToEnum #} #endif -- | -- Invoke a kernel on a @(gx * gy * gz)@ grid of blocks, where each block -- contains @(tx * ty * tz)@ threads and has access to a given number of bytes -- of shared memory. The launch may also be associated with a specific 'Stream'. -- -- In 'launchKernel', the number of kernel parameters and their offsets and -- sizes do not need to be specified, as this information is retrieved directly -- from the kernel's image. This requires the kernel to have been compiled with -- toolchain version 3.2 or later. -- -- The alternative 'launchKernel'' will pass the arguments in directly, -- requiring the application to know the size and alignment/padding of each -- kernel parameter. -- -- -- {-# INLINEABLE launchKernel #-} {-# INLINEABLE launchKernel' #-} launchKernel, launchKernel' :: Fun -- ^ function to execute -> (Int,Int,Int) -- ^ block grid dimension -> (Int,Int,Int) -- ^ thread block shape -> Int -- ^ shared memory (bytes) -> Maybe Stream -- ^ (optional) stream to execute in -> [FunParam] -- ^ list of function parameters -> IO () #if CUDA_VERSION >= 4000 launchKernel !fn (!gx,!gy,!gz) (!tx,!ty,!tz) !sm !mst !args = (=<<) nothingIfOk $ withMany withFP args $ \pa -> withArray pa $ \pp -> cuLaunchKernel fn gx gy gz tx ty tz sm st pp nullPtr where !st = fromMaybe defaultStream mst withFP :: FunParam -> (Ptr FunParam -> IO b) -> IO b withFP !p !f = case p of IArg v -> with' v (f . castPtr) FArg v -> with' v (f . castPtr) VArg v -> with' v (f . castPtr) -- can't use the standard 'with' because 'alloca' will pass an undefined -- dummy argument when determining 'sizeOf' and 'alignment', but sometimes -- instances in Accelerate need to evaluate this argument. -- with' :: Storable a => a -> (Ptr a -> IO b) -> IO b with' !val !f = allocaBytes (sizeOf val) $ \ptr -> do poke ptr val f ptr launchKernel' !fn (!gx,!gy,!gz) (!tx,!ty,!tz) !sm !mst !args = (=<<) nothingIfOk $ with bytes $ \pb -> withArray' args $ \pa -> withArray0 nullPtr [buffer, castPtr pa, size, castPtr pb] $ \pp -> cuLaunchKernel fn gx gy gz tx ty tz sm st nullPtr pp where buffer = wordPtrToPtr 0x01 -- CU_LAUNCH_PARAM_BUFFER_POINTER size = wordPtrToPtr 0x02 -- CU_LAUNCH_PARAM_BUFFER_SIZE bytes = foldl (\a x -> a + sizeOf x) 0 args st = fromMaybe defaultStream mst -- can't use the standard 'withArray' because 'mallocArray' will pass -- 'undefined' to 'sizeOf' when determining how many bytes to allocate, but -- our Storable instance for FunParam needs to dispatch on each constructor, -- hence evaluating the undefined. -- withArray' !vals !f = allocaBytes bytes $ \ptr -> do pokeArray ptr vals f ptr {-# INLINE cuLaunchKernel #-} {# fun unsafe cuLaunchKernel { useFun `Fun' , `Int', `Int', `Int' , `Int', `Int', `Int' , `Int' , useStream `Stream' , castPtr `Ptr (Ptr FunParam)' , castPtr `Ptr (Ptr ())' } -> `Status' cToEnum #} #else launchKernel !fn (!gx,!gy,_) (!tx,!ty,!tz) !sm !mst !args = do setParams fn args setSharedSize fn (toInteger sm) setBlockShape fn (tx,ty,tz) launch fn (gx,gy) mst launchKernel' = launchKernel #endif -- | -- Invoke a kernel on a @(gx * gy * gz)@ grid of blocks, where each block -- contains @(tx * ty * tz)@ threads and has access to a given number of bytes -- of shared memory. The launch may also be associated with a specific stream. -- -- The thread blocks can cooperate and synchronise as they execute. -- -- The device on which this kernel is invoked must have -- 'Foreign.CUDA.Driver.Device.attribute' -- 'Foreign.CUDA.Driver.Device.CooperativeLaunch'. -- -- The total number of blocks launched can not exceed the maximum number of -- active thread blocks per multiprocessor -- ('Foreign.CUDA.Analysis.Device.threadBlocksPerMP'), multiplied by the number -- of multiprocessors ('Foreign.CUDA.Analysis.Device.multiProcessorCount'). -- -- The kernel can not make use of dynamic parallelism. -- -- -- -- Requires CUDA-9.0 -- -- @since 0.9.0.0@ -- {-# INLINEABLE launchKernelCooperative #-} launchKernelCooperative :: Fun -- ^ function to execute -> (Int,Int,Int) -- ^ block grid dimension -> (Int,Int,Int) -- ^ thread block shape -> Int -- ^ shared memory (bytes) -> Maybe Stream -- ^ (optional) stream to execute in -> [FunParam] -- ^ list of function parameters -> IO () #if CUDA_VERSION < 9000 launchKernelCooperative _ _ _ _ _ _ = requireSDK 'launchKernelCooperative 9.0 #else launchKernelCooperative !fn (!gx,!gy,!gz) (!tx,!ty,!tz) !sm !mst !args = (=<<) nothingIfOk $ withMany withFP args $ \pa -> withArray pa $ \pp -> cuLaunchCooperativeKernel fn gx gy gz tx ty tz sm st pp where !st = fromMaybe defaultStream mst withFP :: FunParam -> (Ptr FunParam -> IO b) -> IO b withFP !p !f = case p of IArg v -> with' v (f . castPtr) FArg v -> with' v (f . castPtr) VArg v -> with' v (f . castPtr) -- can't use the standard 'with' because 'alloca' will pass an undefined -- dummy argument when determining 'sizeOf' and 'alignment', but sometimes -- instances in Accelerate need to evaluate this argument. -- with' :: Storable a => a -> (Ptr a -> IO b) -> IO b with' !val !f = allocaBytes (sizeOf val) $ \ptr -> do poke ptr val f ptr {-# INLINE cuLaunchCooperativeKernel #-} {# fun unsafe cuLaunchCooperativeKernel { useFun `Fun' , `Int', `Int', `Int' , `Int', `Int', `Int' , `Int' , useStream `Stream' , castPtr `Ptr (Ptr FunParam)' } -> `Status' cToEnum #} #endif -- TODO: cuLaunchCooperativeKernelMultiDevice introduced CUDA-9.0 -------------------------------------------------------------------------------- -- Deprecated -------------------------------------------------------------------------------- -- | -- Invoke the kernel on a size @(w,h)@ grid of blocks. Each block contains the -- number of threads specified by a previous call to 'setBlockShape'. The launch -- may also be associated with a specific 'Stream'. -- {-# INLINEABLE launch #-} launch :: Fun -> (Int,Int) -> Maybe Stream -> IO () launch !fn (!w,!h) mst = nothingIfOk =<< cuLaunchGridAsync fn w h (fromMaybe defaultStream mst) {-# INLINE cuLaunchGridAsync #-} {# fun unsafe cuLaunchGridAsync { useFun `Fun' , `Int' , `Int' , useStream `Stream' } -> `Status' cToEnum #} -- | -- Specify the @(x,y,z)@ dimensions of the thread blocks that are created when -- the given kernel function is launched. -- {-# INLINEABLE setBlockShape #-} setBlockShape :: Fun -> (Int,Int,Int) -> IO () setBlockShape !fn (!x,!y,!z) = nothingIfOk =<< cuFuncSetBlockShape fn x y z {-# INLINE cuFuncSetBlockShape #-} {# fun unsafe cuFuncSetBlockShape { useFun `Fun' , `Int' , `Int' , `Int' } -> `Status' cToEnum #} -- | -- Set the number of bytes of dynamic shared memory to be available to each -- thread block when the function is launched -- {-# INLINEABLE setSharedSize #-} setSharedSize :: Fun -> Integer -> IO () setSharedSize !fn !bytes = nothingIfOk =<< cuFuncSetSharedSize fn bytes {-# INLINE cuFuncSetSharedSize #-} {# fun unsafe cuFuncSetSharedSize { useFun `Fun' , cIntConv `Integer' } -> `Status' cToEnum #} -- | -- Set the parameters that will specified next time the kernel is invoked -- {-# INLINEABLE setParams #-} setParams :: Fun -> [FunParam] -> IO () setParams !fn !prs = do zipWithM_ (set fn) offsets prs nothingIfOk =<< cuParamSetSize fn (last offsets) where offsets = scanl (\a b -> a + size b) 0 prs size (IArg _) = sizeOf (undefined :: CUInt) size (FArg _) = sizeOf (undefined :: CFloat) size (VArg v) = sizeOf v set f o (IArg v) = nothingIfOk =<< cuParamSeti f o v set f o (FArg v) = nothingIfOk =<< cuParamSetf f o v set f o (VArg v) = with v $ \p -> (nothingIfOk =<< cuParamSetv f o p (sizeOf v)) {-# INLINE cuParamSetSize #-} {# fun unsafe cuParamSetSize { useFun `Fun' , `Int' } -> `Status' cToEnum #} {-# INLINE cuParamSeti #-} {# fun unsafe cuParamSeti { useFun `Fun' , `Int' , `Int32' } -> `Status' cToEnum #} {-# INLINE cuParamSetf #-} {# fun unsafe cuParamSetf { useFun `Fun' , `Int' , `Float' } -> `Status' cToEnum #} {-# INLINE cuParamSetv #-} {# fun unsafe cuParamSetv `Storable a' => { useFun `Fun' , `Int' , castPtr `Ptr a' , `Int' } -> `Status' cToEnum #}