{-# LANGUAGE GADTs #-} -- | -- Module : Data.Array.Accelerate.CUDA.Analysis.Launch -- Copyright : [2008..2010] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell -- License : BSD3 -- -- Maintainer : Manuel M T Chakravarty -- Stability : experimental -- Portability : non-partable (GHC extensions) -- module Data.Array.Accelerate.CUDA.Analysis.Launch (launchConfig) where import Control.Monad.IO.Class import Data.Int import Data.Array.Accelerate.AST import Data.Array.Accelerate.Analysis.Type import Data.Array.Accelerate.CUDA.State import qualified Foreign.CUDA.Analysis as CUDA import qualified Foreign.CUDA.Driver as CUDA import qualified Foreign.Storable as F -- | -- Determine kernel launch parameters for the given array computation (as well -- as compiled function module). This consists of the thread block size, number -- of blocks, and dynamically allocated shared memory (bytes), respectively. -- -- By default, this launches the kernel with the minimum block size that gives -- maximum occupancy, and the grid size limited to the maximum number of -- physically resident blocks. Hence, kernels may need to process multiple -- elements per thread. -- -- TLM: this could probably be stored in the KernelEntry -- launchConfig :: OpenAcc aenv a -> Int -> CUDA.Fun -> CIO (Int, Int, Integer) launchConfig acc n fn = do regs <- liftIO $ CUDA.requires fn CUDA.NumRegs stat <- liftIO $ CUDA.requires fn CUDA.SharedSizeBytes -- static memory only prop <- getM deviceProps let dyn = sharedMem prop acc (cta, occ) = blockSize prop acc regs ((stat+) . dyn) mbk = CUDA.multiProcessorCount prop * CUDA.activeThreadBlocks occ return (cta, mbk `min` gridSize prop acc n cta, toInteger (dyn cta)) -- | -- Determine the optimal thread block size for a given array computation. Fold -- requires blocks with a power-of-two number of threads. -- blockSize :: CUDA.DeviceProperties -> OpenAcc aenv a -> Int -> (Int -> Int) -> (Int, CUDA.Occupancy) blockSize p (Fold _ _ _) r s = CUDA.optimalBlockSizeBy p CUDA.incPow2 (const r) s blockSize p _ r s = CUDA.optimalBlockSizeBy p CUDA.incWarp (const r) s -- | -- Determine the number of blocks of the given size necessary to process the -- given array expression. This should understand things like #elements per -- thread for the various kernels. -- -- foldSeg: 'size' is the number of segments, require one warp per segment -- gridSize :: CUDA.DeviceProperties -> OpenAcc aenv a -> Int -> Int -> Int gridSize p (FoldSeg _ _ _ _) size cta = ((size * CUDA.warpSize p) + cta - 1) `div` cta gridSize _ acc size cta = let between arr n = (n+arr-1) `div` n in 1 `max` ((cta - 1 + (size `between` elementsPerThread acc)) `div` cta) elementsPerThread :: OpenAcc aenv a -> Int elementsPerThread _ = 1 -- | -- Analyse the given array expression, returning an estimate of dynamic shared -- memory usage as a function of thread block size. This can be used by the -- occupancy calculator to optimise kernel launch shape. -- sharedMem :: CUDA.DeviceProperties -> OpenAcc aenv a -> Int -> Int sharedMem _ (Fold _ x _) blockDim = sizeOf (expType x) * blockDim sharedMem _ (Scanl _ x _) blockDim = sizeOf (expType x) * blockDim sharedMem _ (Scanr _ x _) blockDim = sizeOf (expType x) * blockDim sharedMem p (FoldSeg _ x _ _) blockDim = let warp = CUDA.warpSize p in (blockDim `div` warp * 2) * F.sizeOf (undefined :: Int32) + (blockDim + warp `div` 2) * sizeOf (expType x) sharedMem _ _ _ = 0