{-# LANGUAGE CPP #-} {-# LANGUAGE GADTs #-} -- | -- Module : Data.Array.Accelerate.CUDA.Analysis.Launch -- Copyright : [2008..2010] Manuel M T Chakravarty, Gabriele Keller, Sean Lee -- [2009..2012] Manuel M T Chakravarty, Gabriele Keller, Trevor L. McDonell -- License : BSD3 -- -- Maintainer : Trevor L. McDonell -- Stability : experimental -- Portability : non-portable (GHC extensions) -- module Data.Array.Accelerate.CUDA.Analysis.Launch ( launchConfig, determineOccupancy ) where -- friends import Data.Array.Accelerate.AST import Data.Array.Accelerate.Trafo import Data.Array.Accelerate.Analysis.Type import Data.Array.Accelerate.Analysis.Shape -- library import qualified Foreign.CUDA.Analysis as CUDA import qualified Foreign.CUDA.Driver as CUDA #include "accelerate.h" -- | -- Determine kernel launch parameters for the given array computation (as well -- as compiled function module). This consists of the thread block size, number -- of blocks, and dynamically allocated shared memory (bytes), respectively. -- -- For most operations, this selects the minimum block size that gives maximum -- occupancy, and the grid size limited to the maximum number of physically -- resident blocks. Hence, kernels may need to process multiple elements per -- thread. Scan operations select the largest block size of maximum occupancy. -- launchConfig :: DelayedOpenAcc aenv a -> CUDA.DeviceProperties -- the device being executed on -> CUDA.Occupancy -- kernel occupancy information -> ( Int -- block size , Int -> Int -- number of blocks for input problem size (grid) , Int ) -- shared memory (bytes) launchConfig Delayed{} _ _ = INTERNAL_ERROR(error) "launchConfig" "encountered delayed array" launchConfig (Manifest acc) dev occ = let cta = CUDA.activeThreads occ `div` CUDA.activeThreadBlocks occ maxGrid = CUDA.multiProcessorCount dev * CUDA.activeThreadBlocks occ smem = sharedMem dev acc cta in (cta, \n -> maxGrid `min` gridSize dev acc n cta, smem) -- | -- Determine maximal occupancy statistics for the given kernel / device -- combination. -- determineOccupancy :: DelayedOpenAcc aenv a -> CUDA.DeviceProperties -> CUDA.Fun -- corresponding __global__ entry function -> Int -- maximum number of threads per block -> IO CUDA.Occupancy determineOccupancy Delayed{} _ _ _ = INTERNAL_ERROR(error) "determineOccupancy" "encountered delayed array" determineOccupancy (Manifest acc) dev fn maxBlock = do registers <- CUDA.requires fn CUDA.NumRegs static_smem <- CUDA.requires fn CUDA.SharedSizeBytes -- static memory only return . snd $ blockSize dev acc maxBlock registers (\threads -> static_smem + dynamic_smem threads) where dynamic_smem = sharedMem dev acc -- | -- Determine an optimal thread block size for a given array computation. Fold -- requires blocks with a power-of-two number of threads. Scans select the -- largest size thread block possible, because if only one thread block is -- needed we can calculate the scan in a single pass, rather than three. -- blockSize :: CUDA.DeviceProperties -> PreOpenAcc DelayedOpenAcc aenv a -> Int -- maximum number of threads per block -> Int -- number of registers used -> (Int -> Int) -- shared memory as a function of thread block size (bytes) -> (Int, CUDA.Occupancy) blockSize dev acc lim regs smem = CUDA.optimalBlockSizeBy dev (filter (<= lim) . strategy) (const regs) smem where strategy = case acc of Fold _ _ _ -> CUDA.incPow2 Fold1 _ _ -> CUDA.incPow2 Scanl _ _ _ -> CUDA.incWarp Scanl' _ _ _ -> CUDA.incWarp Scanl1 _ _ -> CUDA.incWarp Scanr _ _ _ -> CUDA.incWarp Scanr' _ _ _ -> CUDA.incWarp Scanr1 _ _ -> CUDA.incWarp _ -> CUDA.decWarp -- | -- Determine the number of blocks of the given size necessary to process the -- given array expression. This should understand things like #elements per -- thread for the various kernels. -- -- The 'size' parameter is typically the number of elements in the array, except -- for the following instances: -- -- * foldSeg: the number of segments; require one warp per segment -- -- * fold: for multidimensional reductions, this is the size of the shape tail -- for 1D reductions this is the total number of elements -- gridSize :: CUDA.DeviceProperties -> PreOpenAcc DelayedOpenAcc aenv a -> Int -> Int -> Int gridSize p acc@(FoldSeg _ _ _ _) size cta = split acc (size * CUDA.warpSize p) cta gridSize p acc@(Fold1Seg _ _ _) size cta = split acc (size * CUDA.warpSize p) cta gridSize _ acc@(Fold _ _ _) size cta = if preAccDim delayedDim acc == 0 then split acc size cta else max 1 size gridSize _ acc@(Fold1 _ _) size cta = if preAccDim delayedDim acc == 0 then split acc size cta else max 1 size gridSize _ acc size cta = split acc size cta split :: acc aenv a -> Int -> Int -> Int split acc size cta = (size `between` eltsPerThread acc) `between` cta where between arr n = 1 `max` ((n + arr - 1) `div` n) eltsPerThread _ = 1 -- | -- Analyse the given array expression, returning an estimate of dynamic shared -- memory usage as a function of thread block size. This can be used by the -- occupancy calculator to optimise kernel launch shape. -- sharedMem :: CUDA.DeviceProperties -> PreOpenAcc DelayedOpenAcc aenv a -> Int -> Int -- non-computation forms sharedMem _ (Alet _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Let" sharedMem _ (Avar _) _ = INTERNAL_ERROR(error) "sharedMem" "Avar" sharedMem _ (Apply _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Apply" sharedMem _ (Acond _ _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Acond" sharedMem _ (Atuple _) _ = INTERNAL_ERROR(error) "sharedMem" "Atuple" sharedMem _ (Aprj _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Aprj" sharedMem _ (Use _) _ = INTERNAL_ERROR(error) "sharedMem" "Use" sharedMem _ (Unit _) _ = INTERNAL_ERROR(error) "sharedMem" "Unit" sharedMem _ (Reshape _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Reshape" sharedMem _ (Aforeign _ _ _) _ = INTERNAL_ERROR(error) "sharedMem" "Aforeign" -- skeleton nodes sharedMem _ (Generate _ _) _ = 0 sharedMem _ (Transform _ _ _ _) _ = 0 sharedMem _ (Replicate _ _ _) _ = 0 sharedMem _ (Slice _ _ _) _ = 0 sharedMem _ (Map _ _) _ = 0 sharedMem _ (ZipWith _ _ _) _ = 0 sharedMem _ (Permute _ _ _ _) _ = 0 sharedMem _ (Backpermute _ _ _) _ = 0 sharedMem _ (Stencil _ _ _) _ = 0 sharedMem _ (Stencil2 _ _ _ _ _) _ = 0 sharedMem _ (Fold _ x _) blockDim = sizeOf (delayedExpType x) * blockDim sharedMem _ (Scanl _ x _) blockDim = sizeOf (delayedExpType x) * blockDim sharedMem _ (Scanr _ x _) blockDim = sizeOf (delayedExpType x) * blockDim sharedMem _ (Scanl' _ x _) blockDim = sizeOf (delayedExpType x) * blockDim sharedMem _ (Scanr' _ x _) blockDim = sizeOf (delayedExpType x) * blockDim sharedMem _ (Fold1 _ a) blockDim = sizeOf (delayedAccType a) * blockDim sharedMem _ (Scanl1 _ a) blockDim = sizeOf (delayedAccType a) * blockDim sharedMem _ (Scanr1 _ a) blockDim = sizeOf (delayedAccType a) * blockDim sharedMem p (FoldSeg _ x _ _) blockDim = (blockDim `div` CUDA.warpSize p) * 8 + blockDim * sizeOf (delayedExpType x) -- TLM: why 8? I can't remember... sharedMem p (Fold1Seg _ a _) blockDim = (blockDim `div` CUDA.warpSize p) * 8 + blockDim * sizeOf (delayedAccType a)