{-# LANGUAGE CPP, GADTs, RankNTypes #-}
-- |
-- Module      : Data.Array.Accelerate.CUDA.Analysis.Launch
-- Copyright   : [2008..2011] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell
-- License     : BSD3
--
-- Maintainer  : Manuel M T Chakravarty
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.CUDA.Analysis.Launch (launchConfig)
  where

-- friends
import Data.Array.Accelerate.AST
import Data.Array.Accelerate.Type
import Data.Array.Accelerate.Array.Sugar                (Array(..), EltRepr)
import Data.Array.Accelerate.Analysis.Type              hiding (accType, expType)
import Data.Array.Accelerate.Analysis.Shape             hiding (accDim)

import Data.Array.Accelerate.CUDA.State
import Data.Array.Accelerate.CUDA.Compile               (ExecOpenAcc(..))

-- library
import Data.Label.PureM
import Control.Monad.IO.Class

import qualified Foreign.CUDA.Analysis                  as CUDA
import qualified Foreign.CUDA.Driver                    as CUDA
import qualified Foreign.Storable                       as F

#include "accelerate.h"


-- |Reify dimensionality of array computations
--
-- An 'ExecAcc' node defers to 'preAccDim', passing 'accDim' itself as the
-- first argument. An 'ExecAfun' node is not an array computation and is
-- reported as an internal error that names this function.
--
accDim :: ExecOpenAcc aenv (Array sh e) -> Int
accDim (ExecAcc _ _ _ acc) = preAccDim accDim acc
accDim (ExecAfun _ _)      = INTERNAL_ERROR(error) "accDim" "unexpected array function (ExecAfun)"


-- |Reify type of arrays and scalar expressions
--
-- 'accType' handles 'ExecAcc' nodes via 'preAccType' and reports 'ExecAfun'
-- as an internal error; 'expType' is 'preExpType' specialised to 'accType'.
--
accType :: ExecOpenAcc aenv (Array sh e) -> TupleType (EltRepr e)
accType (ExecAcc _ _ _ acc) = preAccType accType acc
accType (ExecAfun _ _)      = INTERNAL_ERROR(error) "accType" "unexpected array function (ExecAfun)"

expType :: PreOpenExp ExecOpenAcc aenv env t -> TupleType (EltRepr t)
expType = preExpType accType


-- |
-- Determine kernel launch parameters for the given array computation (as well
-- as compiled function module). This consists of the thread block size, number
-- of blocks, and dynamically allocated shared memory (bytes), respectively.
--
-- By default, this launches the kernel with the minimum block size that gives
-- maximum occupancy, and the grid size limited to the maximum number of
-- physically resident blocks.
-- Hence, kernels may need to process multiple
-- elements per thread.
--
-- The register count and the static shared memory usage are queried from the
-- compiled function; the device properties are read from the monad state.
--
launchConfig :: PreOpenAcc ExecOpenAcc aenv a -> Int -> CUDA.Fun -> CIO (Int, Int, Integer)
launchConfig acc n fn = do
  registers  <- liftIO $ CUDA.requires fn CUDA.NumRegs
  staticSmem <- liftIO $ CUDA.requires fn CUDA.SharedSizeBytes      -- static memory only
  device     <- gets deviceProps
  let -- dynamic shared memory (bytes) as a function of the thread block size
      dynamicSmem           = sharedMem device acc
      -- static plus dynamic usage, as handed to the block size search
      totalSmem threads     = staticSmem + dynamicSmem threads
      (threads, occupancy)  = blockSize device acc registers totalSmem
      -- multiprocessor count times active thread blocks from the occupancy result
      residentBlocks        = CUDA.multiProcessorCount device * CUDA.activeThreadBlocks occupancy
      blocks                = min residentBlocks (gridSize device acc n threads)
  return (threads, blocks, toInteger (dynamicSmem threads))


-- |
-- Determine the optimal thread block size for a given array computation. Fold
-- requires blocks with a power-of-two number of threads.
--
-- The arguments are the device properties, the array computation, the register
-- count (used as a constant function of block size) and the shared memory
-- usage as a function of block size.
--
blockSize :: CUDA.DeviceProperties -> PreOpenAcc ExecOpenAcc aenv a -> Int -> (Int -> Int) -> (Int, CUDA.Occupancy)
blockSize p acc r s = CUDA.optimalBlockSizeBy p candidates (const r) s
  where
    -- Fold and Fold1 search with CUDA.incPow2, everything else with CUDA.incWarp
    candidates = if isFold acc then CUDA.incPow2 else CUDA.incWarp

    isFold :: PreOpenAcc ExecOpenAcc aenv a -> Bool
    isFold (Fold _ _ _) = True
    isFold (Fold1 _ _)  = True
    isFold _            = False


-- |
-- Determine the number of blocks of the given size necessary to process the
-- given array expression. This should understand things like #elements per
-- thread for the various kernels.
--
-- foldSeg: 'size' is the number of segments, require one warp per segment
--
-- fold: when the input array has dimension one, 'size' is used as given;
-- otherwise it is scaled by the warp size, as for the segmented folds.
--
gridSize :: CUDA.DeviceProperties -> PreOpenAcc ExecOpenAcc aenv a -> Int -> Int -> Int
gridSize p acc size cta = split acc threads cta
  where
    threads
      | scaleByWarp acc = size * CUDA.warpSize p
      | otherwise       = size

    -- True for the computations whose size is multiplied by the warp size
    scaleByWarp :: PreOpenAcc ExecOpenAcc aenv a -> Bool
    scaleByWarp (FoldSeg _ _ _ _) = True
    scaleByWarp (Fold1Seg _ _ _)  = True
    scaleByWarp (Fold _ _ a)      = accDim a /= 1
    scaleByWarp (Fold1 _ a)       = accDim a /= 1
    scaleByWarp _                 = False

-- Divide 'size' first by the number of elements per thread (currently always
-- one) and then by the block size 'cta', rounding up each time and never
-- returning less than one.
--
split :: PreOpenAcc ExecOpenAcc aenv a -> Int -> Int -> Int
split acc size cta = chunks (chunks size (eltsPerThread acc)) cta
  where
    -- rounded-up quotient, clamped below at one
    chunks total per = max 1 ((per + total - 1) `div` per)
    eltsPerThread _  = 1


-- |
-- Analyse the given array expression, returning an estimate of dynamic shared
-- memory usage as a function of thread block size. This can be used by the
-- occupancy calculator to optimise kernel launch shape.
--
-- Non-computation forms are reported as internal errors. Folds and scans use
-- one element per thread of the block, sized by the array or seed element
-- type. Segmented folds add four Int32 values for every (blockDim `div`
-- warpSize). All remaining skeletons use no dynamic shared memory.
--
sharedMem :: CUDA.DeviceProperties -> PreOpenAcc ExecOpenAcc aenv a -> Int -> Int
sharedMem p acc blockDim =
  case acc of
    -- non-computation forms
    Let _ _             -> INTERNAL_ERROR(error) "sharedMem" "Let"
    Let2 _ _            -> INTERNAL_ERROR(error) "sharedMem" "Let2"
    PairArrays _ _      -> INTERNAL_ERROR(error) "sharedMem" "PairArrays"
    Avar _              -> INTERNAL_ERROR(error) "sharedMem" "Avar"
    Apply _ _           -> INTERNAL_ERROR(error) "sharedMem" "Apply"
    Acond _ _ _         -> INTERNAL_ERROR(error) "sharedMem" "Acond"
    Use _               -> INTERNAL_ERROR(error) "sharedMem" "Use"
    Unit _              -> INTERNAL_ERROR(error) "sharedMem" "Unit"
    Reshape _ _         -> INTERNAL_ERROR(error) "sharedMem" "Reshape"

    -- skeleton nodes without dynamic shared memory
    Generate _ _        -> 0
    Replicate _ _ _     -> 0
    Index _ _ _         -> 0
    Map _ _             -> 0
    ZipWith _ _ _       -> 0
    Permute _ _ _ _     -> 0
    Backpermute _ _ _   -> 0
    Stencil _ _ _       -> 0
    Stencil2 _ _ _ _ _  -> 0

    -- reductions and scans: one element per thread
    Fold _ _ a          -> perThread (sizeOf (accType a))
    Fold1 _ a           -> perThread (sizeOf (accType a))
    Scanl _ x _         -> perThread (sizeOf (expType x))
    Scanr _ x _         -> perThread (sizeOf (expType x))
    Scanl' _ x _        -> perThread (sizeOf (expType x))
    Scanr' _ x _        -> perThread (sizeOf (expType x))
    Scanl1 _ a          -> perThread (sizeOf (accType a))
    Scanr1 _ a          -> perThread (sizeOf (accType a))

    -- segmented reductions: additional Int32 values on top of the elements
    FoldSeg _ _ a _     -> segmented (sizeOf (accType a))
    Fold1Seg _ a _      -> segmented (sizeOf (accType a))
  where
    -- element size (bytes) times the number of threads in the block
    perThread bytes = bytes * blockDim

    -- four Int32 values per (blockDim `div` warpSize), plus one element per thread
    segmented bytes = (blockDim `div` CUDA.warpSize p) * 4 * F.sizeOf (undefined :: Int32)
                    + blockDim * bytes