{-# LANGUAGE GADTs           #-}
{-# LANGUAGE TemplateHaskell #-}

module Data.Array.Accelerate.CUDA.Analysis.Launch (

  launchConfig, determineOccupancy

) where
-- friends
import Data.Array.Accelerate.AST
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Trafo
import Data.Array.Accelerate.Analysis.Type
import Data.Array.Accelerate.Analysis.Shape

-- libraries
import qualified Foreign.CUDA.Analysis                  as CUDA
import qualified Foreign.CUDA.Driver                    as CUDA

-- |
-- Determine the kernel launch parameters for the given array computation and
-- occupancy information: the thread block size, the grid size as a function of
-- the problem size, and the dynamic shared memory requirement in bytes.
--
launchConfig
    :: DelayedOpenAcc aenv a
    -> CUDA.DeviceProperties            -- device being executed on
    -> CUDA.Occupancy                   -- kernel occupancy information
    -> ( Int                            -- thread block size
       , Int -> Int                     -- number of blocks for a given problem size
       , Int )                          -- dynamic shared memory (bytes)
launchConfig Delayed{}      _   _   = $internalError "launchConfig" "encountered delayed array"
launchConfig (Manifest acc) dev occ =
  let
      -- threads per block that achieve the given occupancy
      cta     = CUDA.activeThreads occ `div` CUDA.activeThreadBlocks occ
      -- limit the grid to the number of thread blocks that can be physically
      -- resident on the device at once
      maxGrid = CUDA.multiProcessorCount dev * CUDA.activeThreadBlocks occ
      smem    = sharedMem dev acc cta
  in
  (cta, \n -> maxGrid `min` gridSize dev acc n cta, smem)
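
-- A minimal usage sketch (not part of the original interface) composing the
-- two exported functions; the name 'exampleLaunchFor' is hypothetical, and
-- 'CUDA.maxThreadsPerBlock' is assumed from the Foreign.CUDA.Analysis device
-- properties record:
--
-- > exampleLaunchFor :: DelayedOpenAcc aenv a -> CUDA.DeviceProperties -> CUDA.Fun -> Int -> IO (Int, Int, Int)
-- > exampleLaunchFor acc dev fn n = do
-- >   occ <- determineOccupancy acc dev fn (CUDA.maxThreadsPerBlock dev)
-- >   let (cta, grid, smem) = launchConfig acc dev occ
-- >   return (cta, grid n, smem)    -- block size, blocks for n elements, dynamic smem (bytes)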

-- |
-- Determine the occupancy of a compiled kernel, given the register and static
-- shared memory usage reported by the CUDA driver together with the dynamic
-- shared memory required by the array computation.
--
determineOccupancy
    :: DelayedOpenAcc aenv a
    -> CUDA.DeviceProperties
    -> CUDA.Fun                         -- compiled kernel
    -> Int                              -- maximum thread block size for this kernel
    -> IO CUDA.Occupancy
determineOccupancy Delayed{}      _   _  _        = $internalError "determineOccupancy" "encountered delayed array"
determineOccupancy (Manifest acc) dev fn maxBlock = do
  registers   <- CUDA.requires fn CUDA.NumRegs           -- registers per thread
  static_smem <- CUDA.requires fn CUDA.SharedSizeBytes   -- statically allocated shared memory
  return . snd $ blockSize dev acc maxBlock registers (\threads -> static_smem + dynamic_smem threads)
  where
    dynamic_smem = sharedMem dev acc
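
-- A sketch (not part of the original module) of how the 'CUDA.Fun' handle
-- might be obtained, assuming the module-loading API of Foreign.CUDA.Driver;
-- the file and kernel names are illustrative only:
--
-- > mdl <- CUDA.loadFile "kernels.cubin"
-- > fn  <- CUDA.getFun mdl "foldAll"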

-- |
-- Choose the thread block size which maximises occupancy, considering only
-- block sizes appropriate to the skeleton: powers of two for reductions,
-- increasing multiples of the warp size for scans, and decreasing warp
-- multiples otherwise.
--
blockSize
    :: CUDA.DeviceProperties
    -> PreOpenAcc DelayedOpenAcc aenv a
    -> Int                              -- maximum number of threads per block
    -> Int                              -- registers used per thread
    -> (Int -> Int)                     -- shared memory (bytes) as a function of block size
    -> (Int, CUDA.Occupancy)
blockSize dev acc lim regs smem =
  CUDA.optimalBlockSizeBy dev (filter (<= lim) . strategy) (const regs) smem
  where
    strategy = case acc of
      Fold _ _ _    -> CUDA.incPow2
      Fold1 _ _     -> CUDA.incPow2
      Scanl _ _ _   -> CUDA.incWarp
      Scanl' _ _ _  -> CUDA.incWarp
      Scanl1 _ _    -> CUDA.incWarp
      Scanr _ _ _   -> CUDA.incWarp
      Scanr' _ _ _  -> CUDA.incWarp
      Scanr1 _ _    -> CUDA.incWarp
      _             -> CUDA.decWarp

-- |
-- Determine the number of thread blocks for the given problem size. Segmented
-- reductions scale the problem size by the warp size (one warp per segment);
-- reductions to a scalar split the input over multiple thread blocks, while
-- multidimensional reductions launch @max 1 size@ blocks directly.
--
gridSize :: CUDA.DeviceProperties -> PreOpenAcc DelayedOpenAcc aenv a -> Int -> Int -> Int
gridSize p acc@(FoldSeg _ _ _ _) size cta = split acc (size * CUDA.warpSize p) cta
gridSize p acc@(Fold1Seg _ _ _)  size cta = split acc (size * CUDA.warpSize p) cta
gridSize _ acc@(Fold _ _ _)      size cta = if preAccDim delayedDim acc == 0 then split acc size cta else max 1 size
gridSize _ acc@(Fold1 _ _)       size cta = if preAccDim delayedDim acc == 0 then split acc size cta else max 1 size
gridSize _ acc                   size cta = split acc size cta

-- Distribute the problem size over the available thread blocks, one element
-- per thread, rounding up.
--
split :: acc aenv a -> Int -> Int -> Int
split acc size cta = (size `between` eltsPerThread acc) `between` cta
  where
    between arr n   = 1 `max` ((arr + n - 1) `div` n)
    eltsPerThread _ = 1
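
-- A worked example of the rounding above: with one element per thread, a
-- problem size of 1,000,000 and a block size of 256 gives
-- (1000000 + 255) `div` 256 = 3907 thread blocks, which 'launchConfig' then
-- caps at 'maxGrid', the number of physically resident blocks.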

-- |
-- Estimate the dynamic shared memory (in bytes) required by the kernel, as a
-- function of the thread block size. This is used by the occupancy calculator
-- when optimising the launch configuration.
--
sharedMem :: CUDA.DeviceProperties -> PreOpenAcc DelayedOpenAcc aenv a -> Int -> Int
-- non-computation forms
sharedMem _ Alet{} _ = $internalError "sharedMem" "Let"
sharedMem _ Avar{} _ = $internalError "sharedMem" "Avar"
sharedMem _ Apply{} _ = $internalError "sharedMem" "Apply"
sharedMem _ Acond{} _ = $internalError "sharedMem" "Acond"
sharedMem _ Awhile{} _ = $internalError "sharedMem" "Awhile"
sharedMem _ Atuple{} _ = $internalError "sharedMem" "Atuple"
sharedMem _ Aprj{} _ = $internalError "sharedMem" "Aprj"
sharedMem _ Use{} _ = $internalError "sharedMem" "Use"
sharedMem _ Unit{} _ = $internalError "sharedMem" "Unit"
sharedMem _ Reshape{} _ = $internalError "sharedMem" "Reshape"
sharedMem _ Aforeign{} _ = $internalError "sharedMem" "Aforeign"
-- skeletons that do not use dynamic shared memory
sharedMem _ Generate{} _ = 0
sharedMem _ Transform{} _ = 0
sharedMem _ Replicate{} _ = 0
sharedMem _ Slice{} _ = 0
sharedMem _ Map{} _ = 0
sharedMem _ ZipWith{} _ = 0
sharedMem _ Permute{} _ = 0
sharedMem _ Backpermute{} _ = 0
sharedMem _ Stencil{} _ = 0
sharedMem _ Stencil2{} _ = 0
-- reduction and scan kernels stage one element per thread in shared memory
sharedMem _ (Fold _ x _) blockDim = sizeOf (delayedExpType x) * blockDim
sharedMem _ (Scanl _ x _) blockDim = sizeOf (delayedExpType x) * blockDim
sharedMem _ (Scanr _ x _) blockDim = sizeOf (delayedExpType x) * blockDim
sharedMem _ (Scanl' _ x _) blockDim = sizeOf (delayedExpType x) * blockDim
sharedMem _ (Scanr' _ x _) blockDim = sizeOf (delayedExpType x) * blockDim
sharedMem _ (Fold1 _ a) blockDim = sizeOf (delayedAccType a) * blockDim
sharedMem _ (Scanl1 _ a) blockDim = sizeOf (delayedAccType a) * blockDim
sharedMem _ (Scanr1 _ a) blockDim = sizeOf (delayedAccType a) * blockDim
-- segmented reductions additionally require 8 bytes of shared memory per warp
sharedMem p (FoldSeg _ x _ _) blockDim =
  (blockDim `div` CUDA.warpSize p) * 8 + blockDim * sizeOf (delayedExpType x)
sharedMem p (Fold1Seg _ a _) blockDim =
  (blockDim `div` CUDA.warpSize p) * 8 + blockDim * sizeOf (delayedAccType a)
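
-- A worked example of the segmented-reduction estimate: a block of 128 threads
-- on a device with warp size 32, reducing 4-byte elements, needs
-- (128 `div` 32) * 8 + 128 * 4 = 544 bytes of dynamic shared memory per block.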