{-# LANGUAGE BangPatterns #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Analysis.Occupancy
-- Copyright : [2009..2023] Trevor L. McDonell
-- License   : BSD
--
-- Occupancy calculations for CUDA kernels
--
-- <http://developer.download.nvidia.com/compute/cuda/3_0/sdk/docs/CUDA_Occupancy_calculator.xls>
--
-- /Determining Registers Per Thread and Shared Memory Per Block/
--
-- To determine the number of registers used per thread in your kernel, simply
-- compile the kernel code using the option
--
-- > --ptxas-options=-v
--
-- to nvcc.  This will output information about register, local memory, shared
-- memory, and constant memory usage for each kernel in the @.cu@ file.
-- Alternatively, you can compile with the @-cubin@ option to nvcc.  This will
-- generate a @.cubin@ file, which you can open in a text editor.  Look for the
-- @code@ section with your kernel's name.  Within the curly braces (@{ ... }@)
-- for that code block, you will see a line with @reg = X@, where @x@ is the
-- number of registers used by your kernel.  You can also see the amount of
-- shared memory used as @smem = Y@.  However, if your kernel declares any
-- external shared memory that is allocated dynamically, you will need to add
-- the number in the @.cubin@ file to the amount you dynamically allocate at run
-- time to get the correct shared memory usage.
--
-- /Notes About Occupancy/
--
-- Higher occupancy does not necessarily mean higher performance.  If a kernel
-- is not bandwidth bound, then increasing occupancy will not necessarily
-- increase performance.  If a kernel invocation is already running at least one
-- thread block per multiprocessor in the GPU, and it is bottlenecked by
-- computation and not by global memory accesses, then increasing occupancy may
-- have no effect.  In fact, making changes just to increase occupancy can have
-- other effects, such as additional instructions, spills to local memory (which
-- is off chip), divergent branches, etc.  As with any optimization, you should
-- experiment to see how changes affect the *wall clock time* of the kernel
-- execution.  For bandwidth bound applications, on the other hand, increasing
-- occupancy can help better hide the latency of memory accesses, and therefore
-- improve performance.
--
--------------------------------------------------------------------------------


module Foreign.CUDA.Analysis.Occupancy (

    Occupancy(..),
    occupancy, optimalBlockSize, optimalBlockSizeOf, maxResidentBlocks,
    incPow2, incWarp, decPow2, decWarp

) where

import Data.Ord
import Data.List

import Foreign.CUDA.Analysis.Device                                 hiding ( maxSharedMemPerBlock )


-- GPU Occupancy per multiprocessor
--
data Occupancy = Occupancy
  {
    Occupancy -> Int
activeThreads      :: !Int,         -- ^ Active threads per multiprocessor
    Occupancy -> Int
activeThreadBlocks :: !Int,         -- ^ Active thread blocks per multiprocessor
    Occupancy -> Int
activeWarps        :: !Int,         -- ^ Active warps per multiprocessor
    Occupancy -> Double
occupancy100       :: !Double       -- ^ Occupancy of each multiprocessor (percent)
  }
  deriving (Occupancy -> Occupancy -> Bool
(Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Bool) -> Eq Occupancy
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Occupancy -> Occupancy -> Bool
== :: Occupancy -> Occupancy -> Bool
$c/= :: Occupancy -> Occupancy -> Bool
/= :: Occupancy -> Occupancy -> Bool
Eq, Eq Occupancy
Eq Occupancy =>
(Occupancy -> Occupancy -> Ordering)
-> (Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Occupancy)
-> (Occupancy -> Occupancy -> Occupancy)
-> Ord Occupancy
Occupancy -> Occupancy -> Bool
Occupancy -> Occupancy -> Ordering
Occupancy -> Occupancy -> Occupancy
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Occupancy -> Occupancy -> Ordering
compare :: Occupancy -> Occupancy -> Ordering
$c< :: Occupancy -> Occupancy -> Bool
< :: Occupancy -> Occupancy -> Bool
$c<= :: Occupancy -> Occupancy -> Bool
<= :: Occupancy -> Occupancy -> Bool
$c> :: Occupancy -> Occupancy -> Bool
> :: Occupancy -> Occupancy -> Bool
$c>= :: Occupancy -> Occupancy -> Bool
>= :: Occupancy -> Occupancy -> Bool
$cmax :: Occupancy -> Occupancy -> Occupancy
max :: Occupancy -> Occupancy -> Occupancy
$cmin :: Occupancy -> Occupancy -> Occupancy
min :: Occupancy -> Occupancy -> Occupancy
Ord, Int -> Occupancy -> ShowS
[Occupancy] -> ShowS
Occupancy -> String
(Int -> Occupancy -> ShowS)
-> (Occupancy -> String)
-> ([Occupancy] -> ShowS)
-> Show Occupancy
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Occupancy -> ShowS
showsPrec :: Int -> Occupancy -> ShowS
$cshow :: Occupancy -> String
show :: Occupancy -> String
$cshowList :: [Occupancy] -> ShowS
showList :: [Occupancy] -> ShowS
Show)


-- |
-- Calculate occupancy data for a given GPU and kernel resource usage
--
{-# INLINEABLE occupancy #-}
occupancy
    :: DeviceProperties -- ^ Properties of the card in question
    -> Int              -- ^ Threads per block
    -> Int              -- ^ Registers per thread
    -> Int              -- ^ Shared memory per block (bytes)
    -> Occupancy
occupancy :: DeviceProperties -> Int -> Int -> Int -> Occupancy
occupancy !DeviceProperties
dev !Int
threads !Int
regs !Int
smem
  = Int -> Int -> Int -> Double -> Occupancy
Occupancy Int
at Int
ab Int
aw Double
oc
  where
    at :: Int
at = Int
ab Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
threads
    aw :: Int
aw = Int
ab Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
limitWarpsPerBlock
    ab :: Int
ab = [Int] -> Int
forall a. Ord a => [a] -> a
forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
minimum [Int
limitDueToWarps, Int
limitDueToRegs, Int
limitDueToSMem]
    oc :: Double
oc = Double
100 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
aw Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (DeviceResources -> Int
warpsPerMP DeviceResources
gpu)

    ceiling' :: Double -> Int
ceiling'      = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
ceiling :: Double -> Int
    floor' :: Double -> Int
floor'        = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
floor   :: Double -> Int
    ceilingBy :: a -> Int -> Int
ceilingBy a
x Int
s = Int
s Int -> Int -> Int
forall a. Num a => a -> a -> a
* Double -> Int
ceiling' (a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
s)
    floorBy :: a -> Int -> Int
floorBy a
x Int
s   = Int
s Int -> Int -> Int
forall a. Num a => a -> a -> a
* Double -> Int
floor' (a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
s)

    -- Physical resources
    --
    gpu :: DeviceResources
gpu                  = DeviceProperties -> DeviceResources
deviceResources DeviceProperties
dev
    maxSharedMemPerBlock :: Int
maxSharedMemPerBlock = Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (DeviceProperties -> Int64
sharedMemPerBlock DeviceProperties
dev) -- 2080Ti reports 48KB, but should be 64KB !

    -- Allocated resource limits
    --
    limitWarpsPerBlock :: Int
limitWarpsPerBlock  = Double -> Int
ceiling' (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
threads Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (DeviceResources -> Int
threadsPerWarp DeviceResources
gpu))
    -- limitWarpsPerSM     = warpsPerMP gpu

    limitRegsPerBlock :: Int
limitRegsPerBlock   = case DeviceResources -> Allocation
regAllocationStyle DeviceResources
gpu of
                            Allocation
Block -> ((Int
limitWarpsPerBlock Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`ceilingBy` DeviceResources -> Int
warpAllocUnit DeviceResources
gpu) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
regs Int -> Int -> Int
forall a. Num a => a -> a -> a
* DeviceResources -> Int
threadsPerWarp DeviceResources
gpu) Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`ceilingBy` DeviceResources -> Int
regAllocUnit DeviceResources
gpu
                            Allocation
Warp  -> Int
limitWarpsPerBlock
    limitRegsPerSM :: Int
limitRegsPerSM      = case DeviceResources -> Allocation
regAllocationStyle DeviceResources
gpu of
                            Allocation
Block -> DeviceResources -> Int
maxRegPerBlock DeviceResources
gpu
                            Allocation
Warp  -> (DeviceResources -> Int
maxRegPerBlock DeviceResources
gpu Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` ((Int
regs Int -> Int -> Int
forall a. Num a => a -> a -> a
* DeviceResources -> Int
threadsPerWarp DeviceResources
gpu) Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`ceilingBy` DeviceResources -> Int
warpRegAllocUnit DeviceResources
gpu)) Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`floorBy` DeviceResources -> Int
warpAllocUnit DeviceResources
gpu

    limitSMemPerBlock :: Int
limitSMemPerBlock   = Int
smem Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`ceilingBy` DeviceResources -> Int
sharedMemAllocUnit DeviceResources
gpu
    -- limitSMemPerSM      = maxSharedMemPerBlock gpu

    -- Maximum thread blocks per multiprocessor
    --
    limitDueToWarps :: Int
limitDueToWarps                 = DeviceResources -> Int
threadBlocksPerMP DeviceResources
gpu Int -> Int -> Int
forall a. Ord a => a -> a -> a
`min` (DeviceResources -> Int
warpsPerMP DeviceResources
gpu Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
limitWarpsPerBlock)
    limitDueToRegs :: Int
limitDueToRegs
      | Int
regs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> DeviceResources -> Int
maxRegPerThread DeviceResources
gpu  = Int
0
      | Int
regs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0                    = (Int
limitRegsPerSM Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
limitRegsPerBlock) Int -> Int -> Int
forall a. Num a => a -> a -> a
* (DeviceResources -> Int
regFileSizePerMP DeviceResources
gpu Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` DeviceResources -> Int
maxRegPerBlock DeviceResources
gpu)
      | Bool
otherwise                   = DeviceResources -> Int
threadBlocksPerMP DeviceResources
gpu

    limitDueToSMem :: Int
limitDueToSMem
      | Int
smem Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
maxSharedMemPerBlock = Int
0
      | Int
smem Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0                    = DeviceResources -> Int
sharedMemPerMP DeviceResources
gpu Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
limitSMemPerBlock
      | Bool
otherwise                   = DeviceResources -> Int
threadBlocksPerMP DeviceResources
gpu


-- |
-- Optimise multiprocessor occupancy as a function of thread block size and
-- resource usage. This returns the smallest satisfying block size in increments
-- of a single warp.
--
{-# INLINEABLE optimalBlockSize #-}
optimalBlockSize
    :: DeviceProperties         -- ^ Architecture to optimise for
    -> (Int -> Int)             -- ^ Register count as a function of thread block size
    -> (Int -> Int)             -- ^ Shared memory usage (bytes) as a function of thread block size
    -> (Int, Occupancy)
optimalBlockSize :: DeviceProperties
-> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)
optimalBlockSize DeviceProperties
dev = DeviceProperties
-> [Int] -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)
optimalBlockSizeOf DeviceProperties
dev (DeviceProperties -> [Int]
decWarp DeviceProperties
dev)


-- |
-- As 'optimalBlockSize', but with a generator that produces the specific thread
-- block sizes that should be tested. The generated list can produce values in
-- any order, but the last satisfying block size will be returned. Hence, values
-- should be monotonically decreasing to return the smallest block size yielding
-- maximum occupancy, and vice-versa.
--
{-# INLINEABLE optimalBlockSizeOf #-}
optimalBlockSizeOf
    :: DeviceProperties         -- ^ Architecture to optimise for
    -> [Int]                    -- ^ Thread block sizes to consider
    -> (Int -> Int)             -- ^ Register count as a function of thread block size
    -> (Int -> Int)             -- ^ Shared memory usage (bytes) as a function of thread block size
    -> (Int, Occupancy)
optimalBlockSizeOf :: DeviceProperties
-> [Int] -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)
optimalBlockSizeOf !DeviceProperties
dev ![Int]
threads !Int -> Int
freg !Int -> Int
fsmem
  = ((Int, Occupancy) -> (Int, Occupancy) -> Ordering)
-> [(Int, Occupancy)] -> (Int, Occupancy)
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy (((Int, Occupancy) -> Double)
-> (Int, Occupancy) -> (Int, Occupancy) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Occupancy -> Double
occupancy100 (Occupancy -> Double)
-> ((Int, Occupancy) -> Occupancy) -> (Int, Occupancy) -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int, Occupancy) -> Occupancy
forall a b. (a, b) -> b
snd))
  ([(Int, Occupancy)] -> (Int, Occupancy))
-> [(Int, Occupancy)] -> (Int, Occupancy)
forall a b. (a -> b) -> a -> b
$ [ (Int
t, DeviceProperties -> Int -> Int -> Int -> Occupancy
occupancy DeviceProperties
dev Int
t (Int -> Int
freg Int
t) (Int -> Int
fsmem Int
t)) | Int
t <- [Int]
threads ]


-- | Increments in powers-of-two, over the range of supported thread block sizes
-- for the given device.
--
{-# INLINEABLE incPow2 #-}
incPow2 :: DeviceProperties -> [Int]
incPow2 :: DeviceProperties -> [Int]
incPow2 !DeviceProperties
dev = (Int -> Int) -> [Int] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map ((Int
2::Int)Int -> Int -> Int
forall a b. (Num a, Integral b) => a -> b -> a
^) [Int
lb, Int
lbInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1 .. Int
ub]
  where
    round' :: Double -> Int
round' = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round :: Double -> Int
    lb :: Int
lb     = Double -> Int
round' (Double -> Int) -> (Int -> Double) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Double -> Double
forall a. Floating a => a -> a -> a
logBase Double
2 (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ DeviceProperties -> Int
warpSize DeviceProperties
dev
    ub :: Int
ub     = Double -> Int
round' (Double -> Int) -> (Int -> Double) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Double -> Double
forall a. Floating a => a -> a -> a
logBase Double
2 (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ DeviceProperties -> Int
maxThreadsPerBlock DeviceProperties
dev

-- | Decrements in powers-of-two, over the range of supported thread block sizes
-- for the given device.
--
{-# INLINEABLE decPow2 #-}
decPow2 :: DeviceProperties -> [Int]
decPow2 :: DeviceProperties -> [Int]
decPow2 !DeviceProperties
dev = (Int -> Int) -> [Int] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map ((Int
2::Int)Int -> Int -> Int
forall a b. (Num a, Integral b) => a -> b -> a
^) [Int
ub, Int
ubInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1 .. Int
lb]
  where
    round' :: Double -> Int
round' = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round :: Double -> Int
    lb :: Int
lb     = Double -> Int
round' (Double -> Int) -> (Int -> Double) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Double -> Double
forall a. Floating a => a -> a -> a
logBase Double
2 (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ DeviceProperties -> Int
warpSize DeviceProperties
dev
    ub :: Int
ub     = Double -> Int
round' (Double -> Int) -> (Int -> Double) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Double -> Double
forall a. Floating a => a -> a -> a
logBase Double
2 (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ DeviceProperties -> Int
maxThreadsPerBlock DeviceProperties
dev

-- | Decrements in the warp size of the device, over the range of supported
-- thread block sizes.
--
{-# INLINEABLE decWarp #-}
decWarp :: DeviceProperties -> [Int]
decWarp :: DeviceProperties -> [Int]
decWarp !DeviceProperties
dev = [Int
block, Int
blockInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
warp .. Int
warp]
  where
    !warp :: Int
warp  = DeviceProperties -> Int
warpSize DeviceProperties
dev
    !block :: Int
block = DeviceProperties -> Int
maxThreadsPerBlock DeviceProperties
dev

-- | Increments in the warp size of the device, over the range of supported
-- thread block sizes.
--
{-# INLINEABLE incWarp #-}
incWarp :: DeviceProperties -> [Int]
incWarp :: DeviceProperties -> [Int]
incWarp !DeviceProperties
dev = [Int
warp, Int
2Int -> Int -> Int
forall a. Num a => a -> a -> a
*Int
warp .. Int
block]
  where
    warp :: Int
warp  = DeviceProperties -> Int
warpSize DeviceProperties
dev
    block :: Int
block = DeviceProperties -> Int
maxThreadsPerBlock DeviceProperties
dev


-- |
-- Determine the maximum number of CTAs that can be run simultaneously for a
-- given kernel / device combination.
--
{-# INLINEABLE maxResidentBlocks #-}
maxResidentBlocks
  :: DeviceProperties   -- ^ Properties of the card in question
  -> Int                -- ^ Threads per block
  -> Int                -- ^ Registers per thread
  -> Int                -- ^ Shared memory per block (bytes)
  -> Int                -- ^ Maximum number of resident blocks
maxResidentBlocks :: DeviceProperties -> Int -> Int -> Int -> Int
maxResidentBlocks !DeviceProperties
dev !Int
thds !Int
regs !Int
smem =
  DeviceProperties -> Int
multiProcessorCount DeviceProperties
dev Int -> Int -> Int
forall a. Num a => a -> a -> a
* Occupancy -> Int
activeThreadBlocks (DeviceProperties -> Int -> Int -> Int -> Occupancy
occupancy DeviceProperties
dev Int
thds Int
regs Int
smem)