{-# LANGUAGE BangPatterns #-}
module Foreign.CUDA.Analysis.Occupancy (
Occupancy(..),
occupancy, optimalBlockSize, optimalBlockSizeOf, maxResidentBlocks,
incPow2, incWarp, decPow2, decWarp
) where
import Data.Ord
import Data.List
import Foreign.CUDA.Analysis.Device hiding ( maxSharedMemPerBlock )
data Occupancy = Occupancy
{
Occupancy -> Int
activeThreads :: !Int,
Occupancy -> Int
activeThreadBlocks :: !Int,
Occupancy -> Int
activeWarps :: !Int,
Occupancy -> Double
occupancy100 :: !Double
}
deriving (Occupancy -> Occupancy -> Bool
(Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Bool) -> Eq Occupancy
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: Occupancy -> Occupancy -> Bool
== :: Occupancy -> Occupancy -> Bool
$c/= :: Occupancy -> Occupancy -> Bool
/= :: Occupancy -> Occupancy -> Bool
Eq, Eq Occupancy
Eq Occupancy =>
(Occupancy -> Occupancy -> Ordering)
-> (Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Bool)
-> (Occupancy -> Occupancy -> Occupancy)
-> (Occupancy -> Occupancy -> Occupancy)
-> Ord Occupancy
Occupancy -> Occupancy -> Bool
Occupancy -> Occupancy -> Ordering
Occupancy -> Occupancy -> Occupancy
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
$ccompare :: Occupancy -> Occupancy -> Ordering
compare :: Occupancy -> Occupancy -> Ordering
$c< :: Occupancy -> Occupancy -> Bool
< :: Occupancy -> Occupancy -> Bool
$c<= :: Occupancy -> Occupancy -> Bool
<= :: Occupancy -> Occupancy -> Bool
$c> :: Occupancy -> Occupancy -> Bool
> :: Occupancy -> Occupancy -> Bool
$c>= :: Occupancy -> Occupancy -> Bool
>= :: Occupancy -> Occupancy -> Bool
$cmax :: Occupancy -> Occupancy -> Occupancy
max :: Occupancy -> Occupancy -> Occupancy
$cmin :: Occupancy -> Occupancy -> Occupancy
min :: Occupancy -> Occupancy -> Occupancy
Ord, Int -> Occupancy -> ShowS
[Occupancy] -> ShowS
Occupancy -> String
(Int -> Occupancy -> ShowS)
-> (Occupancy -> String)
-> ([Occupancy] -> ShowS)
-> Show Occupancy
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Occupancy -> ShowS
showsPrec :: Int -> Occupancy -> ShowS
$cshow :: Occupancy -> String
show :: Occupancy -> String
$cshowList :: [Occupancy] -> ShowS
showList :: [Occupancy] -> ShowS
Show)
{-# INLINEABLE occupancy #-}
occupancy
:: DeviceProperties
-> Int
-> Int
-> Int
-> Occupancy
occupancy :: DeviceProperties -> Int -> Int -> Int -> Occupancy
occupancy !DeviceProperties
dev !Int
threads !Int
regs !Int
smem
= Int -> Int -> Int -> Double -> Occupancy
Occupancy Int
at Int
ab Int
aw Double
oc
where
at :: Int
at = Int
ab Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
threads
aw :: Int
aw = Int
ab Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
limitWarpsPerBlock
ab :: Int
ab = [Int] -> Int
forall a. Ord a => [a] -> a
forall (t :: * -> *) a. (Foldable t, Ord a) => t a -> a
minimum [Int
limitDueToWarps, Int
limitDueToRegs, Int
limitDueToSMem]
oc :: Double
oc = Double
100 Double -> Double -> Double
forall a. Num a => a -> a -> a
* Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
aw Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (DeviceResources -> Int
warpsPerMP DeviceResources
gpu)
ceiling' :: Double -> Int
ceiling' = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
ceiling :: Double -> Int
floor' :: Double -> Int
floor' = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
floor :: Double -> Int
ceilingBy :: a -> Int -> Int
ceilingBy a
x Int
s = Int
s Int -> Int -> Int
forall a. Num a => a -> a -> a
* Double -> Int
ceiling' (a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
s)
floorBy :: a -> Int -> Int
floorBy a
x Int
s = Int
s Int -> Int -> Int
forall a. Num a => a -> a -> a
* Double -> Int
floor' (a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
s)
gpu :: DeviceResources
gpu = DeviceProperties -> DeviceResources
deviceResources DeviceProperties
dev
maxSharedMemPerBlock :: Int
maxSharedMemPerBlock = Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (DeviceProperties -> Int64
sharedMemPerBlock DeviceProperties
dev)
limitWarpsPerBlock :: Int
limitWarpsPerBlock = Double -> Int
ceiling' (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
threads Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (DeviceResources -> Int
threadsPerWarp DeviceResources
gpu))
limitRegsPerBlock :: Int
limitRegsPerBlock = case DeviceResources -> Allocation
regAllocationStyle DeviceResources
gpu of
Allocation
Block -> ((Int
limitWarpsPerBlock Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`ceilingBy` DeviceResources -> Int
warpAllocUnit DeviceResources
gpu) Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
regs Int -> Int -> Int
forall a. Num a => a -> a -> a
* DeviceResources -> Int
threadsPerWarp DeviceResources
gpu) Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`ceilingBy` DeviceResources -> Int
regAllocUnit DeviceResources
gpu
Allocation
Warp -> Int
limitWarpsPerBlock
limitRegsPerSM :: Int
limitRegsPerSM = case DeviceResources -> Allocation
regAllocationStyle DeviceResources
gpu of
Allocation
Block -> DeviceResources -> Int
maxRegPerBlock DeviceResources
gpu
Allocation
Warp -> (DeviceResources -> Int
maxRegPerBlock DeviceResources
gpu Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` ((Int
regs Int -> Int -> Int
forall a. Num a => a -> a -> a
* DeviceResources -> Int
threadsPerWarp DeviceResources
gpu) Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`ceilingBy` DeviceResources -> Int
warpRegAllocUnit DeviceResources
gpu)) Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`floorBy` DeviceResources -> Int
warpAllocUnit DeviceResources
gpu
limitSMemPerBlock :: Int
limitSMemPerBlock = Int
smem Int -> Int -> Int
forall {a}. Integral a => a -> Int -> Int
`ceilingBy` DeviceResources -> Int
sharedMemAllocUnit DeviceResources
gpu
limitDueToWarps :: Int
limitDueToWarps = DeviceResources -> Int
threadBlocksPerMP DeviceResources
gpu Int -> Int -> Int
forall a. Ord a => a -> a -> a
`min` (DeviceResources -> Int
warpsPerMP DeviceResources
gpu Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
limitWarpsPerBlock)
limitDueToRegs :: Int
limitDueToRegs
| Int
regs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> DeviceResources -> Int
maxRegPerThread DeviceResources
gpu = Int
0
| Int
regs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 = (Int
limitRegsPerSM Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
limitRegsPerBlock) Int -> Int -> Int
forall a. Num a => a -> a -> a
* (DeviceResources -> Int
regFileSizePerMP DeviceResources
gpu Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` DeviceResources -> Int
maxRegPerBlock DeviceResources
gpu)
| Bool
otherwise = DeviceResources -> Int
threadBlocksPerMP DeviceResources
gpu
limitDueToSMem :: Int
limitDueToSMem
| Int
smem Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
maxSharedMemPerBlock = Int
0
| Int
smem Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 = DeviceResources -> Int
sharedMemPerMP DeviceResources
gpu Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
limitSMemPerBlock
| Bool
otherwise = DeviceResources -> Int
threadBlocksPerMP DeviceResources
gpu
{-# INLINEABLE optimalBlockSize #-}
optimalBlockSize
:: DeviceProperties
-> (Int -> Int)
-> (Int -> Int)
-> (Int, Occupancy)
optimalBlockSize :: DeviceProperties
-> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)
optimalBlockSize DeviceProperties
dev = DeviceProperties
-> [Int] -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)
optimalBlockSizeOf DeviceProperties
dev (DeviceProperties -> [Int]
decWarp DeviceProperties
dev)
{-# INLINEABLE optimalBlockSizeOf #-}
optimalBlockSizeOf
:: DeviceProperties
-> [Int]
-> (Int -> Int)
-> (Int -> Int)
-> (Int, Occupancy)
optimalBlockSizeOf :: DeviceProperties
-> [Int] -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)
optimalBlockSizeOf !DeviceProperties
dev ![Int]
threads !Int -> Int
freg !Int -> Int
fsmem
= ((Int, Occupancy) -> (Int, Occupancy) -> Ordering)
-> [(Int, Occupancy)] -> (Int, Occupancy)
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy (((Int, Occupancy) -> Double)
-> (Int, Occupancy) -> (Int, Occupancy) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Occupancy -> Double
occupancy100 (Occupancy -> Double)
-> ((Int, Occupancy) -> Occupancy) -> (Int, Occupancy) -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Int, Occupancy) -> Occupancy
forall a b. (a, b) -> b
snd))
([(Int, Occupancy)] -> (Int, Occupancy))
-> [(Int, Occupancy)] -> (Int, Occupancy)
forall a b. (a -> b) -> a -> b
$ [ (Int
t, DeviceProperties -> Int -> Int -> Int -> Occupancy
occupancy DeviceProperties
dev Int
t (Int -> Int
freg Int
t) (Int -> Int
fsmem Int
t)) | Int
t <- [Int]
threads ]
{-# INLINEABLE incPow2 #-}
incPow2 :: DeviceProperties -> [Int]
incPow2 :: DeviceProperties -> [Int]
incPow2 !DeviceProperties
dev = (Int -> Int) -> [Int] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map ((Int
2::Int)Int -> Int -> Int
forall a b. (Num a, Integral b) => a -> b -> a
^) [Int
lb, Int
lbInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
1 .. Int
ub]
where
round' :: Double -> Int
round' = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round :: Double -> Int
lb :: Int
lb = Double -> Int
round' (Double -> Int) -> (Int -> Double) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Double -> Double
forall a. Floating a => a -> a -> a
logBase Double
2 (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ DeviceProperties -> Int
warpSize DeviceProperties
dev
ub :: Int
ub = Double -> Int
round' (Double -> Int) -> (Int -> Double) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Double -> Double
forall a. Floating a => a -> a -> a
logBase Double
2 (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ DeviceProperties -> Int
maxThreadsPerBlock DeviceProperties
dev
{-# INLINEABLE decPow2 #-}
decPow2 :: DeviceProperties -> [Int]
decPow2 :: DeviceProperties -> [Int]
decPow2 !DeviceProperties
dev = (Int -> Int) -> [Int] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map ((Int
2::Int)Int -> Int -> Int
forall a b. (Num a, Integral b) => a -> b -> a
^) [Int
ub, Int
ubInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1 .. Int
lb]
where
round' :: Double -> Int
round' = Double -> Int
forall b. Integral b => Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round :: Double -> Int
lb :: Int
lb = Double -> Int
round' (Double -> Int) -> (Int -> Double) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Double -> Double
forall a. Floating a => a -> a -> a
logBase Double
2 (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ DeviceProperties -> Int
warpSize DeviceProperties
dev
ub :: Int
ub = Double -> Int
round' (Double -> Int) -> (Int -> Double) -> Int -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Double -> Double -> Double
forall a. Floating a => a -> a -> a
logBase Double
2 (Double -> Double) -> (Int -> Double) -> Int -> Double
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Int) -> Int -> Int
forall a b. (a -> b) -> a -> b
$ DeviceProperties -> Int
maxThreadsPerBlock DeviceProperties
dev
{-# INLINEABLE decWarp #-}
decWarp :: DeviceProperties -> [Int]
decWarp :: DeviceProperties -> [Int]
decWarp !DeviceProperties
dev = [Int
block, Int
blockInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
warp .. Int
warp]
where
!warp :: Int
warp = DeviceProperties -> Int
warpSize DeviceProperties
dev
!block :: Int
block = DeviceProperties -> Int
maxThreadsPerBlock DeviceProperties
dev
{-# INLINEABLE incWarp #-}
incWarp :: DeviceProperties -> [Int]
incWarp :: DeviceProperties -> [Int]
incWarp !DeviceProperties
dev = [Int
warp, Int
2Int -> Int -> Int
forall a. Num a => a -> a -> a
*Int
warp .. Int
block]
where
warp :: Int
warp = DeviceProperties -> Int
warpSize DeviceProperties
dev
block :: Int
block = DeviceProperties -> Int
maxThreadsPerBlock DeviceProperties
dev
{-# INLINEABLE maxResidentBlocks #-}
maxResidentBlocks
:: DeviceProperties
-> Int
-> Int
-> Int
-> Int
maxResidentBlocks :: DeviceProperties -> Int -> Int -> Int -> Int
maxResidentBlocks !DeviceProperties
dev !Int
thds !Int
regs !Int
smem =
DeviceProperties -> Int
multiProcessorCount DeviceProperties
dev Int -> Int -> Int
forall a. Num a => a -> a -> a
* Occupancy -> Int
activeThreadBlocks (DeviceProperties -> Int -> Int -> Int -> Occupancy
occupancy DeviceProperties
dev Int
thds Int
regs Int
smem)