-- GENERATED by C->Haskell Compiler, version 0.28.8 Switcheroo, 25 November 2017 (Haskell)
-- Edit the ORIGNAL .chs file instead!


{-# LINE 1 "src/Foreign/CUDA/Analysis/Device.chs" #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Analysis.Device
-- Copyright : [2009..2023] Trevor L. McDonell
-- License   : BSD
--
-- Common device functions
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Analysis.Device (

    Compute(..), ComputeMode(..),
    DeviceProperties(..), DeviceResources(..), Allocation(..), PCI(..),
    deviceResources,
    describe

) where





import Data.Int
import Text.Show.Describe

import Debug.Trace


-- |
-- The compute mode the device is currently in
--
data ComputeMode = Default
                 | Prohibited
                 | ExclusiveProcess
  deriving (ComputeMode -> ComputeMode -> Bool
(ComputeMode -> ComputeMode -> Bool)
-> (ComputeMode -> ComputeMode -> Bool) -> Eq ComputeMode
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: ComputeMode -> ComputeMode -> Bool
== :: ComputeMode -> ComputeMode -> Bool
$c/= :: ComputeMode -> ComputeMode -> Bool
/= :: ComputeMode -> ComputeMode -> Bool
Eq,Show)
instance Enum ComputeMode where
  succ Default = Prohibited
  succ Prohibited = ExclusiveProcess
  succ ExclusiveProcess = error "ComputeMode.succ: ExclusiveProcess has no successor"

  pred :: ComputeMode -> ComputeMode
pred ComputeMode
Prohibited = ComputeMode
Default
  pred ComputeMode
ExclusiveProcess = ComputeMode
Prohibited
  pred ComputeMode
Default = String -> ComputeMode
forall a. HasCallStack => String -> a
error String
"ComputeMode.pred: Default has no predecessor"

  enumFromTo :: ComputeMode -> ComputeMode -> [ComputeMode]
enumFromTo ComputeMode
from ComputeMode
to = ComputeMode -> [ComputeMode]
forall {t}. Enum t => t -> [t]
go ComputeMode
from
    where
      end :: Int
end = ComputeMode -> Int
forall a. Enum a => a -> Int
fromEnum ComputeMode
to
      go :: t -> [t]
go t
v = case Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (t -> Int
forall a. Enum a => a -> Int
fromEnum t
v) Int
end of
                 Ordering
LT -> t
v t -> [t] -> [t]
forall a. a -> [a] -> [a]
: t -> [t]
go (t -> t
forall a. Enum a => a -> a
succ t
v)
                 Ordering
EQ -> [t
v]
                 Ordering
GT -> []

  enumFrom from = enumFromTo from ExclusiveProcess

  fromEnum :: ComputeMode -> Int
fromEnum ComputeMode
Default = Int
0
  fromEnum ComputeMode
Prohibited = Int
2
  fromEnum ComputeMode
ExclusiveProcess = Int
3

  toEnum :: Int -> ComputeMode
toEnum Int
0 = ComputeMode
Default
  toEnum Int
2 = ComputeMode
Prohibited
  toEnum Int
3 = ComputeMode
ExclusiveProcess
  toEnum Int
unmatched = String -> ComputeMode
forall a. HasCallStack => String -> a
error (String
"ComputeMode.toEnum: Cannot match " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
unmatched)

{-# LINE 33 "src/Foreign/CUDA/Analysis/Device.chs" #-}


instance Describe ComputeMode where
  describe Default          = "Multiple contexts are allowed on the device simultaneously"
  describe Prohibited       = "No contexts can be created on this device at this time"
  describe ExclusiveProcess = "Only one context used by a single process can be present on this device at a time"


-- |
-- GPU compute capability, major and minor revision number respectively.
--
data Compute = Compute !Int !Int
  deriving Eq

instance Show Compute where
  show (Compute major minor) = show major ++ "." ++ show minor

instance Ord Compute where
  compare (Compute m1 n1) (Compute m2 n2) =
    case compare m1 m2 of
      EQ -> compare n1 n2
      x  -> x

{--
cap :: Int -> Int -> Double
cap a 0 = fromIntegral a
cap a b = let a' = fromIntegral a in
            let b' = fromIntegral b in
            a' + b' / max 10 (10^ ((ceiling . logBase 10) b' :: Int))
--}

-- |
-- The properties of a compute device
--
data DeviceProperties = DeviceProperties
  {
    DeviceProperties -> String
deviceName                    :: !String          -- ^ Identifier
  , DeviceProperties -> Compute
computeCapability             :: !Compute         -- ^ Supported compute capability
  , DeviceProperties -> Int64
totalGlobalMem                :: !Int64           -- ^ Available global memory on the device in bytes
  , DeviceProperties -> Int64
totalConstMem                 :: !Int64           -- ^ Available constant memory on the device in bytes
  , DeviceProperties -> Int64
sharedMemPerBlock             :: !Int64           -- ^ Available shared memory per block in bytes
  , DeviceProperties -> Int
regsPerBlock                  :: !Int             -- ^ 32-bit registers per block
  , DeviceProperties -> Int
warpSize                      :: !Int             -- ^ Warp size in threads (SIMD width)
  , DeviceProperties -> Int
maxThreadsPerBlock            :: !Int             -- ^ Maximum number of threads per block
  , DeviceProperties -> Int
maxThreadsPerMultiProcessor   :: !Int             -- ^ Maximum number of threads per multiprocessor
  , DeviceProperties -> (Int, Int, Int)
maxBlockSize                  :: !(Int,Int,Int)   -- ^ Maximum size of each dimension of a block
  , DeviceProperties -> (Int, Int, Int)
maxGridSize                   :: !(Int,Int,Int)   -- ^ Maximum size of each dimension of a grid
  , DeviceProperties -> Int
maxTextureDim1D               :: !Int             -- ^ Maximum texture dimensions
  , DeviceProperties -> (Int, Int)
maxTextureDim2D               :: !(Int,Int)
  , DeviceProperties -> (Int, Int, Int)
maxTextureDim3D               :: !(Int,Int,Int)
  , DeviceProperties -> Int
clockRate                     :: !Int             -- ^ Clock frequency in kilohertz
  , DeviceProperties -> Int
multiProcessorCount           :: !Int             -- ^ Number of multiprocessors on the device
  , DeviceProperties -> Int64
memPitch                      :: !Int64           -- ^ Maximum pitch in bytes allowed by memory copies
  , DeviceProperties -> Int
memBusWidth                   :: !Int             -- ^ Global memory bus width in bits
  , DeviceProperties -> Int
memClockRate                  :: !Int             -- ^ Peak memory clock frequency in kilohertz
  , DeviceProperties -> Int64
textureAlignment              :: !Int64           -- ^ Alignment requirement for textures
  , DeviceProperties -> ComputeMode
computeMode                   :: !ComputeMode
  , DeviceProperties -> Bool
deviceOverlap                 :: !Bool            -- ^ Device can concurrently copy memory and execute a kernel
  , DeviceProperties -> Bool
concurrentKernels             :: !Bool            -- ^ Device can possibly execute multiple kernels concurrently
  , DeviceProperties -> Bool
eccEnabled                    :: !Bool            -- ^ Device supports and has enabled error correction
  , DeviceProperties -> Int
asyncEngineCount              :: !Int             -- ^ Number of asynchronous engines
  , DeviceProperties -> Int
cacheMemL2                    :: !Int             -- ^ Size of the L2 cache in bytes
  , DeviceProperties -> PCI
pciInfo                       :: !PCI             -- ^ PCI device information for the device
  , DeviceProperties -> Bool
tccDriverEnabled              :: !Bool            -- ^ Whether this is a Tesla device using the TCC driver
  , DeviceProperties -> Bool
kernelExecTimeoutEnabled      :: !Bool            -- ^ Whether there is a runtime limit on kernels
  , DeviceProperties -> Bool
integrated                    :: !Bool            -- ^ As opposed to discrete
  , DeviceProperties -> Bool
canMapHostMemory              :: !Bool            -- ^ Device can use pinned memory
  , DeviceProperties -> Bool
unifiedAddressing             :: !Bool            -- ^ Device shares a unified address space with the host
  , DeviceProperties -> Bool
streamPriorities              :: !Bool            -- ^ Device supports stream priorities
  , DeviceProperties -> Bool
globalL1Cache                 :: !Bool            -- ^ Device supports caching globals in L1 cache
  , DeviceProperties -> Bool
localL1Cache                  :: !Bool            -- ^ Device supports caching locals in L1 cache
  , DeviceProperties -> Bool
managedMemory                 :: !Bool            -- ^ Device supports allocating managed memory on this system
  , DeviceProperties -> Bool
multiGPUBoard                 :: !Bool            -- ^ Device is on a multi-GPU board
  , DeviceProperties -> Int
multiGPUBoardGroupID          :: !Int             -- ^ Unique identifier for a group of devices associated with the same board
  , DeviceProperties -> Bool
preemption                    :: !Bool            -- ^ Device supports compute pre-emption
  , DeviceProperties -> Int
singleToDoublePerfRatio       :: !Int             -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance
  , DeviceProperties -> Bool
cooperativeLaunch             :: !Bool            -- ^ Device supports launching cooperative kernels
  , DeviceProperties -> Bool
cooperativeLaunchMultiDevice  :: !Bool            -- ^ Device can participate in cooperative multi-device kernels
  }
  deriving (Int -> DeviceProperties -> ShowS
[DeviceProperties] -> ShowS
DeviceProperties -> String
(Int -> DeviceProperties -> ShowS)
-> (DeviceProperties -> String)
-> ([DeviceProperties] -> ShowS)
-> Show DeviceProperties
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> DeviceProperties -> ShowS
showsPrec :: Int -> DeviceProperties -> ShowS
$cshow :: DeviceProperties -> String
show :: DeviceProperties -> String
$cshowList :: [DeviceProperties] -> ShowS
showList :: [DeviceProperties] -> ShowS
Show)


data PCI = PCI
  {
    PCI -> Int
busID       :: !Int,      -- ^ PCI bus ID of the device
    PCI -> Int
deviceID    :: !Int,      -- ^ PCI device ID
    PCI -> Int
domainID    :: !Int       -- ^ PCI domain ID
  }
  deriving (Int -> PCI -> ShowS
[PCI] -> ShowS
PCI -> String
(Int -> PCI -> ShowS)
-> (PCI -> String) -> ([PCI] -> ShowS) -> Show PCI
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> PCI -> ShowS
showsPrec :: Int -> PCI -> ShowS
$cshow :: PCI -> String
show :: PCI -> String
$cshowList :: [PCI] -> ShowS
showList :: [PCI] -> ShowS
Show)


-- GPU Hardware Resources
--
-- These are either taken from the CUDA occupancy calculator, or the CUDA
-- wikipedia entry: <https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications>
--
data Allocation      = Warp | Block
  deriving Int -> Allocation -> ShowS
[Allocation] -> ShowS
Allocation -> String
(Int -> Allocation -> ShowS)
-> (Allocation -> String)
-> ([Allocation] -> ShowS)
-> Show Allocation
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> Allocation -> ShowS
showsPrec :: Int -> Allocation -> ShowS
$cshow :: Allocation -> String
show :: Allocation -> String
$cshowList :: [Allocation] -> ShowS
showList :: [Allocation] -> ShowS
Show

data DeviceResources = DeviceResources
  { DeviceResources -> Int
threadsPerWarp          :: !Int         -- ^ Warp size
  , DeviceResources -> Int
coresPerMP              :: !Int         -- ^ Number of SIMD arithmetic units per multiprocessor
  , DeviceResources -> Int
warpsPerMP              :: !Int         -- ^ Maximum number of in-flight warps per multiprocessor
  , DeviceResources -> Int
threadsPerMP            :: !Int         -- ^ Maximum number of in-flight threads on a multiprocessor
  , DeviceResources -> Int
threadBlocksPerMP       :: !Int         -- ^ Maximum number of thread blocks resident on a multiprocessor
  , DeviceResources -> Int
sharedMemPerMP          :: !Int         -- ^ Total amount of shared memory per multiprocessor (bytes)
  , DeviceResources -> Int
maxSharedMemPerBlock    :: !Int         -- ^ Maximum amount of shared memory per thread block (bytes)
  , DeviceResources -> Int
regFileSizePerMP        :: !Int         -- ^ Total number of registers in a multiprocessor
  , DeviceResources -> Int
maxRegPerBlock          :: !Int         -- ^ Maximum number of registers per block
  , DeviceResources -> Int
regAllocUnit            :: !Int         -- ^ Register allocation unit size
  , DeviceResources -> Allocation
regAllocationStyle      :: !Allocation  -- ^ How multiprocessor resources are divided (register allocation granularity)
  , DeviceResources -> Int
maxRegPerThread         :: !Int         -- ^ Maximum number of registers per thread
  , DeviceResources -> Int
sharedMemAllocUnit      :: !Int         -- ^ Shared memory allocation unit size (bytes)
  , DeviceResources -> Int
warpAllocUnit           :: !Int         -- ^ Warp allocation granularity
  , DeviceResources -> Int
warpRegAllocUnit        :: !Int         -- ^ Warp register allocation granularity
  , DeviceResources -> Int
maxGridsPerDevice       :: !Int         -- ^ Maximum number of resident grids per device (concurrent kernels)
  }
  deriving Int -> DeviceResources -> ShowS
[DeviceResources] -> ShowS
DeviceResources -> String
(Int -> DeviceResources -> ShowS)
-> (DeviceResources -> String)
-> ([DeviceResources] -> ShowS)
-> Show DeviceResources
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> DeviceResources -> ShowS
showsPrec :: Int -> DeviceResources -> ShowS
$cshow :: DeviceResources -> String
show :: DeviceResources -> String
$cshowList :: [DeviceResources] -> ShowS
showList :: [DeviceResources] -> ShowS
Show


-- |
-- Extract some additional hardware resource limitations for a given device.
--
deviceResources :: DeviceProperties -> DeviceResources
deviceResources :: DeviceProperties -> DeviceResources
deviceResources = Compute -> DeviceResources
resources (Compute -> DeviceResources)
-> (DeviceProperties -> Compute)
-> DeviceProperties
-> DeviceResources
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DeviceProperties -> Compute
computeCapability
  where
    -- This is mostly extracted from tables in the CUDA occupancy calculator.
    --
    resources :: Compute -> DeviceResources
resources Compute
compute = case Compute
compute of
      Compute Int
1 Int
0 -> Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
1 Int
1)      -- Tesla G80
      Compute Int
1 Int
1 -> DeviceResources              -- Tesla G8x
        { threadsPerWarp :: Int
threadsPerWarp        = Int
32
        , coresPerMP :: Int
coresPerMP            = Int
8
        , warpsPerMP :: Int
warpsPerMP            = Int
24
        , threadsPerMP :: Int
threadsPerMP          = Int
768
        , threadBlocksPerMP :: Int
threadBlocksPerMP     = Int
8
        , sharedMemPerMP :: Int
sharedMemPerMP        = Int
16384
        , maxSharedMemPerBlock :: Int
maxSharedMemPerBlock  = Int
16384
        , regFileSizePerMP :: Int
regFileSizePerMP      = Int
8192
        , maxRegPerBlock :: Int
maxRegPerBlock        = Int
8192
        , regAllocUnit :: Int
regAllocUnit          = Int
256
        , regAllocationStyle :: Allocation
regAllocationStyle    = Allocation
Block
        , maxRegPerThread :: Int
maxRegPerThread       = Int
124
        , sharedMemAllocUnit :: Int
sharedMemAllocUnit    = Int
512
        , warpAllocUnit :: Int
warpAllocUnit         = Int
2
        , warpRegAllocUnit :: Int
warpRegAllocUnit      = Int
256
        , maxGridsPerDevice :: Int
maxGridsPerDevice     = Int
1
        }
      Compute Int
1 Int
2 ->  Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
1 Int
3)     -- Tesla G9x
      Compute Int
1 Int
3 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
1 Int
1))    -- Tesla GT200
        { threadsPerMP          = 1024
        , warpsPerMP            = 32
        , regFileSizePerMP      = 16384
        , maxRegPerBlock        = 16384
        , regAllocUnit          = 512
        }

      Compute Int
2 Int
0 -> DeviceResources              -- Fermi GF100
        { threadsPerWarp :: Int
threadsPerWarp        = Int
32
        , coresPerMP :: Int
coresPerMP            = Int
32
        , warpsPerMP :: Int
warpsPerMP            = Int
48
        , threadsPerMP :: Int
threadsPerMP          = Int
1536
        , threadBlocksPerMP :: Int
threadBlocksPerMP     = Int
8
        , sharedMemPerMP :: Int
sharedMemPerMP        = Int
49152
        , maxSharedMemPerBlock :: Int
maxSharedMemPerBlock  = Int
49152
        , regFileSizePerMP :: Int
regFileSizePerMP      = Int
32768
        , maxRegPerBlock :: Int
maxRegPerBlock        = Int
32768
        , regAllocUnit :: Int
regAllocUnit          = Int
64
        , regAllocationStyle :: Allocation
regAllocationStyle    = Allocation
Warp
        , maxRegPerThread :: Int
maxRegPerThread       = Int
63
        , sharedMemAllocUnit :: Int
sharedMemAllocUnit    = Int
128
        , warpAllocUnit :: Int
warpAllocUnit         = Int
2
        , warpRegAllocUnit :: Int
warpRegAllocUnit      = Int
64
        , maxGridsPerDevice :: Int
maxGridsPerDevice     = Int
16
        }
      Compute Int
2 Int
1 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
2 Int
0))    -- Fermi GF10x
        { coresPerMP            = 48
        }

      Compute Int
3 Int
0 -> DeviceResources              -- Kepler GK10x
        { threadsPerWarp :: Int
threadsPerWarp        = Int
32
        , coresPerMP :: Int
coresPerMP            = Int
192
        , warpsPerMP :: Int
warpsPerMP            = Int
64
        , threadsPerMP :: Int
threadsPerMP          = Int
2048
        , threadBlocksPerMP :: Int
threadBlocksPerMP     = Int
16
        , sharedMemPerMP :: Int
sharedMemPerMP        = Int
49152
        , maxSharedMemPerBlock :: Int
maxSharedMemPerBlock  = Int
49152
        , regFileSizePerMP :: Int
regFileSizePerMP      = Int
65536
        , maxRegPerBlock :: Int
maxRegPerBlock        = Int
65536
        , regAllocUnit :: Int
regAllocUnit          = Int
256
        , regAllocationStyle :: Allocation
regAllocationStyle    = Allocation
Warp
        , maxRegPerThread :: Int
maxRegPerThread       = Int
63
        , sharedMemAllocUnit :: Int
sharedMemAllocUnit    = Int
256
        , warpAllocUnit :: Int
warpAllocUnit         = Int
4
        , warpRegAllocUnit :: Int
warpRegAllocUnit      = Int
256
        , maxGridsPerDevice :: Int
maxGridsPerDevice     = Int
16
        }
      Compute Int
3 Int
2 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
3 Int
5))    -- Jetson TK1
        { maxRegPerBlock        = 32768
        , maxGridsPerDevice     = 4
        }
      Compute Int
3 Int
5 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
3 Int
0))    -- Kepler GK11x
        { maxRegPerThread       = 255
        , maxGridsPerDevice     = 32
        }
      Compute Int
3 Int
7 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
3 Int
5))    -- Kepler GK21x
        { sharedMemPerMP        = 114688
        , regFileSizePerMP      = 131072
        }

      Compute Int
5 Int
0 -> DeviceResources              -- Maxwell GM10x
        { threadsPerWarp :: Int
threadsPerWarp        = Int
32
        , coresPerMP :: Int
coresPerMP            = Int
128
        , warpsPerMP :: Int
warpsPerMP            = Int
64
        , threadsPerMP :: Int
threadsPerMP          = Int
2048
        , threadBlocksPerMP :: Int
threadBlocksPerMP     = Int
32
        , sharedMemPerMP :: Int
sharedMemPerMP        = Int
65536
        , maxSharedMemPerBlock :: Int
maxSharedMemPerBlock  = Int
49152
        , regFileSizePerMP :: Int
regFileSizePerMP      = Int
65536
        , maxRegPerBlock :: Int
maxRegPerBlock        = Int
65536
        , regAllocUnit :: Int
regAllocUnit          = Int
256
        , regAllocationStyle :: Allocation
regAllocationStyle    = Allocation
Warp
        , maxRegPerThread :: Int
maxRegPerThread       = Int
255
        , sharedMemAllocUnit :: Int
sharedMemAllocUnit    = Int
256
        , warpAllocUnit :: Int
warpAllocUnit         = Int
4
        , warpRegAllocUnit :: Int
warpRegAllocUnit      = Int
256
        , maxGridsPerDevice :: Int
maxGridsPerDevice     = Int
32
        }
      Compute Int
5 Int
2 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
5 Int
0))    -- Maxwell GM20x
        { sharedMemPerMP        = 98304
        , maxRegPerBlock        = 32768
        , warpAllocUnit         = 2
        }
      Compute Int
5 Int
3 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
5 Int
0))    -- Maxwell GM20B
        { maxRegPerBlock        = 32768
        , warpAllocUnit         = 2
        , maxGridsPerDevice     = 16
        }

      Compute Int
6 Int
0 -> DeviceResources              -- Pascal GP100
        { threadsPerWarp :: Int
threadsPerWarp        = Int
32
        , coresPerMP :: Int
coresPerMP            = Int
64
        , warpsPerMP :: Int
warpsPerMP            = Int
64
        , threadsPerMP :: Int
threadsPerMP          = Int
2048
        , threadBlocksPerMP :: Int
threadBlocksPerMP     = Int
32
        , sharedMemPerMP :: Int
sharedMemPerMP        = Int
65536
        , maxSharedMemPerBlock :: Int
maxSharedMemPerBlock  = Int
49152
        , regFileSizePerMP :: Int
regFileSizePerMP      = Int
65536
        , maxRegPerBlock :: Int
maxRegPerBlock        = Int
65536
        , regAllocUnit :: Int
regAllocUnit          = Int
256
        , regAllocationStyle :: Allocation
regAllocationStyle    = Allocation
Warp
        , maxRegPerThread :: Int
maxRegPerThread       = Int
255
        , sharedMemAllocUnit :: Int
sharedMemAllocUnit    = Int
256
        , warpAllocUnit :: Int
warpAllocUnit         = Int
2
        , warpRegAllocUnit :: Int
warpRegAllocUnit      = Int
256
        , maxGridsPerDevice :: Int
maxGridsPerDevice     = Int
128
        }
      Compute Int
6 Int
1 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
6 Int
0))    -- Pascal GP10x
        { coresPerMP            = 128
        , sharedMemPerMP        = 98304
        , warpAllocUnit         = 4
        , maxGridsPerDevice     = 32
        }
      Compute Int
6 Int
2 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
6 Int
0))    -- Pascal GP10B
        { coresPerMP            = 128
        , warpsPerMP            = 128
        , threadBlocksPerMP     = 4096
        , maxRegPerBlock        = 32768
        , warpAllocUnit         = 4
        , maxGridsPerDevice     = 16
        }

      Compute Int
7 Int
0 -> DeviceResources              -- Volta GV100
        { threadsPerWarp :: Int
threadsPerWarp        = Int
32
        , coresPerMP :: Int
coresPerMP            = Int
64
        , warpsPerMP :: Int
warpsPerMP            = Int
64
        , threadsPerMP :: Int
threadsPerMP          = Int
2048
        , threadBlocksPerMP :: Int
threadBlocksPerMP     = Int
32
        , sharedMemPerMP :: Int
sharedMemPerMP        = Int
98304           -- of 128KB
        , maxSharedMemPerBlock :: Int
maxSharedMemPerBlock  = Int
98304
        , regFileSizePerMP :: Int
regFileSizePerMP      = Int
65536
        , maxRegPerBlock :: Int
maxRegPerBlock        = Int
65536
        , regAllocUnit :: Int
regAllocUnit          = Int
256
        , regAllocationStyle :: Allocation
regAllocationStyle    = Allocation
Warp
        , maxRegPerThread :: Int
maxRegPerThread       = Int
255
        , sharedMemAllocUnit :: Int
sharedMemAllocUnit    = Int
256
        , warpAllocUnit :: Int
warpAllocUnit         = Int
4
        , warpRegAllocUnit :: Int
warpRegAllocUnit      = Int
256
        , maxGridsPerDevice :: Int
maxGridsPerDevice     = Int
128
        }

      Compute Int
7 Int
2 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
7 Int
0))    -- Volta GV10B
        { maxGridsPerDevice     = 16
        , maxSharedMemPerBlock  = 49152
        }

      Compute Int
7 Int
5 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
7 Int
0))    -- Turing TU1xx
        { warpsPerMP            = 32
        , threadBlocksPerMP     = 16
        , threadsPerMP          = 1024
        , maxGridsPerDevice     = 128
        , sharedMemPerMP        = 65536           -- of 96KB
        , maxSharedMemPerBlock  = 65536
        }

      Compute Int
8 Int
0 -> DeviceResources              -- Ampere GA100
        { threadsPerWarp :: Int
threadsPerWarp        = Int
32
        , coresPerMP :: Int
coresPerMP            = Int
64
        , warpsPerMP :: Int
warpsPerMP            = Int
64
        , threadsPerMP :: Int
threadsPerMP          = Int
2048
        , threadBlocksPerMP :: Int
threadBlocksPerMP     = Int
32
        , sharedMemPerMP :: Int
sharedMemPerMP        = Int
167936          -- of 192KB
        , maxSharedMemPerBlock :: Int
maxSharedMemPerBlock  = Int
167936
        , regFileSizePerMP :: Int
regFileSizePerMP      = Int
65536
        , maxRegPerBlock :: Int
maxRegPerBlock        = Int
65536
        , regAllocUnit :: Int
regAllocUnit          = Int
256
        , regAllocationStyle :: Allocation
regAllocationStyle    = Allocation
Warp
        , maxRegPerThread :: Int
maxRegPerThread       = Int
255
        , sharedMemAllocUnit :: Int
sharedMemAllocUnit    = Int
128
        , warpAllocUnit :: Int
warpAllocUnit         = Int
4
        , warpRegAllocUnit :: Int
warpRegAllocUnit      = Int
256
        , maxGridsPerDevice :: Int
maxGridsPerDevice     = Int
128
        }

      Compute Int
8 Int
6 -> (Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
8 Int
0))    -- Ampere GA102
        { warpsPerMP            = 48
        , threadsPerMP          = 1536
        , threadBlocksPerMP     = 16
        , sharedMemPerMP        = 102400
        , maxSharedMemPerBlock  = 102400
        }

      -- Something might have gone wrong, or the library just needs to be
      -- updated for the next generation of hardware, in which case we just want
      -- to pick a sensible default and carry on.
      --
      -- This is slightly dodgy as the warning message is coming from pure code.
      -- However, it should be OK because all library functions run in IO, so it
      -- is likely the user code is as well.
      --
      Compute
_           -> String -> DeviceResources -> DeviceResources
forall a. String -> a -> a
trace String
warning (DeviceResources -> DeviceResources)
-> DeviceResources -> DeviceResources
forall a b. (a -> b) -> a -> b
$ Compute -> DeviceResources
resources (Int -> Int -> Compute
Compute Int
6 Int
0)
        where warning :: String
warning = [String] -> String
unlines [ String
"*** Warning: Unknown CUDA device compute capability: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Compute -> String
forall a. Show a => a -> String
show Compute
compute
                                , String
"*** Please submit a bug report at https://github.com/tmcdonell/cuda/issues" ]