-- GENERATED by C->Haskell Compiler, version 0.28.1 Switcheroo, 1 April 2016 (Haskell)
-- Edit the ORIGNAL .chs file instead!


{-# LINE 1 "src/Foreign/CUDA/Analysis/Device.chs" #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Analysis.Device
-- Copyright : [2009..2014] Trevor L. McDonell
-- License   : BSD
--
-- Common device functions
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Analysis.Device (

    Compute(..), ComputeMode(..),
    DeviceProperties(..), DeviceResources(..), Allocation(..), PCI(..),
    deviceResources,
    describe

) where





import Data.Int
import Text.Show.Describe

import Debug.Trace


-- |
-- The compute mode the device is currently in
--
data ComputeMode = Default
                 | Prohibited
                 | ExclusiveProcess
  deriving (Eq,Show)
instance Enum ComputeMode where
  succ Default = Prohibited
  succ Prohibited = ExclusiveProcess
  succ ExclusiveProcess = error "ComputeMode.succ: ExclusiveProcess has no successor"

  pred Prohibited = Default
  pred ExclusiveProcess = Prohibited
  pred Default = error "ComputeMode.pred: Default has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from ExclusiveProcess

  fromEnum Default = 0
  fromEnum Prohibited = 2
  fromEnum ExclusiveProcess = 3

  toEnum 0 = Default
  toEnum 2 = Prohibited
  toEnum 3 = ExclusiveProcess
  toEnum unmatched = error ("ComputeMode.toEnum: Cannot match " ++ show unmatched)

{-# LINE 33 "src/Foreign/CUDA/Analysis/Device.chs" #-}


instance Describe ComputeMode where
  describe Default          = "Multiple contexts are allowed on the device simultaneously"
  describe Prohibited       = "No contexts can be created on this device at this time"
  describe ExclusiveProcess = "Only one context used by a single process can be present on this device at a time"


-- |
-- GPU compute capability, major and minor revision number respectively.
--
data Compute = Compute !Int !Int
  deriving Eq

instance Show Compute where
  show (Compute major minor) = show major ++ "." ++ show minor

instance Ord Compute where
  compare (Compute m1 n1) (Compute m2 n2) =
    case compare m1 m2 of
      EQ -> compare n1 n2
      x  -> x

{--
cap :: Int -> Int -> Double
cap a 0 = fromIntegral a
cap a b = let a' = fromIntegral a in
            let b' = fromIntegral b in
            a' + b' / max 10 (10^ ((ceiling . logBase 10) b' :: Int))
--}

-- |
-- The properties of a compute device
--
data DeviceProperties = DeviceProperties
  {
    deviceName                  :: !String              -- ^ Identifier
  , computeCapability           :: !Compute             -- ^ Supported compute capability
  , totalGlobalMem              :: !Int64               -- ^ Available global memory on the device in bytes
  , totalConstMem               :: !Int64               -- ^ Available constant memory on the device in bytes
  , sharedMemPerBlock           :: !Int64               -- ^ Available shared memory per block in bytes
  , regsPerBlock                :: !Int                 -- ^ 32-bit registers per block
  , warpSize                    :: !Int                 -- ^ Warp size in threads (SIMD width)
  , maxThreadsPerBlock          :: !Int                 -- ^ Maximum number of threads per block
  , maxThreadsPerMultiProcessor :: !Int                 -- ^ Maximum number of threads per multiprocessor
  , maxBlockSize                :: !(Int,Int,Int)       -- ^ Maximum size of each dimension of a block
  , maxGridSize                 :: !(Int,Int,Int)       -- ^ Maximum size of each dimension of a grid
  , maxTextureDim1D             :: !Int                 -- ^ Maximum texture dimensions
  , maxTextureDim2D             :: !(Int,Int)
  , maxTextureDim3D             :: !(Int,Int,Int)
  , clockRate                   :: !Int                 -- ^ Clock frequency in kilohertz
  , multiProcessorCount         :: !Int                 -- ^ Number of multiprocessors on the device
  , memPitch                    :: !Int64               -- ^ Maximum pitch in bytes allowed by memory copies
  , memBusWidth                 :: !Int                 -- ^ Global memory bus width in bits
  , memClockRate                :: !Int                 -- ^ Peak memory clock frequency in kilohertz
  , textureAlignment            :: !Int64               -- ^ Alignment requirement for textures
  , computeMode                 :: !ComputeMode
  , deviceOverlap               :: !Bool                -- ^ Device can concurrently copy memory and execute a kernel
  , concurrentKernels           :: !Bool                -- ^ Device can possibly execute multiple kernels concurrently
  , eccEnabled                  :: !Bool                -- ^ Device supports and has enabled error correction
  , asyncEngineCount            :: !Int                 -- ^ Number of asynchronous engines
  , cacheMemL2                  :: !Int                 -- ^ Size of the L2 cache in bytes
  , pciInfo                     :: !PCI                 -- ^ PCI device information for the device
  , tccDriverEnabled            :: !Bool                -- ^ Whether this is a Tesla device using the TCC driver
  , kernelExecTimeoutEnabled    :: !Bool                -- ^ Whether there is a runtime limit on kernels
  , integrated                  :: !Bool                -- ^ As opposed to discrete
  , canMapHostMemory            :: !Bool                -- ^ Device can use pinned memory
  , unifiedAddressing           :: !Bool                -- ^ Device shares a unified address space with the host
  , streamPriorities            :: !Bool                -- ^ Device supports stream priorities
  , globalL1Cache               :: !Bool                -- ^ Device supports caching globals in L1 cache
  , localL1Cache                :: !Bool                -- ^ Device supports caching locals in L1 cache
  , managedMemory               :: !Bool                -- ^ Device supports allocating managed memory on this system
  , multiGPUBoard               :: !Bool                -- ^ Device is on a multi-GPU board
  , multiGPUBoardGroupID        :: !Int                 -- ^ Unique identifier for a group of devices associated with the same board
  }
  deriving (Show)


data PCI = PCI
  {
    busID       :: !Int,                -- ^ PCI bus ID of the device
    deviceID    :: !Int,                -- ^ PCI device ID
    domainID    :: !Int                 -- ^ PCI domain ID
  }
  deriving (Show)


-- GPU Hardware Resources
--
data Allocation      = Warp | Block
data DeviceResources = DeviceResources
  {
    threadsPerWarp      :: !Int,        -- ^ Warp size
    threadsPerMP        :: !Int,        -- ^ Maximum number of in-flight threads on a multiprocessor
    threadBlocksPerMP   :: !Int,        -- ^ Maximum number of thread blocks resident on a multiprocessor
    warpsPerMP          :: !Int,        -- ^ Maximum number of in-flight warps per multiprocessor
    coresPerMP          :: !Int,        -- ^ Number of SIMD arithmetic units per multiprocessor
    sharedMemPerMP      :: !Int,        -- ^ Total amount of shared memory per multiprocessor (bytes)
    sharedMemAllocUnit  :: !Int,        -- ^ Shared memory allocation unit size (bytes)
    regFileSize         :: !Int,        -- ^ Total number of registers in a multiprocessor
    regAllocUnit        :: !Int,        -- ^ Register allocation unit size
    regAllocWarp        :: !Int,        -- ^ Register allocation granularity for warps
    regPerThread        :: !Int,        -- ^ Maximum number of registers per thread
    allocation          :: !Allocation  -- ^ How multiprocessor resources are divided
  }


-- |
-- Extract some additional hardware resource limitations for a given device.
--
deviceResources :: DeviceProperties -> DeviceResources
deviceResources = resources . computeCapability
  where
    -- This is mostly extracted from tables in the CUDA occupancy calculator.
    --
    resources compute = case compute of
      Compute 1 0 -> DeviceResources 32  768  8 24   8  16384 512   8192 256 2 124 Block  -- Tesla G80
      Compute 1 1 -> DeviceResources 32  768  8 24   8  16384 512   8192 256 2 124 Block  -- Tesla G8x
      Compute 1 2 -> DeviceResources 32 1024  8 32   8  16384 512  16384 512 2 124 Block  -- Tesla G9x
      Compute 1 3 -> DeviceResources 32 1024  8 32   8  16384 512  16384 512 2 124 Block  -- Tesla GT200
      Compute 2 0 -> DeviceResources 32 1536  8 48  32  49152 128  32768  64 2  63 Warp   -- Fermi GF100
      Compute 2 1 -> DeviceResources 32 1536  8 48  48  49152 128  32768  64 2  63 Warp   -- Fermi GF10x
      Compute 3 0 -> DeviceResources 32 2048 16 64 192  49152 256  65536 256 4  63 Warp   -- Kepler GK10x
      Compute 3 2 -> DeviceResources 32 2048 16 64 192  49152 256  65536 256 4 255 Warp   -- Jetson TK1
      Compute 3 5 -> DeviceResources 32 2048 16 64 192  49152 256  65536 256 4 255 Warp   -- Kepler GK11x
      Compute 3 7 -> DeviceResources 32 2048 16 64 192 114688 256 131072 256 4 266 Warp   -- Kepler GK21x
      Compute 5 0 -> DeviceResources 32 2048 32 64 128  65536 256  65536 256 4 255 Warp   -- Maxwell GM10x
      Compute 5 2 -> DeviceResources 32 2048 32 64 128  98304 256  65536 256 4 255 Warp   -- Maxwell GM20x
      Compute 5 3 -> DeviceResources 32 2048 32 64 128  65536 256  65536 256 4 255 Warp   -- Maxwell GM20B
      Compute 6 0 -> DeviceResources 32 2048 32 64  64  65536 256  65536 256 4 255 Warp   -- Pascal GP100 (?)
      Compute 6 1 -> DeviceResources 32 2048 32 64 128  98304 256  65536 256 4 255 Warp   -- Pascal GP10x (?)
      Compute 6 2 -> DeviceResources 32 2048 32 64 128  65536 256  65536 256 4 255 Warp   -- Pascal (?)

      -- Something might have gone wrong, or the library just needs to be
      -- updated for the next generation of hardware, in which case we just want
      -- to pick a sensible default and carry on.
      --
      -- This is slightly dodgy as the warning message is coming from pure code.
      -- However, it should be OK because all library functions run in IO, so it
      -- is likely the user code is as well.
      --
      _           -> trace warning $ resources (Compute 3 0)
        where warning = unlines [ "*** Warning: Unknown CUDA device compute capability: " ++ show compute
                                , "*** Please submit a bug report at https://github.com/tmcdonell/cuda/issues" ]