{-# LANGUAGE CPP, ForeignFunctionInterface, EmptyDataDecls #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Runtime.Device
-- Copyright : (c) [2009..2011] Trevor L. McDonell
-- License   : BSD
--
-- Device management routines
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Runtime.Device (

  -- * Device Management
  ComputeMode(..), Device, DeviceFlag(..), DeviceProperties(..),
  choose, get, count, props, set, setFlags, setOrder, reset, sync,

  -- * Peer Access
  PeerFlag,
  accessible, add, remove,

  -- * Cache Configuration
  Limit(..),
  getLimit, setLimit

) where

#include <cuda_runtime_api.h>
{# context lib="cudart" #}

-- Friends
import Foreign.CUDA.Analysis.Device
import Foreign.CUDA.Runtime.Error
import Foreign.CUDA.Internal.C2HS
import Foreign.CUDA.Internal.Offsets

-- System
import Foreign
import Foreign.C

#c
typedef struct cudaDeviceProp   cudaDeviceProp;

typedef enum
{
    cudaDeviceFlagScheduleAuto    = cudaDeviceScheduleAuto,
    cudaDeviceFlagScheduleSpin    = cudaDeviceScheduleSpin,
    cudaDeviceFlagScheduleYield   = cudaDeviceScheduleYield,
    cudaDeviceFlagBlockingSync    = cudaDeviceBlockingSync,
    cudaDeviceFlagMapHost         = cudaDeviceMapHost,
#if CUDART_VERSION >= 3000
    cudaDeviceFlagLMemResizeToMax = cudaDeviceLmemResizeToMax
#endif
} cudaDeviceFlags;
#endc


--------------------------------------------------------------------------------
-- Data Types
--------------------------------------------------------------------------------

-- |
-- A device identifier
--
type Device = Int

{# pointer *cudaDeviceProp as ^ foreign -> DeviceProperties nocode #}

-- |
-- Device execution flags
--
{# enum cudaDeviceFlags as DeviceFlag { }
    with prefix="cudaDeviceFlag" deriving (Eq, Show) #}


instance Storable DeviceProperties where
  sizeOf _    = {#sizeof cudaDeviceProp#}
  alignment _ = alignment (undefined :: Ptr ())

  poke _ _    = error "no instance for Foreign.Storable.poke DeviceProperties"
  peek p      = do
    gm <- cIntConv     `fmap` {#get cudaDeviceProp.totalGlobalMem#} p
    sm <- cIntConv     `fmap` {#get cudaDeviceProp.sharedMemPerBlock#} p
    rb <- cIntConv     `fmap` {#get cudaDeviceProp.regsPerBlock#} p
    ws <- cIntConv     `fmap` {#get cudaDeviceProp.warpSize#} p
    mp <- cIntConv     `fmap` {#get cudaDeviceProp.memPitch#} p
    tb <- cIntConv     `fmap` {#get cudaDeviceProp.maxThreadsPerBlock#} p
    cl <- cIntConv     `fmap` {#get cudaDeviceProp.clockRate#} p
    cm <- cIntConv     `fmap` {#get cudaDeviceProp.totalConstMem#} p
    v1 <- fromIntegral `fmap` {#get cudaDeviceProp.major#} p
    v2 <- fromIntegral `fmap` {#get cudaDeviceProp.minor#} p
    ta <- cIntConv     `fmap` {#get cudaDeviceProp.textureAlignment#} p
    ov <- cToBool      `fmap` {#get cudaDeviceProp.deviceOverlap#} p
    pc <- cIntConv     `fmap` {#get cudaDeviceProp.multiProcessorCount#} p
    ke <- cToBool      `fmap` {#get cudaDeviceProp.kernelExecTimeoutEnabled#} p
    tg <- cToBool      `fmap` {#get cudaDeviceProp.integrated#} p
    hm <- cToBool      `fmap` {#get cudaDeviceProp.canMapHostMemory#} p
    md <- cToEnum      `fmap` {#get cudaDeviceProp.computeMode#} p
#if CUDART_VERSION >= 3000
    ck <- cToBool      `fmap` {#get cudaDeviceProp.concurrentKernels#} p
    u1 <- cIntConv     `fmap` {#get cudaDeviceProp.maxTexture1D#} p
#endif
#if CUDART_VERSION >= 3010
    ee <- cToBool      `fmap` {#get cudaDeviceProp.ECCEnabled#} p
#endif
#if CUDART_VERSION >= 4000
    ae <- cIntConv     `fmap` {#get cudaDeviceProp.asyncEngineCount#} p
    l2 <- cIntConv     `fmap` {#get cudaDeviceProp.l2CacheSize#} p
    tm <- cIntConv     `fmap` {#get cudaDeviceProp.maxThreadsPerMultiProcessor#} p
    mw <- cIntConv     `fmap` {#get cudaDeviceProp.memoryBusWidth#} p
    mc <- cIntConv     `fmap` {#get cudaDeviceProp.memoryClockRate#} p
    pb <- cIntConv     `fmap` {#get cudaDeviceProp.pciBusID#} p
    pd <- cIntConv     `fmap` {#get cudaDeviceProp.pciDeviceID#} p
    pm <- cIntConv     `fmap` {#get cudaDeviceProp.pciDomainID#} p
    tc <- cToBool      `fmap` {#get cudaDeviceProp.tccDriver#} p
    ua <- cToBool      `fmap` {#get cudaDeviceProp.unifiedAddressing#} p
#endif

    --
    -- C->Haskell returns the wrong type when accessing static arrays in
    -- structs, returning the dereferenced element but with a Ptr type. Work
    -- around this with manual pointer arithmetic...
    --
    n            <- peekCString (p `plusPtr` devNameOffset)
    (t1:t2:t3:_) <- map cIntConv `fmap` peekArray 3 (p `plusPtr` devMaxThreadDimOffset :: Ptr CInt)
    (g1:g2:g3:_) <- map cIntConv `fmap` peekArray 3 (p `plusPtr` devMaxGridSizeOffset  :: Ptr CInt)
#if CUDART_VERSION >= 3000
    (u21:u22:_)     <- map cIntConv `fmap` peekArray 2 (p `plusPtr` devMaxTexture2DOffset :: Ptr CInt)
    (u31:u32:u33:_) <- map cIntConv `fmap` peekArray 3 (p `plusPtr` devMaxTexture3DOffset :: Ptr CInt)
#endif

    let cap a 0 = a
        cap a b = a + b / max 10 (10^ ((ceiling . logBase 10) b :: Int))

    return DeviceProperties
      {
        deviceName                      = n,
        computeCapability               = cap v1 v2,
        totalGlobalMem                  = gm,
        totalConstMem                   = cm,
        sharedMemPerBlock               = sm,
        regsPerBlock                    = rb,
        warpSize                        = ws,
        maxThreadsPerBlock              = tb,
        maxBlockSize                    = (t1,t2,t3),
        maxGridSize                     = (g1,g2,g3),
        clockRate                       = cl,
        multiProcessorCount             = pc,
        memPitch                        = mp,
        textureAlignment                = ta,
        computeMode                     = md,
        deviceOverlap                   = ov,
#if CUDART_VERSION >= 3000
        concurrentKernels               = ck,
        maxTextureDim1D                 = u1,
        maxTextureDim2D                 = (u21,u22),
        maxTextureDim3D                 = (u31,u32,u33),
#endif
#if CUDART_VERSION >= 3010
        eccEnabled                      = ee,
#endif
#if CUDART_VERSION >= 3000 && CUDART_VERSION < 3010
        -- not visible from runtime API < 3.1
        eccEnabled                      = False,
#endif
#if CUDART_VERSION >= 4000
        asyncEngineCount                = ae,
        cacheMemL2                      = l2,
        maxThreadsPerMultiProcessor     = tm,
        memBusWidth                     = mw,
        memClockRate                    = mc,
        tccDriverEnabled                = tc,
        unifiedAddressing               = ua,
        pciInfo                         = PCI pb pd pm,
#endif
        kernelExecTimeoutEnabled        = ke,
        integrated                      = tg,
        canMapHostMemory                = hm
      }


--------------------------------------------------------------------------------
-- Device Management
--------------------------------------------------------------------------------

-- |
-- Select the compute device which best matches the given criteria
--
choose :: DeviceProperties -> IO Device
choose dev = resultIfOk =<< cudaChooseDevice dev

{# fun unsafe cudaChooseDevice
  { alloca-      `Int'              peekIntConv*
  , withDevProp* `DeviceProperties'              } -> `Status' cToEnum #}
  where
      withDevProp = with


-- |
-- Returns which device is currently being used
--
get :: IO Device
get = resultIfOk =<< cudaGetDevice

{# fun unsafe cudaGetDevice
  { alloca- `Int' peekIntConv* } -> `Status' cToEnum #}


-- |
-- Returns the number of devices available for execution, with compute
-- capability >= 1.0
--
count :: IO Int
count = resultIfOk =<< cudaGetDeviceCount

{# fun unsafe cudaGetDeviceCount
  { alloca- `Int' peekIntConv* } -> `Status' cToEnum #}


-- |
-- Return information about the selected compute device
--
props :: Device -> IO DeviceProperties
props n = resultIfOk =<< cudaGetDeviceProperties n

{# fun unsafe cudaGetDeviceProperties
  { alloca- `DeviceProperties' peek*
  ,         `Int'                    } -> `Status' cToEnum #}


-- |
-- Set device to be used for GPU execution
--
set :: Device -> IO ()
set n = nothingIfOk =<< cudaSetDevice n

{# fun unsafe cudaSetDevice
  { `Int' } -> `Status' cToEnum #}


-- |
-- Set flags to be used for device executions
--
setFlags :: [DeviceFlag] -> IO ()
setFlags f = nothingIfOk =<< cudaSetDeviceFlags (combineBitMasks f)

{# fun unsafe cudaSetDeviceFlags
  { `Int' } -> `Status' cToEnum #}


-- |
-- Set list of devices for CUDA execution in priority order
--
setOrder :: [Device] -> IO ()
setOrder l = nothingIfOk =<< cudaSetValidDevices l (length l)

{# fun unsafe cudaSetValidDevices
  { withArrayIntConv* `[Int]'
  ,                   `Int'   } -> `Status' cToEnum #}
  where
      withArrayIntConv = withArray . map cIntConv

-- |
-- Block until the device has completed all preceding requested tasks. Returns
-- an error if one of the tasks fails.
--
sync :: IO ()
#if CUDART_VERSION < 4000
sync = nothingIfOk =<< cudaThreadSynchronize
{# fun unsafe cudaThreadSynchronize { } -> `Status' cToEnum #}
#else
sync = nothingIfOk =<< cudaDeviceSynchronize
{# fun unsafe cudaDeviceSynchronize { } -> `Status' cToEnum #}
#endif

-- |
-- Explicitly destroys and cleans up all runtime resources associated with the
-- current device in the current process. Any subsequent API call will
-- reinitialise the device.
--
-- Note that this function will reset the device immediately. It is the caller’s
-- responsibility to ensure that the device is not being accessed by any other
-- host threads from the process when this function is called.
--
reset :: IO ()
#if CUDART_VERSION >= 4000
reset = nothingIfOk =<< cudaDeviceReset
{# fun unsafe cudaDeviceReset { } -> `Status' cToEnum #}
#else
reset = nothingIfOk =<< cudaThreadExit
{# fun unsafe cudaThreadExit  { } -> `Status' cToEnum #}
#endif


--------------------------------------------------------------------------------
-- Peer Access
--------------------------------------------------------------------------------

-- |
-- Possible option values for direct peer memory access
--
data PeerFlag
instance Enum PeerFlag where

-- |
-- Queries if the first device can directly access the memory of the second. If
-- direct access is possible, it can then be enabled with 'add'. Requires
-- cuda-4.0.
--
accessible :: Device -> Device -> IO Bool
#if CUDART_VERSION < 4000
accessible _   _    = requireSDK 4.0 "accessible"
#else
accessible dev peer = resultIfOk =<< cudaDeviceCanAccessPeer dev peer

{# fun unsafe cudaDeviceCanAccessPeer
  { alloca-  `Bool'   peekBool*
  , cIntConv `Device'
  , cIntConv `Device'           } -> `Status' cToEnum #}
#endif

-- |
-- If the devices of both the current and supplied contexts support unified
-- addressing, then enable allocations in the supplied context to be accessible
-- by the current context. Requires cuda-4.0.
--
add :: Device -> [PeerFlag] -> IO ()
#if CUDART_VERSION < 4000
add _   _     = requireSDK 4.0 "add"
#else
add dev flags = nothingIfOk =<< cudaDeviceEnablePeerAccess dev flags

{# fun unsafe cudaDeviceEnablePeerAccess
  { cIntConv        `Device'
  , combineBitMasks `[PeerFlag]' } -> `Status' cToEnum #}
#endif


-- |
-- Disable direct memory access from the current context to the supplied
-- context. Requires cuda-4.0.
--
remove :: Device -> IO ()
#if CUDART_VERSION < 4000
remove _   = requireSDK 4.0 "remove"
#else
remove dev = nothingIfOk =<< cudaDeviceDisablePeerAccess dev

{# fun unsafe cudaDeviceDisablePeerAccess
  { cIntConv `Device' } -> `Status' cToEnum #}
#endif


--------------------------------------------------------------------------------
-- Cache Configuration
--------------------------------------------------------------------------------

-- |
-- Device limit flags
--
#if CUDART_VERSION < 3010
data Limit
#else
{# enum cudaLimit as Limit
    { underscoreToCase }
    with prefix="cudaLimit" deriving (Eq, Show) #}
#endif


-- |
-- Query compute 2.0 call stack limits. Requires cuda-3.1.
--
getLimit :: Limit -> IO Int
#if   CUDART_VERSION < 3010
getLimit _ = requireSDK 3.1 "getLimit"
#elif CUDART_VERSION < 4000
getLimit l = resultIfOk =<< cudaThreadGetLimit l

{# fun unsafe cudaThreadGetLimit
  { alloca-   `Int' peekIntConv*
  , cFromEnum `Limit'            } -> `Status' cToEnum #}
#else
getLimit l = resultIfOk =<< cudaDeviceGetLimit l

{# fun unsafe cudaDeviceGetLimit
  { alloca-   `Int' peekIntConv*
  , cFromEnum `Limit'            } -> `Status' cToEnum #}
#endif


-- |
-- Set compute 2.0 call stack limits. Requires cuda-3.1.
--
setLimit :: Limit -> Int -> IO ()
#if   CUDART_VERSION < 3010
setLimit _ _ = requireSDK 3.1 "setLimit"
#elif CUDART_VERSION < 4000
setLimit l n = nothingIfOk =<< cudaThreadSetLimit l n

{# fun unsafe cudaThreadSetLimit
  { cFromEnum `Limit'
  , cIntConv  `Int'   } -> `Status' cToEnum #}
#else
setLimit l n = nothingIfOk =<< cudaDeviceSetLimit l n

{# fun unsafe cudaDeviceSetLimit
  { cFromEnum `Limit'
  , cIntConv  `Int'   } -> `Status' cToEnum #}
#endif