{-# LANGUAGE ForeignFunctionInterface #-} -------------------------------------------------------------------------------- -- | -- Module : Foreign.CUDA.Runtime.Device -- Copyright : (c) 2009 Trevor L. McDonell -- License : BSD -- -- Device management routines -- -------------------------------------------------------------------------------- module Foreign.CUDA.Runtime.Device ( ComputeMode(..), DeviceFlag(..), DeviceProperties(..), -- ** Device management choose, get, count, props, set, setFlags, setOrder ) where #include {# context lib="cudart" #} -- Friends import Foreign.CUDA.Runtime.Error import Foreign.CUDA.Internal.C2HS import Foreign.CUDA.Internal.Offsets -- System import Foreign import Foreign.C #c typedef struct cudaDeviceProp cudaDeviceProp; typedef enum { cudaDeviceFlagScheduleAuto = cudaDeviceScheduleAuto, cudaDeviceFlagScheduleSpin = cudaDeviceScheduleSpin, cudaDeviceFlagScheduleYield = cudaDeviceScheduleYield, cudaDeviceFlagBlockingSync = cudaDeviceBlockingSync, cudaDeviceFlagMapHost = cudaDeviceMapHost } cudaDeviceFlags; #endc -------------------------------------------------------------------------------- -- Data Types -------------------------------------------------------------------------------- {# pointer *cudaDeviceProp as ^ foreign -> DeviceProperties nocode #} -- | -- The compute mode the device is currently in -- {# enum cudaComputeMode as ComputeMode { } with prefix="cudaComputeMode" deriving (Eq, Show) #} -- | -- Device execution flags -- {# enum cudaDeviceFlags as DeviceFlag { } with prefix="cudaDeviceFlag" deriving (Eq, Show) #} -- | -- The properties of a compute device -- data DeviceProperties = DeviceProperties { deviceName :: String, -- ^ Identifier computeCapability :: Double, -- ^ Supported compute capability totalGlobalMem :: Int64, -- ^ Available global memory on the device in bytes totalConstMem :: Int64, -- ^ Available constant memory on the device in bytes sharedMemPerBlock :: Int64, -- ^ Available shared memory per block in bytes regsPerBlock :: Int, -- ^ 32-bit registers per block warpSize :: Int, -- ^ Warp size in threads maxThreadsPerBlock :: Int, -- ^ Max number of threads per block maxThreadsDim :: (Int,Int,Int), -- ^ Max size of each dimension of a block maxGridSize :: (Int,Int,Int), -- ^ Max size of each dimension of a grid clockRate :: Int, -- ^ Clock frequency in kilohertz multiProcessorCount :: Int, -- ^ Number of multiprocessors on the device memPitch :: Int64, -- ^ Max pitch in bytes allowed by memory copies textureAlignment :: Int64, -- ^ Alignment requirement for textures computeMode :: ComputeMode, deviceOverlap :: Bool, -- ^ Device can concurrently copy memory and execute a kernel kernelExecTimeoutEnabled :: Bool, -- ^ Whether there is a runtime limit on kernels integrated :: Bool, -- ^ As opposed to discrete canMapHostMemory :: Bool -- ^ Device can use pinned memory } deriving (Show) instance Storable DeviceProperties where sizeOf _ = {#sizeof cudaDeviceProp#} alignment _ = alignment (undefined :: Ptr ()) peek p = do gm <- cIntConv `fmap` {#get cudaDeviceProp.totalGlobalMem#} p sm <- cIntConv `fmap` {#get cudaDeviceProp.sharedMemPerBlock#} p rb <- cIntConv `fmap` {#get cudaDeviceProp.regsPerBlock#} p ws <- cIntConv `fmap` {#get cudaDeviceProp.warpSize#} p mp <- cIntConv `fmap` {#get cudaDeviceProp.memPitch#} p tb <- cIntConv `fmap` {#get cudaDeviceProp.maxThreadsPerBlock#} p cl <- cIntConv `fmap` {#get cudaDeviceProp.clockRate#} p cm <- cIntConv `fmap` {#get cudaDeviceProp.totalConstMem#} p v1 <- fromIntegral `fmap` {#get cudaDeviceProp.major#} p v2 <- fromIntegral `fmap` {#get cudaDeviceProp.minor#} p ta <- cIntConv `fmap` {#get cudaDeviceProp.textureAlignment#} p ov <- cToBool `fmap` {#get cudaDeviceProp.deviceOverlap#} p pc <- cIntConv `fmap` {#get cudaDeviceProp.multiProcessorCount#} p ke <- cToBool `fmap` {#get cudaDeviceProp.kernelExecTimeoutEnabled#} p tg <- cToBool `fmap` {#get cudaDeviceProp.integrated#} p hm <- cToBool `fmap` {#get cudaDeviceProp.canMapHostMemory#} p md <- cToEnum `fmap` {#get cudaDeviceProp.computeMode#} p -- -- C->Haskell returns the wrong type when accessing static arrays in -- structs, returning the dereferenced element but with a Ptr type. Work -- around this with manual pointer arithmetic... -- n <- peekCString (p `plusPtr` devNameOffset) (t1:t2:t3:_) <- map cIntConv `fmap` peekArray 3 (p `plusPtr` devMaxThreadDimOffset :: Ptr CInt) (g1:g2:g3:_) <- map cIntConv `fmap` peekArray 3 (p `plusPtr` devMaxGridSizeOffset :: Ptr CInt) return DeviceProperties { deviceName = n, computeCapability = v1 + v2 / max 10 (10^ ((ceiling . logBase 10) v2 :: Int)), totalGlobalMem = gm, totalConstMem = cm, sharedMemPerBlock = sm, regsPerBlock = rb, warpSize = ws, maxThreadsPerBlock = tb, maxThreadsDim = (t1,t2,t3), maxGridSize = (g1,g2,g3), clockRate = cl, multiProcessorCount = pc, memPitch = mp, textureAlignment = ta, computeMode = md, deviceOverlap = ov, kernelExecTimeoutEnabled = ke, integrated = tg, canMapHostMemory = hm } -------------------------------------------------------------------------------- -- Functions -------------------------------------------------------------------------------- -- | -- Select the compute device which best matches the given criteria -- choose :: DeviceProperties -> IO Int choose dev = resultIfOk =<< cudaChooseDevice dev {# fun unsafe cudaChooseDevice { alloca- `Int' peekIntConv* , withDevProp* `DeviceProperties' } -> `Status' cToEnum #} where withDevProp = with -- | -- Returns which device is currently being used -- get :: IO Int get = resultIfOk =<< cudaGetDevice {# fun unsafe cudaGetDevice { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} -- | -- Returns the number of devices available for execution, with compute -- capability >= 1.0 -- count :: IO Int count = resultIfOk =<< cudaGetDeviceCount {# fun unsafe cudaGetDeviceCount { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} -- | -- Return information about the selected compute device -- props :: Int -> IO DeviceProperties props n = resultIfOk =<< cudaGetDeviceProperties n {# fun unsafe cudaGetDeviceProperties { alloca- `DeviceProperties' peek* , `Int' } -> `Status' cToEnum #} -- | -- Set device to be used for GPU execution -- set :: Int -> IO () set n = nothingIfOk =<< cudaSetDevice n {# fun unsafe cudaSetDevice { `Int' } -> `Status' cToEnum #} -- | -- Set flags to be used for device executions -- setFlags :: [DeviceFlag] -> IO () setFlags f = nothingIfOk =<< cudaSetDeviceFlags (combineBitMasks f) {# fun unsafe cudaSetDeviceFlags { `Int' } -> `Status' cToEnum #} -- | -- Set list of devices for CUDA execution in priority order -- setOrder :: [Int] -> IO () setOrder l = nothingIfOk =<< cudaSetValidDevices l (length l) {# fun unsafe cudaSetValidDevices { withArrayIntConv* `[Int]' , `Int' } -> `Status' cToEnum #} where withArrayIntConv = withArray . map cIntConv