{-# LANGUAGE CPP, ForeignFunctionInterface, EmptyDataDecls #-} -------------------------------------------------------------------------------- -- | -- Module : Foreign.CUDA.Runtime.Device -- Copyright : (c) [2009..2011] Trevor L. McDonell -- License : BSD -- -- Device management routines -- -------------------------------------------------------------------------------- module Foreign.CUDA.Runtime.Device ( -- * Device Management ComputeMode(..), Device, DeviceFlag(..), DeviceProperties(..), choose, get, count, props, set, setFlags, setOrder, reset, sync, -- * Peer Access PeerFlag, accessible, add, remove, -- * Cache Configuration Limit(..), getLimit, setLimit ) where #include {# context lib="cudart" #} -- Friends import Foreign.CUDA.Analysis.Device import Foreign.CUDA.Runtime.Error import Foreign.CUDA.Internal.C2HS import Foreign.CUDA.Internal.Offsets -- System import Foreign import Foreign.C #c typedef struct cudaDeviceProp cudaDeviceProp; typedef enum { cudaDeviceFlagScheduleAuto = cudaDeviceScheduleAuto, cudaDeviceFlagScheduleSpin = cudaDeviceScheduleSpin, cudaDeviceFlagScheduleYield = cudaDeviceScheduleYield, cudaDeviceFlagBlockingSync = cudaDeviceBlockingSync, cudaDeviceFlagMapHost = cudaDeviceMapHost, #if CUDART_VERSION >= 3000 cudaDeviceFlagLMemResizeToMax = cudaDeviceLmemResizeToMax #endif } cudaDeviceFlags; #endc -------------------------------------------------------------------------------- -- Data Types -------------------------------------------------------------------------------- -- | -- A device identifier -- type Device = Int {# pointer *cudaDeviceProp as ^ foreign -> DeviceProperties nocode #} -- | -- Device execution flags -- {# enum cudaDeviceFlags as DeviceFlag { } with prefix="cudaDeviceFlag" deriving (Eq, Show) #} instance Storable DeviceProperties where sizeOf _ = {#sizeof cudaDeviceProp#} alignment _ = alignment (undefined :: Ptr ()) poke _ _ = error "no instance for Foreign.Storable.poke DeviceProperties" peek p = do gm <- cIntConv `fmap` {#get cudaDeviceProp.totalGlobalMem#} p sm <- cIntConv `fmap` {#get cudaDeviceProp.sharedMemPerBlock#} p rb <- cIntConv `fmap` {#get cudaDeviceProp.regsPerBlock#} p ws <- cIntConv `fmap` {#get cudaDeviceProp.warpSize#} p mp <- cIntConv `fmap` {#get cudaDeviceProp.memPitch#} p tb <- cIntConv `fmap` {#get cudaDeviceProp.maxThreadsPerBlock#} p cl <- cIntConv `fmap` {#get cudaDeviceProp.clockRate#} p cm <- cIntConv `fmap` {#get cudaDeviceProp.totalConstMem#} p v1 <- fromIntegral `fmap` {#get cudaDeviceProp.major#} p v2 <- fromIntegral `fmap` {#get cudaDeviceProp.minor#} p ta <- cIntConv `fmap` {#get cudaDeviceProp.textureAlignment#} p ov <- cToBool `fmap` {#get cudaDeviceProp.deviceOverlap#} p pc <- cIntConv `fmap` {#get cudaDeviceProp.multiProcessorCount#} p ke <- cToBool `fmap` {#get cudaDeviceProp.kernelExecTimeoutEnabled#} p tg <- cToBool `fmap` {#get cudaDeviceProp.integrated#} p hm <- cToBool `fmap` {#get cudaDeviceProp.canMapHostMemory#} p md <- cToEnum `fmap` {#get cudaDeviceProp.computeMode#} p #if CUDART_VERSION >= 3000 ck <- cToBool `fmap` {#get cudaDeviceProp.concurrentKernels#} p u1 <- cIntConv `fmap` {#get cudaDeviceProp.maxTexture1D#} p #endif #if CUDART_VERSION >= 3010 ee <- cToBool `fmap` {#get cudaDeviceProp.ECCEnabled#} p #endif #if CUDART_VERSION >= 4000 ae <- cIntConv `fmap` {#get cudaDeviceProp.asyncEngineCount#} p l2 <- cIntConv `fmap` {#get cudaDeviceProp.l2CacheSize#} p tm <- cIntConv `fmap` {#get cudaDeviceProp.maxThreadsPerMultiProcessor#} p mw <- cIntConv `fmap` {#get cudaDeviceProp.memoryBusWidth#} p mc <- cIntConv `fmap` {#get cudaDeviceProp.memoryClockRate#} p pb <- cIntConv `fmap` {#get cudaDeviceProp.pciBusID#} p pd <- cIntConv `fmap` {#get cudaDeviceProp.pciDeviceID#} p pm <- cIntConv `fmap` {#get cudaDeviceProp.pciDomainID#} p tc <- cToBool `fmap` {#get cudaDeviceProp.tccDriver#} p ua <- cToBool `fmap` {#get cudaDeviceProp.unifiedAddressing#} p #endif -- -- C->Haskell returns the wrong type when accessing static arrays in -- structs, returning the dereferenced element but with a Ptr type. Work -- around this with manual pointer arithmetic... -- n <- peekCString (p `plusPtr` devNameOffset) (t1:t2:t3:_) <- map cIntConv `fmap` peekArray 3 (p `plusPtr` devMaxThreadDimOffset :: Ptr CInt) (g1:g2:g3:_) <- map cIntConv `fmap` peekArray 3 (p `plusPtr` devMaxGridSizeOffset :: Ptr CInt) #if CUDART_VERSION >= 3000 (u21:u22:_) <- map cIntConv `fmap` peekArray 2 (p `plusPtr` devMaxTexture2DOffset :: Ptr CInt) (u31:u32:u33:_) <- map cIntConv `fmap` peekArray 3 (p `plusPtr` devMaxTexture3DOffset :: Ptr CInt) #endif let cap a 0 = a cap a b = a + b / max 10 (10^ ((ceiling . logBase 10) b :: Int)) return DeviceProperties { deviceName = n, computeCapability = cap v1 v2, totalGlobalMem = gm, totalConstMem = cm, sharedMemPerBlock = sm, regsPerBlock = rb, warpSize = ws, maxThreadsPerBlock = tb, maxBlockSize = (t1,t2,t3), maxGridSize = (g1,g2,g3), clockRate = cl, multiProcessorCount = pc, memPitch = mp, textureAlignment = ta, computeMode = md, deviceOverlap = ov, #if CUDART_VERSION >= 3000 concurrentKernels = ck, maxTextureDim1D = u1, maxTextureDim2D = (u21,u22), maxTextureDim3D = (u31,u32,u33), #endif #if CUDART_VERSION >= 3010 eccEnabled = ee, #endif #if CUDART_VERSION >= 3000 && CUDART_VERSION < 3010 -- not visible from runtime API < 3.1 eccEnabled = False, #endif #if CUDART_VERSION >= 4000 asyncEngineCount = ae, cacheMemL2 = l2, maxThreadsPerMultiProcessor = tm, memBusWidth = mw, memClockRate = mc, tccDriverEnabled = tc, unifiedAddressing = ua, pciInfo = PCI pb pd pm, #endif kernelExecTimeoutEnabled = ke, integrated = tg, canMapHostMemory = hm } -------------------------------------------------------------------------------- -- Device Management -------------------------------------------------------------------------------- -- | -- Select the compute device which best matches the given criteria -- choose :: DeviceProperties -> IO Device choose dev = resultIfOk =<< cudaChooseDevice dev {# fun unsafe cudaChooseDevice { alloca- `Int' peekIntConv* , withDevProp* `DeviceProperties' } -> `Status' cToEnum #} where withDevProp = with -- | -- Returns which device is currently being used -- get :: IO Device get = resultIfOk =<< cudaGetDevice {# fun unsafe cudaGetDevice { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} -- | -- Returns the number of devices available for execution, with compute -- capability >= 1.0 -- count :: IO Int count = resultIfOk =<< cudaGetDeviceCount {# fun unsafe cudaGetDeviceCount { alloca- `Int' peekIntConv* } -> `Status' cToEnum #} -- | -- Return information about the selected compute device -- props :: Device -> IO DeviceProperties props n = resultIfOk =<< cudaGetDeviceProperties n {# fun unsafe cudaGetDeviceProperties { alloca- `DeviceProperties' peek* , `Int' } -> `Status' cToEnum #} -- | -- Set device to be used for GPU execution -- set :: Device -> IO () set n = nothingIfOk =<< cudaSetDevice n {# fun unsafe cudaSetDevice { `Int' } -> `Status' cToEnum #} -- | -- Set flags to be used for device executions -- setFlags :: [DeviceFlag] -> IO () setFlags f = nothingIfOk =<< cudaSetDeviceFlags (combineBitMasks f) {# fun unsafe cudaSetDeviceFlags { `Int' } -> `Status' cToEnum #} -- | -- Set list of devices for CUDA execution in priority order -- setOrder :: [Device] -> IO () setOrder l = nothingIfOk =<< cudaSetValidDevices l (length l) {# fun unsafe cudaSetValidDevices { withArrayIntConv* `[Int]' , `Int' } -> `Status' cToEnum #} where withArrayIntConv = withArray . map cIntConv -- | -- Block until the device has completed all preceding requested tasks. Returns -- an error if one of the tasks fails. -- sync :: IO () #if CUDART_VERSION < 4000 sync = nothingIfOk =<< cudaThreadSynchronize {# fun unsafe cudaThreadSynchronize { } -> `Status' cToEnum #} #else sync = nothingIfOk =<< cudaDeviceSynchronize {# fun unsafe cudaDeviceSynchronize { } -> `Status' cToEnum #} #endif -- | -- Explicitly destroys and cleans up all runtime resources associated with the -- current device in the current process. Any subsequent API call will -- reinitialise the device. -- -- Note that this function will reset the device immediately. It is the caller’s -- responsibility to ensure that the device is not being accessed by any other -- host threads from the process when this function is called. -- reset :: IO () #if CUDART_VERSION >= 4000 reset = nothingIfOk =<< cudaDeviceReset {# fun unsafe cudaDeviceReset { } -> `Status' cToEnum #} #else reset = nothingIfOk =<< cudaThreadExit {# fun unsafe cudaThreadExit { } -> `Status' cToEnum #} #endif -------------------------------------------------------------------------------- -- Peer Access -------------------------------------------------------------------------------- -- | -- Possible option values for direct peer memory access -- data PeerFlag instance Enum PeerFlag where -- | -- Queries if the first device can directly access the memory of the second. If -- direct access is possible, it can then be enabled with 'add'. Requires -- cuda-4.0. -- accessible :: Device -> Device -> IO Bool #if CUDART_VERSION < 4000 accessible _ _ = requireSDK 4.0 "accessible" #else accessible dev peer = resultIfOk =<< cudaDeviceCanAccessPeer dev peer {# fun unsafe cudaDeviceCanAccessPeer { alloca- `Bool' peekBool* , cIntConv `Device' , cIntConv `Device' } -> `Status' cToEnum #} #endif -- | -- If the devices of both the current and supplied contexts support unified -- addressing, then enable allocations in the supplied context to be accessible -- by the current context. Requires cuda-4.0. -- add :: Device -> [PeerFlag] -> IO () #if CUDART_VERSION < 4000 add _ _ = requireSDK 4.0 "add" #else add dev flags = nothingIfOk =<< cudaDeviceEnablePeerAccess dev flags {# fun unsafe cudaDeviceEnablePeerAccess { cIntConv `Device' , combineBitMasks `[PeerFlag]' } -> `Status' cToEnum #} #endif -- | -- Disable direct memory access from the current context to the supplied -- context. Requires cuda-4.0. -- remove :: Device -> IO () #if CUDART_VERSION < 4000 remove _ = requireSDK 4.0 "remove" #else remove dev = nothingIfOk =<< cudaDeviceDisablePeerAccess dev {# fun unsafe cudaDeviceDisablePeerAccess { cIntConv `Device' } -> `Status' cToEnum #} #endif -------------------------------------------------------------------------------- -- Cache Configuration -------------------------------------------------------------------------------- -- | -- Device limit flags -- #if CUDART_VERSION < 3010 data Limit #else {# enum cudaLimit as Limit { underscoreToCase } with prefix="cudaLimit" deriving (Eq, Show) #} #endif -- | -- Query compute 2.0 call stack limits. Requires cuda-3.1. -- getLimit :: Limit -> IO Int #if CUDART_VERSION < 3010 getLimit _ = requireSDK 3.1 "getLimit" #elif CUDART_VERSION < 4000 getLimit l = resultIfOk =<< cudaThreadGetLimit l {# fun unsafe cudaThreadGetLimit { alloca- `Int' peekIntConv* , cFromEnum `Limit' } -> `Status' cToEnum #} #else getLimit l = resultIfOk =<< cudaDeviceGetLimit l {# fun unsafe cudaDeviceGetLimit { alloca- `Int' peekIntConv* , cFromEnum `Limit' } -> `Status' cToEnum #} #endif -- | -- Set compute 2.0 call stack limits. Requires cuda-3.1. -- setLimit :: Limit -> Int -> IO () #if CUDART_VERSION < 3010 setLimit _ _ = requireSDK 3.1 "setLimit" #elif CUDART_VERSION < 4000 setLimit l n = nothingIfOk =<< cudaThreadSetLimit l n {# fun unsafe cudaThreadSetLimit { cFromEnum `Limit' , cIntConv `Int' } -> `Status' cToEnum #} #else setLimit l n = nothingIfOk =<< cudaDeviceSetLimit l n {# fun unsafe cudaDeviceSetLimit { cFromEnum `Limit' , cIntConv `Int' } -> `Status' cToEnum #} #endif