-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | FFI binding to the CUDA interface for programming NVIDIA GPUs -- -- The CUDA library provides a direct, general purpose C-like SPMD -- programming model for NVIDIA graphics cards (G8x series onwards). This -- is a collection of bindings to allow you to call and control, although -- not write, such functions from Haskell-land. You will need to install -- the CUDA driver and developer toolkit. -- -- http://developer.nvidia.com/cuda-downloads -- -- The setup script will look for your CUDA installation by checking, in -- the following order: -- --
-- p2 == p1 `plusDevPtr` (p2 `minusDevPtr` p1) -- minusDevPtr :: DevicePtr a -> DevicePtr a -> Int -- | Advance a pointer into a device array by the given number of elements advanceDevPtr :: Storable a => DevicePtr a -> Int -> DevicePtr a -- | A reference to page-locked host memory. -- -- A HostPtr is just a plain Ptr, but the memory has been -- allocated by CUDA into page-locked memory. This means that the data -- can be copied to the GPU via DMA (direct memory access). Note that the -- use of the system function mlock is not sufficient here --- -- the CUDA version ensures that the physical address stays the -- same, not just the virtual address. -- -- To copy data into a HostPtr array, you may use for example -- withHostPtr together with copyArray or moveArray. newtype HostPtr a HostPtr :: Ptr a -> HostPtr a [useHostPtr] :: HostPtr a -> Ptr a -- | Apply an IO action to the memory reference living inside the host -- pointer object. All uses of the pointer should be inside the -- withHostPtr bracket. withHostPtr :: HostPtr a -> (Ptr a -> IO b) -> IO b -- | The constant nullHostPtr contains the distinguished memory -- location that is not associated with a valid memory location nullHostPtr :: HostPtr a -- | Cast a host pointer from one type to another castHostPtr :: HostPtr a -> HostPtr b -- | Advance the pointer address by the given offset in bytes plusHostPtr :: HostPtr a -> Int -> HostPtr a -- | Given an alignment constraint, align the host pointer to the next -- highest address satisfying the constraint alignHostPtr :: HostPtr a -> Int -> HostPtr a -- | Compute the difference between the second and first argument minusHostPtr :: HostPtr a -> HostPtr a -> Int -- | Advance a pointer into a host array by a given number of elements advanceHostPtr :: Storable a => HostPtr a -> Int -> HostPtr a instance GHC.Classes.Ord (Foreign.CUDA.Ptr.DevicePtr a) instance GHC.Classes.Eq (Foreign.CUDA.Ptr.DevicePtr a) instance GHC.Classes.Ord (Foreign.CUDA.Ptr.HostPtr a) instance GHC.Classes.Eq (Foreign.CUDA.Ptr.HostPtr a) instance GHC.Show.Show (Foreign.CUDA.Ptr.HostPtr a) instance Foreign.Storable.Storable (Foreign.CUDA.Ptr.HostPtr a) instance GHC.Show.Show (Foreign.CUDA.Ptr.DevicePtr a) instance Foreign.Storable.Storable (Foreign.CUDA.Ptr.DevicePtr a) -- | Error handling functions module Foreign.CUDA.Runtime.Error -- | Return codes from API functions data Status Success :: Status InvalidValue :: Status MemoryAllocation :: Status InitializationError :: Status CudartUnloading :: Status ProfilerDisabled :: Status ProfilerNotInitialized :: Status ProfilerAlreadyStarted :: Status ProfilerAlreadyStopped :: Status InvalidConfiguration :: Status InvalidPitchValue :: Status InvalidSymbol :: Status InvalidHostPointer :: Status InvalidDevicePointer :: Status InvalidTexture :: Status InvalidTextureBinding :: Status InvalidChannelDescriptor :: Status InvalidMemcpyDirection :: Status AddressOfConstant :: Status TextureFetchFailed :: Status TextureNotBound :: Status SynchronizationError :: Status InvalidFilterSetting :: Status InvalidNormSetting :: Status MixedDeviceExecution :: Status NotYetImplemented :: Status MemoryValueTooLarge :: Status InsufficientDriver :: Status InvalidSurface :: Status DuplicateVariableName :: Status DuplicateTextureName :: Status DuplicateSurfaceName :: Status DevicesUnavailable :: Status IncompatibleDriverContext :: Status MissingConfiguration :: Status PriorLaunchFailure :: Status LaunchMaxDepthExceeded :: Status LaunchFileScopedTex :: Status LaunchFileScopedSurf :: Status
SyncDepthExceeded :: Status LaunchPendingCountExceeded :: Status InvalidDeviceFunction :: Status NoDevice :: Status InvalidDevice :: Status StartupFailure :: Status InvalidKernelImage :: Status DeviceUninitialized :: Status MapBufferObjectFailed :: Status UnmapBufferObjectFailed :: Status ArrayIsMapped :: Status AlreadyMapped :: Status NoKernelImageForDevice :: Status AlreadyAcquired :: Status NotMapped :: Status NotMappedAsArray :: Status NotMappedAsPointer :: Status ECCUncorrectable :: Status UnsupportedLimit :: Status DeviceAlreadyInUse :: Status PeerAccessUnsupported :: Status InvalidPtx :: Status InvalidGraphicsContext :: Status NvlinkUncorrectable :: Status JitCompilerNotFound :: Status InvalidSource :: Status FileNotFound :: Status SharedObjectSymbolNotFound :: Status SharedObjectInitFailed :: Status OperatingSystem :: Status InvalidResourceHandle :: Status IllegalState :: Status SymbolNotFound :: Status NotReady :: Status IllegalAddress :: Status LaunchOutOfResources :: Status LaunchTimeout :: Status LaunchIncompatibleTexturing :: Status PeerAccessAlreadyEnabled :: Status PeerAccessNotEnabled :: Status SetOnActiveProcess :: Status ContextIsDestroyed :: Status Assert :: Status TooManyPeers :: Status HostMemoryAlreadyRegistered :: Status HostMemoryNotRegistered :: Status HardwareStackError :: Status IllegalInstruction :: Status MisalignedAddress :: Status InvalidAddressSpace :: Status InvalidPc :: Status LaunchFailure :: Status CooperativeLaunchTooLarge :: Status NotPermitted :: Status NotSupported :: Status SystemNotReady :: Status SystemDriverMismatch :: Status CompatNotSupportedOnDevice :: Status StreamCaptureUnsupported :: Status StreamCaptureInvalidated :: Status StreamCaptureMerge :: Status StreamCaptureUnmatched :: Status StreamCaptureUnjoined :: Status StreamCaptureIsolation :: Status StreamCaptureImplicit :: Status CapturedEvent :: Status StreamCaptureWrongThread :: Status Timeout :: Status GraphExecUpdateFailure :: Status Unknown :: Status ApiFailureBase :: Status data CUDAException ExitCode :: Status -> CUDAException UserError :: String -> CUDAException -- | Raise a CUDAException in the IO Monad cudaError :: String -> IO a describe :: Describe a => a -> String -- | A specially formatted error message requireSDK :: Name -> Double -> IO a -- | Return the results of a function on successful execution, otherwise -- throw an exception with the error string associated with the return -- code resultIfOk :: (Status, a) -> IO a -- | Throw an exception with the error string associated with an -- unsuccessful return code, otherwise return unit. nothingIfOk :: Status -> IO () checkStatus :: CInt -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Error.Status instance GHC.Classes.Eq Foreign.CUDA.Runtime.Error.Status instance GHC.Exception.Type.Exception Foreign.CUDA.Runtime.Error.CUDAException instance GHC.Show.Show Foreign.CUDA.Runtime.Error.CUDAException instance GHC.Enum.Enum Foreign.CUDA.Runtime.Error.Status instance Text.Show.Describe.Describe Foreign.CUDA.Runtime.Error.Status -- | Utility functions module Foreign.CUDA.Runtime.Utils -- | Return the version number of the installed CUDA runtime runtimeVersion :: IO Int -- | Return the version number of the installed CUDA driver driverVersion :: IO Int -- | Return the version number of the CUDA library (API) that this package -- was compiled against.
libraryVersion :: Int -- | Texture references module Foreign.CUDA.Runtime.Texture data Texture Texture :: !Bool -> !FilterMode -> !(AddressMode, AddressMode, AddressMode) -> !FormatDesc -> Texture -- | access texture using normalised coordinates [0.0,1.0) [normalised] :: Texture -> !Bool [filtering] :: Texture -> !FilterMode [addressing] :: Texture -> !(AddressMode, AddressMode, AddressMode) [format] :: Texture -> !FormatDesc -- | Texture channel format kind data FormatKind Signed :: FormatKind Unsigned :: FormatKind Float :: FormatKind None :: FormatKind data AddressMode Wrap :: AddressMode Clamp :: AddressMode Mirror :: AddressMode Border :: AddressMode data FilterMode Point :: FilterMode Linear :: FilterMode -- | A description of how memory read through the texture cache should be -- interpreted, including the kind of data and the number of bits of each -- component (x,y,z and w, respectively). data FormatDesc FormatDesc :: !(Int, Int, Int, Int) -> !FormatKind -> FormatDesc [depth] :: FormatDesc -> !(Int, Int, Int, Int) [kind] :: FormatDesc -> !FormatKind -- | Bind the memory area associated with the device pointer to a texture -- reference given by the named symbol. Any previously bound references -- are unbound. bind :: String -> Texture -> DevicePtr a -> Int64 -> IO () -- | Bind the two-dimensional memory area to the texture reference -- associated with the given symbol. The size of the area is constrained -- by (width,height) in texel units, and the row pitch in bytes. Any -- previously bound references are unbound. bind2D :: String -> Texture -> DevicePtr a -> (Int, Int) -> Int64 -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FormatKind instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FormatKind instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FilterMode instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FilterMode instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FormatDesc instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FormatDesc instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.Texture instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.Texture instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Texture.Texture instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Texture.FormatDesc instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.FilterMode instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.FormatKind -- | Error handling module Foreign.CUDA.Driver.Error data Status Success :: Status InvalidValue :: Status OutOfMemory :: Status NotInitialized :: Status Deinitialized :: Status ProfilerDisabled :: Status ProfilerNotInitialized :: Status ProfilerAlreadyStarted :: Status ProfilerAlreadyStopped :: Status NoDevice :: Status InvalidDevice :: Status InvalidImage :: Status InvalidContext :: Status ContextAlreadyCurrent :: Status MapFailed :: Status UnmapFailed :: Status ArrayIsMapped :: Status AlreadyMapped :: Status NoBinaryForGPU :: Status AlreadyAcquired :: Status NotMapped :: Status NotMappedAsArray :: Status NotMappedAsPointer :: Status EccUncorrectable :: Status UnsupportedLimit :: Status ContextAlreadyInUse :: Status PeerAccessUnsupported :: Status InvalidPTX :: Status InvalidGraphicsContext :: Status NvlinkUncorrectable :: Status JitCompilerNotFound :: Status InvalidSource :: Status FileNotFound :: Status SharedObjectSymbolNotFound :: Status 
SharedObjectInitFailed :: Status OperatingSystem :: Status InvalidHandle :: Status IllegalState :: Status NotFound :: Status NotReady :: Status IllegalAddress :: Status LaunchOutOfResources :: Status LaunchTimeout :: Status LaunchIncompatibleTexturing :: Status PeerAccessAlreadyEnabled :: Status PeerAccessNotEnabled :: Status PrimaryContextActive :: Status ContextIsDestroyed :: Status Assert :: Status TooManyPeers :: Status HostMemoryAlreadyRegistered :: Status HostMemoryNotRegistered :: Status HardwareStackError :: Status IllegalInstruction :: Status MisalignedAddress :: Status InvalidAddressSpace :: Status InvalidPC :: Status LaunchFailed :: Status CooperativeLaunchTooLarge :: Status NotPermitted :: Status NotSupported :: Status SystemNotReady :: Status SystemDriverMismatch :: Status CompatNotSupportedOnDevice :: Status StreamCaptureUnsupported :: Status StreamCaptureInvalidated :: Status StreamCaptureMerge :: Status StreamCaptureUnmatched :: Status StreamCaptureUnjoined :: Status StreamCaptureIsolation :: Status StreamCaptureImplicit :: Status CapturedEvent :: Status StreamCaptureWrongThread :: Status Timeout :: Status GraphExecUpdateFailure :: Status Unknown :: Status data CUDAException ExitCode :: Status -> CUDAException UserError :: String -> CUDAException describe :: Describe a => a -> String -- | Raise a CUDAException. Exceptions can be thrown from pure code, but -- can only be caught in the IO monad. cudaError :: String -> a -- | Raise a CUDAException in the IO Monad cudaErrorIO :: String -> IO a -- | A specially formatted error message requireSDK :: Name -> Double -> a -- | Return the results of a function on successful execution, otherwise -- throw an exception with an error string associated with the return -- code resultIfOk :: (Status, a) -> IO a -- | Throw an exception with an error string associated with an -- unsuccessful return code, otherwise return unit. nothingIfOk :: Status -> IO () checkStatus :: CInt -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Error.Status instance GHC.Classes.Eq Foreign.CUDA.Driver.Error.Status instance Text.Show.Describe.Describe Foreign.CUDA.Driver.Error.Status instance GHC.Exception.Type.Exception Foreign.CUDA.Driver.Error.CUDAException instance GHC.Show.Show Foreign.CUDA.Driver.Error.CUDAException instance GHC.Enum.Enum Foreign.CUDA.Driver.Error.Status -- | Utility functions module Foreign.CUDA.Driver.Utils -- | Return the version number of the installed CUDA driver. driverVersion :: IO Int -- | Return the version number of the CUDA library (API) that this package -- was compiled against. libraryVersion :: Int -- | Profiler control for low-level driver interface module Foreign.CUDA.Driver.Profiler -- | Profiler output mode data OutputMode KeyValuePair :: OutputMode CSV :: OutputMode -- | Initialise the CUDA profiler. -- -- The configuration file is used to specify profiling options and -- profiling counters. Refer to the "Compute Command Line Profiler User -- Guide" for supported profiler options and counters. -- -- Note that the CUDA profiler can not be initialised with this function -- if another profiling tool is already active. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PROFILER.html#group__CUDA__PROFILER initialise :: FilePath -> FilePath -> OutputMode -> IO () -- | Begin profiling collection by the active profiling tool for the -- current context. If profiling is already enabled, then this has no -- effect. 
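--
-- For example, a minimal sketch of restricting collection to a single
-- region of interest; the action argument is a placeholder for whatever
-- driver calls are to be measured:
--
-- > profiledRegion :: IO a -> IO a
-- > profiledRegion action = do
-- >   start                 -- begin collection for the current context
-- >   r <- action           -- only work submitted here is profiled
-- >   stop                  -- flush pending events to the output file
-- >   return r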
-- -- start and stop can be used to programmatically control -- profiling granularity, by allowing profiling to be done only on -- selected pieces of code. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PROFILER.html#group__CUDA__PROFILER_1g8a5314de2292c2efac83ac7fcfa9190e start :: IO () -- | Stop profiling collection by the active profiling tool for the current -- context, and force all pending profiler events to be written to the -- output file. If profiling is already inactive, this has no effect. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PROFILER.html#group__CUDA__PROFILER_1g4d8edef6174fd90165e6ac838f320a5f stop :: IO () instance GHC.Show.Show Foreign.CUDA.Driver.Profiler.OutputMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Profiler.OutputMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Profiler.OutputMode -- | Common device functions module Foreign.CUDA.Analysis.Device -- | GPU compute capability, major and minor revision number respectively. data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode Default :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Int -> !Bool -> !Int -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Maximum number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Maximum number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Maximum size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Maximum pitch in bytes allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] ::
DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines [asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete [integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool -- | Device supports stream priorities [streamPriorities] :: DeviceProperties -> !Bool -- | Device supports caching globals in L1 cache [globalL1Cache] :: DeviceProperties -> !Bool -- | Device supports caching locals in L1 cache [localL1Cache] :: DeviceProperties -> !Bool -- | Device supports allocating managed memory on this system [managedMemory] :: DeviceProperties -> !Bool -- | Device is on a multi-GPU board [multiGPUBoard] :: DeviceProperties -> !Bool -- | Unique identifier for a group of devices associated with the same -- board [multiGPUBoardGroupID] :: DeviceProperties -> !Int -- | Device supports compute pre-emption [preemption] :: DeviceProperties -> !Bool -- | Ratio of single precision performance (in floating-point operations -- per second) to double precision performance [singleToDoublePerfRatio] :: DeviceProperties -> !Int -- | Device supports launching cooperative kernels [cooperativeLaunch] :: DeviceProperties -> !Bool -- | Device can participate in cooperative multi-device kernels [cooperativeLaunchMultiDevice] :: DeviceProperties -> !Bool data DeviceResources DeviceResources :: !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Allocation -> !Int -> !Int -> !Int -> !Int -> !Int -> DeviceResources -- | Warp size [threadsPerWarp] :: DeviceResources -> !Int -- | Number of SIMD arithmetic units per multiprocessor [coresPerMP] :: DeviceResources -> !Int -- | Maximum number of in-flight warps per multiprocessor [warpsPerMP] :: DeviceResources -> !Int -- | Maximum number of in-flight threads on a multiprocessor [threadsPerMP] :: DeviceResources -> !Int -- | Maximum number of thread blocks resident on a multiprocessor [threadBlocksPerMP] :: DeviceResources -> !Int -- | Total amount of shared memory per multiprocessor (bytes) [sharedMemPerMP] :: DeviceResources -> !Int -- | Maximum amount of shared memory per thread block (bytes) [maxSharedMemPerBlock] :: DeviceResources -> !Int -- | Total number of registers in a multiprocessor [regFileSizePerMP] :: DeviceResources -> !Int -- | Maximum number of registers per block [maxRegPerBlock] :: DeviceResources -> !Int -- | Register allocation unit size [regAllocUnit] :: DeviceResources -> !Int -- | How multiprocessor resources are divided (register allocation -- granularity) [regAllocationStyle] :: DeviceResources -> !Allocation -- | Maximum number of registers per thread [maxRegPerThread] :: DeviceResources -> !Int -- | Shared memory allocation unit size (bytes) [sharedMemAllocUnit] :: DeviceResources -> !Int -- | Warp allocation granularity [warpAllocUnit] :: DeviceResources -> !Int 
-- | Warp register allocation granularity [warpRegAllocUnit] :: DeviceResources -> !Int -- | Maximum number of resident grids per device (concurrent kernels) [maxGridsPerDevice] :: DeviceResources -> !Int data Allocation Warp :: Allocation Block :: Allocation data PCI PCI :: !Int -> !Int -> !Int -> PCI -- | PCI bus ID of the device [busID] :: PCI -> !Int -- | PCI device ID [deviceID] :: PCI -> !Int -- | PCI domain ID [domainID] :: PCI -> !Int -- | Extract some additional hardware resource limitations for a given -- device. deviceResources :: DeviceProperties -> DeviceResources describe :: Describe a => a -> String instance GHC.Show.Show Foreign.CUDA.Analysis.Device.ComputeMode instance GHC.Classes.Eq Foreign.CUDA.Analysis.Device.ComputeMode instance GHC.Classes.Eq Foreign.CUDA.Analysis.Device.Compute instance GHC.Show.Show Foreign.CUDA.Analysis.Device.PCI instance GHC.Show.Show Foreign.CUDA.Analysis.Device.DeviceProperties instance GHC.Show.Show Foreign.CUDA.Analysis.Device.Allocation instance GHC.Show.Show Foreign.CUDA.Analysis.Device.DeviceResources instance GHC.Show.Show Foreign.CUDA.Analysis.Device.Compute instance GHC.Classes.Ord Foreign.CUDA.Analysis.Device.Compute instance GHC.Enum.Enum Foreign.CUDA.Analysis.Device.ComputeMode instance Text.Show.Describe.Describe Foreign.CUDA.Analysis.Device.ComputeMode -- | Device management routines module Foreign.CUDA.Runtime.Device -- | A device identifier type Device = Int -- | Device execution flags data DeviceFlag ScheduleAuto :: DeviceFlag ScheduleSpin :: DeviceFlag ScheduleYield :: DeviceFlag BlockingSync :: DeviceFlag MapHost :: DeviceFlag LMemResizeToMax :: DeviceFlag -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Int -> !Bool -> !Int -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Maximum number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Maximum number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Maximum size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Maximum pitch in bytes 
allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] :: DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines [asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete [integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool -- | Device supports stream priorities [streamPriorities] :: DeviceProperties -> !Bool -- | Device supports caching globals in L1 cache [globalL1Cache] :: DeviceProperties -> !Bool -- | Device supports caching locals in L1 cache [localL1Cache] :: DeviceProperties -> !Bool -- | Device supports allocating managed memory on this system [managedMemory] :: DeviceProperties -> !Bool -- | Device is on a multi-GPU board [multiGPUBoard] :: DeviceProperties -> !Bool -- | Unique identifier for a group of devices associated with the same -- board [multiGPUBoardGroupID] :: DeviceProperties -> !Int -- | Device supports compute pre-emption [preemption] :: DeviceProperties -> !Bool -- | Ratio of single precision performance (in floating-point operations -- per second) to double precision performance [singleToDoublePerfRatio] :: DeviceProperties -> !Int -- | Device supports launching cooperative kernels [cooperativeLaunch] :: DeviceProperties -> !Bool -- | Device can participate in cooperative multi-device kernels [cooperativeLaunchMultiDevice] :: DeviceProperties -> !Bool -- | GPU compute capability, major and minor revision number respectively. data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode Default :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | Select the compute device which best matches the given criteria choose :: DeviceProperties -> IO Device -- | Returns which device is currently being used get :: IO Device -- | Returns the number of devices available for execution, with compute -- capability >= 1.0 count :: IO Int -- | Return information about the selected compute device props :: Device -> IO DeviceProperties -- | Set device to be used for GPU execution set :: Device -> IO () -- | Set flags to be used for device executions setFlags :: [DeviceFlag] -> IO () -- | Set list of devices for CUDA execution in priority order setOrder :: [Device] -> IO () -- | Explicitly destroys and cleans up all runtime resources associated -- with the current device in the current process. Any subsequent API -- call will reinitialise the device. 
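--
-- A minimal sketch of an orderly shutdown, using sync from this module
-- to drain outstanding work before the reset (whether to wait first
-- depends on the application):
--
-- > shutdown :: IO ()
-- > shutdown = sync >> reset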
-- -- Note that this function will reset the device immediately. It is the -- caller’s responsibility to ensure that the device is not being -- accessed by any other host threads from the process when this function -- is called. reset :: IO () -- | Block until the device has completed all preceding requested tasks. -- Returns an error if one of the tasks fails. sync :: IO () -- | Possible option values for direct peer memory access data PeerFlag -- | Queries if the first device can directly access the memory of the -- second. If direct access is possible, it can then be enabled with -- add. Requires cuda-4.0. accessible :: Device -> Device -> IO Bool -- | If the devices of both the current and supplied contexts support -- unified addressing, then enable allocations in the supplied context to -- be accessible by the current context. Requires cuda-4.0. add :: Device -> [PeerFlag] -> IO () -- | Disable direct memory access from the current context to the supplied -- context. Requires cuda-4.0. remove :: Device -> IO () -- | Device limit flags data Limit Stacksize :: Limit Printffifosize :: Limit Mallocheapsize :: Limit Devruntimesyncdepth :: Limit Devruntimependinglaunchcount :: Limit Maxl2fetchgranularity :: Limit -- | Query compute 2.0 call stack limits. Requires cuda-3.1. getLimit :: Limit -> IO Int -- | Set compute 2.0 call stack limits. Requires cuda-3.1. setLimit :: Limit -> Int -> IO () instance GHC.Enum.Bounded Foreign.CUDA.Runtime.Device.DeviceFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Device.DeviceFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Device.DeviceFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Device.Limit instance GHC.Classes.Eq Foreign.CUDA.Runtime.Device.Limit instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.Limit instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.PeerFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.DeviceFlag instance Foreign.Storable.Storable Foreign.CUDA.Analysis.Device.DeviceProperties -- | Module loading for low-level driver interface module Foreign.CUDA.Driver.Module.Base -- | A reference to a Module object, containing collections of device -- functions newtype Module Module :: Ptr () -> Module [useModule] :: Module -> Ptr () -- | Just-in-time compilation and linking options data JITOption -- | maximum number of registers per thread MaxRegisters :: !Int -> JITOption -- | number of threads per block to target for ThreadsPerBlock :: !Int -> JITOption -- | level of optimisation to apply (1-4, default 4) OptimisationLevel :: !Int -> JITOption -- | compilation target, otherwise determined from context Target :: !Compute -> JITOption -- | fallback strategy if matching cubin not found FallbackStrategy :: !JITFallback -> JITOption -- | generate debug info (-g) (requires cuda >= 5.5) GenerateDebugInfo :: JITOption -- | generate line number information (-lineinfo) (requires cuda >= 5.5) GenerateLineInfo :: JITOption -- | verbose log messages (requires cuda >= 5.5) Verbose :: JITOption -- | Online compilation target architecture data JITTarget Compute20 :: JITTarget Compute21 :: JITTarget Compute30 :: JITTarget Compute32 :: JITTarget Compute35 :: JITTarget Compute37 :: JITTarget Compute50 :: JITTarget Compute52 :: JITTarget Compute53 :: JITTarget Compute60 :: JITTarget Compute61 :: JITTarget Compute62 :: JITTarget Compute70 :: JITTarget Compute72 :: JITTarget Compute75 :: JITTarget -- | Results of online compilation data JITResult JITResult :: !Float -> !ByteString -> !Module -> JITResult -- | milliseconds spent compiling PTX [jitTime] :: 
JITResult -> !Float -- | information about PTX assembly [jitInfoLog] :: JITResult -> !ByteString -- | the compiled module [jitModule] :: JITResult -> !Module -- | Online compilation fallback strategy data JITFallback PreferPTX :: JITFallback PreferBinary :: JITFallback -- | Device code formats that can be used for online linking data JITInputType Cubin :: JITInputType PTX :: JITInputType Fatbinary :: JITInputType Object :: JITInputType Library :: JITInputType CuJitNumInputTypes :: JITInputType -- | Load the contents of the specified file (either a ptx or cubin file) -- to create a new module, and load that module into the current context. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g366093bd269dafd0af21f1c7d18115d3 loadFile :: FilePath -> IO Module -- | Load the contents of the given image into a new module, and load that -- module into the current context. The image is (typically) the contents -- of a cubin or PTX file. -- -- Note that the ByteString will be copied into a temporary -- staging area so that it can be passed to C. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g04ce266ce03720f479eab76136b90c0b loadData :: ByteString -> IO Module -- | As loadData, but read the image data from the given pointer. -- The image is a NULL-terminated sequence of bytes. loadDataFromPtr :: Ptr Word8 -> IO Module -- | Load the contents of the given image into a module with online -- compiler options, and load the module into the current context. The -- image is (typically) the contents of a cubin or PTX file. The actual -- attributes of the compiled kernel can be probed using -- requires. -- -- Note that the ByteString will be copied into a temporary -- staging area so that it can be passed to C. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g9e8047e9dbf725f0cd7cafd18bfd4d12 loadDataEx :: ByteString -> [JITOption] -> IO JITResult -- | As loadDataEx, but read the image data from the given pointer. -- The image is a NULL-terminated sequence of bytes. loadDataFromPtrEx :: Ptr Word8 -> [JITOption] -> IO JITResult -- | Unload a module from the current context. 
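--
-- For example, a sketch that scopes a module over an action using
-- bracket from Control.Exception, so the module is unloaded even if the
-- action throws (the file name is hypothetical):
--
-- > withModule :: FilePath -> (Module -> IO a) -> IO a
-- > withModule path = bracket (loadFile path) unload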
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b unload :: Module -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.Module instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.Module instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITResult instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITTarget instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.JITTarget instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITFallback instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.JITFallback instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITOption instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITInputType instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.JITInputType instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITOptionInternal instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.JITOptionInternal instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.Base.JITOptionInternal instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.Base.JITInputType instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.Base.JITFallback instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.Base.JITTarget -- | Module linking for low-level driver interface -- -- Since CUDA-5.5 module Foreign.CUDA.Driver.Module.Link -- | A pending JIT linker state data LinkState -- | Just-in-time compilation and linking options data JITOption -- | maximum number of registers per thread MaxRegisters :: !Int -> JITOption -- | number of threads per block to target for ThreadsPerBlock :: !Int -> JITOption -- | level of optimisation to apply (1-4, default 4) OptimisationLevel :: !Int -> JITOption -- | compilation target, otherwise determined from context Target :: !Compute -> JITOption -- | fallback strategy if matching cubin not found FallbackStrategy :: !JITFallback -> JITOption -- | generate debug info (-g) (requires cuda >= 5.5) GenerateDebugInfo :: JITOption -- | generate line number information (-lineinfo) (requires cuda >= 5.5) GenerateLineInfo :: JITOption -- | verbose log messages (requires cuda >= 5.5) Verbose :: JITOption -- | Device code formats that can be used for online linking data JITInputType Cubin :: JITInputType PTX :: JITInputType Fatbinary :: JITInputType Object :: JITInputType Library :: JITInputType CuJitNumInputTypes :: JITInputType -- | Create a pending JIT linker invocation. The returned LinkState -- should be destroyed once no longer needed. The device code -- machine size will match the calling application. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g86ca4052a2fab369cb943523908aa80d create :: [JITOption] -> IO LinkState -- | Destroy the state of a JIT linker invocation. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g01b7ae2a34047b05716969af245ce2d9 destroy :: LinkState -> IO () -- | Complete a pending linker invocation and load the current module. The -- link state will be destroyed. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g818fcd84a4150a997c0bba76fef4e716 complete :: LinkState -> IO Module -- | Add an input file to a pending linker invocation. -- -- Requires CUDA-5.5. 
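--
-- For example, a sketch of a complete link sequence; the input file
-- names are hypothetical:
--
-- > linkKernels :: IO Module
-- > linkKernels = do
-- >   ls <- create []
-- >   addFile ls "kernels.ptx" PTX []
-- >   addFile ls "support.cubin" Cubin []
-- >   complete ls             -- also destroys the link state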
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g1224c0fd48d4a683f3ce19997f200a8c addFile :: LinkState -> FilePath -> JITInputType -> [JITOption] -> IO () -- | Add an input to a pending linker invocation. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g3ebcd2ccb772ba9c120937a2d2831b77 addData :: LinkState -> ByteString -> JITInputType -> [JITOption] -> IO () -- | As addData, but read the specified number of bytes of image -- data from the given pointer. addDataFromPtr :: LinkState -> Int -> Ptr Word8 -> JITInputType -> [JITOption] -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Module.Link.LinkState -- | Device management for low-level driver interface module Foreign.CUDA.Driver.Device -- | A CUDA device newtype Device Device :: CInt -> Device [useDevice] :: Device -> CInt -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Int -> !Bool -> !Int -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Maximum number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Maximum number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Maximum size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Maximum pitch in bytes allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] :: DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines 
[asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete [integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool -- | Device supports stream priorities [streamPriorities] :: DeviceProperties -> !Bool -- | Device supports caching globals in L1 cache [globalL1Cache] :: DeviceProperties -> !Bool -- | Device supports caching locals in L1 cache [localL1Cache] :: DeviceProperties -> !Bool -- | Device supports allocating managed memory on this system [managedMemory] :: DeviceProperties -> !Bool -- | Device is on a multi-GPU board [multiGPUBoard] :: DeviceProperties -> !Bool -- | Unique identifier for a group of devices associated with the same -- board [multiGPUBoardGroupID] :: DeviceProperties -> !Int -- | Device supports compute pre-emption [preemption] :: DeviceProperties -> !Bool -- | Ratio of single precision performance (in floating-point operations -- per second) to double precision performance [singleToDoublePerfRatio] :: DeviceProperties -> !Int -- | Device supports launching cooperative kernels [cooperativeLaunch] :: DeviceProperties -> !Bool -- | Device can participate in cooperative multi-device kernels [cooperativeLaunchMultiDevice] :: DeviceProperties -> !Bool -- | Device attributes data DeviceAttribute MaxThreadsPerBlock :: DeviceAttribute MaxBlockDimX :: DeviceAttribute MaxBlockDimY :: DeviceAttribute MaxBlockDimZ :: DeviceAttribute MaxGridDimX :: DeviceAttribute MaxGridDimY :: DeviceAttribute MaxGridDimZ :: DeviceAttribute MaxSharedMemoryPerBlock :: DeviceAttribute SharedMemoryPerBlock :: DeviceAttribute TotalConstantMemory :: DeviceAttribute WarpSize :: DeviceAttribute MaxPitch :: DeviceAttribute MaxRegistersPerBlock :: DeviceAttribute RegistersPerBlock :: DeviceAttribute ClockRate :: DeviceAttribute TextureAlignment :: DeviceAttribute GpuOverlap :: DeviceAttribute MultiprocessorCount :: DeviceAttribute KernelExecTimeout :: DeviceAttribute Integrated :: DeviceAttribute CanMapHostMemory :: DeviceAttribute ComputeMode :: DeviceAttribute MaximumTexture1dWidth :: DeviceAttribute MaximumTexture2dWidth :: DeviceAttribute MaximumTexture2dHeight :: DeviceAttribute MaximumTexture3dWidth :: DeviceAttribute MaximumTexture3dHeight :: DeviceAttribute MaximumTexture3dDepth :: DeviceAttribute MaximumTexture2dLayeredWidth :: DeviceAttribute MaximumTexture2dArrayWidth :: DeviceAttribute MaximumTexture2dLayeredHeight :: DeviceAttribute MaximumTexture2dArrayHeight :: DeviceAttribute MaximumTexture2dLayeredLayers :: DeviceAttribute MaximumTexture2dArrayNumslices :: DeviceAttribute SurfaceAlignment :: DeviceAttribute ConcurrentKernels :: DeviceAttribute EccEnabled :: DeviceAttribute PciBusId :: DeviceAttribute PciDeviceId :: DeviceAttribute TccDriver :: DeviceAttribute MemoryClockRate :: DeviceAttribute GlobalMemoryBusWidth :: DeviceAttribute L2CacheSize :: DeviceAttribute MaxThreadsPerMultiprocessor :: DeviceAttribute AsyncEngineCount :: DeviceAttribute UnifiedAddressing :: DeviceAttribute MaximumTexture1dLayeredWidth :: DeviceAttribute 
MaximumTexture1dLayeredLayers :: DeviceAttribute CanTex2dGather :: DeviceAttribute MaximumTexture2dGatherWidth :: DeviceAttribute MaximumTexture2dGatherHeight :: DeviceAttribute MaximumTexture3dWidthAlternate :: DeviceAttribute MaximumTexture3dHeightAlternate :: DeviceAttribute MaximumTexture3dDepthAlternate :: DeviceAttribute PciDomainId :: DeviceAttribute TexturePitchAlignment :: DeviceAttribute MaximumTexturecubemapWidth :: DeviceAttribute MaximumTexturecubemapLayeredWidth :: DeviceAttribute MaximumTexturecubemapLayeredLayers :: DeviceAttribute MaximumSurface1dWidth :: DeviceAttribute MaximumSurface2dWidth :: DeviceAttribute MaximumSurface2dHeight :: DeviceAttribute MaximumSurface3dWidth :: DeviceAttribute MaximumSurface3dHeight :: DeviceAttribute MaximumSurface3dDepth :: DeviceAttribute MaximumSurface1dLayeredWidth :: DeviceAttribute MaximumSurface1dLayeredLayers :: DeviceAttribute MaximumSurface2dLayeredWidth :: DeviceAttribute MaximumSurface2dLayeredHeight :: DeviceAttribute MaximumSurface2dLayeredLayers :: DeviceAttribute MaximumSurfacecubemapWidth :: DeviceAttribute MaximumSurfacecubemapLayeredWidth :: DeviceAttribute MaximumSurfacecubemapLayeredLayers :: DeviceAttribute MaximumTexture1dLinearWidth :: DeviceAttribute MaximumTexture2dLinearWidth :: DeviceAttribute MaximumTexture2dLinearHeight :: DeviceAttribute MaximumTexture2dLinearPitch :: DeviceAttribute MaximumTexture2dMipmappedWidth :: DeviceAttribute MaximumTexture2dMipmappedHeight :: DeviceAttribute ComputeCapabilityMajor :: DeviceAttribute ComputeCapabilityMinor :: DeviceAttribute MaximumTexture1dMipmappedWidth :: DeviceAttribute StreamPrioritiesSupported :: DeviceAttribute GlobalL1CacheSupported :: DeviceAttribute LocalL1CacheSupported :: DeviceAttribute MaxSharedMemoryPerMultiprocessor :: DeviceAttribute MaxRegistersPerMultiprocessor :: DeviceAttribute ManagedMemory :: DeviceAttribute MultiGpuBoard :: DeviceAttribute MultiGpuBoardGroupId :: DeviceAttribute HostNativeAtomicSupported :: DeviceAttribute SingleToDoublePrecisionPerfRatio :: DeviceAttribute PageableMemoryAccess :: DeviceAttribute ConcurrentManagedAccess :: DeviceAttribute ComputePreemptionSupported :: DeviceAttribute CanUseHostPointerForRegisteredMem :: DeviceAttribute CanUseStreamMemOps :: DeviceAttribute CanUse64BitStreamMemOps :: DeviceAttribute CanUseStreamWaitValueNor :: DeviceAttribute CooperativeLaunch :: DeviceAttribute CooperativeMultiDeviceLaunch :: DeviceAttribute MaxSharedMemoryPerBlockOptin :: DeviceAttribute CanFlushRemoteWrites :: DeviceAttribute HostRegisterSupported :: DeviceAttribute PageableMemoryAccessUsesHostPageTables :: DeviceAttribute DirectManagedMemAccessFromHost :: DeviceAttribute VirtualAddressManagementSupported :: DeviceAttribute HandleTypePosixFileDescriptorSupported :: DeviceAttribute HandleTypeWin32HandleSupported :: DeviceAttribute HandleTypeWin32KmtHandleSupported :: DeviceAttribute CU_DEVICE_ATTRIBUTE_MAX :: DeviceAttribute -- | GPU compute capability, major and minor revision number respectively. data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode Default :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | Possible option flags for CUDA initialisation. Dummy instance until -- the API exports actual option values. data InitFlag -- | Initialise the CUDA driver API. This must be called before any other -- driver function. 
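--
-- For example, a typical bootstrap sequence (a sketch; it assumes at
-- least one device is present):
--
-- > main :: IO ()
-- > main = do
-- >   initialise []
-- >   n   <- count
-- >   dev <- device 0
-- >   nm  <- name dev
-- >   putStrLn (nm ++ " is one of " ++ show n ++ " CUDA devices")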
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3 initialise :: [InitFlag] -> IO () -- | Return the compute capability revision supported by the device capability :: Device -> IO Compute -- | Return a handle to the compute device at the given ordinal. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g8bdd1cc7201304b01357b8034f6587cb device :: Int -> IO Device -- | Return the selected attribute for the given device. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266 attribute :: Device -> DeviceAttribute -> IO Int -- | Return the number of devices with compute capability >= 1.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74 count :: IO Int -- | The identifying name of the device. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1gef75aa30df95446a845f2a7b9fffbb7f name :: Device -> IO String -- | Return the properties of the selected device props :: Device -> IO DeviceProperties -- | Returns a UUID for the device -- -- Requires CUDA-9.2 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g987b46b884c101ed5be414ab4d9e60e4 uuid :: Device -> IO UUID -- | The total memory available on the device (bytes). -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1gc6a0d6551335a3780f9f3c967a0fde5d totalMem :: Device -> IO Int64 instance GHC.Show.Show Foreign.CUDA.Driver.Device.Device instance GHC.Classes.Eq Foreign.CUDA.Driver.Device.Device instance GHC.Show.Show Foreign.CUDA.Driver.Device.DeviceAttribute instance GHC.Classes.Eq Foreign.CUDA.Driver.Device.DeviceAttribute instance GHC.Enum.Enum Foreign.CUDA.Driver.Device.InitFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Device.DeviceAttribute -- | Context management for the low-level driver interface module Foreign.CUDA.Driver.Context.Base -- | A device context newtype Context Context :: Ptr () -> Context [useContext] :: Context -> Ptr () -- | Context creation flags data ContextFlag SchedAuto :: ContextFlag SchedSpin :: ContextFlag SchedYield :: ContextFlag SchedBlockingSync :: ContextFlag -- | Deprecated: use SchedBlockingSync instead BlockingSync :: ContextFlag SchedMask :: ContextFlag MapHost :: ContextFlag LmemResizeToMax :: ContextFlag FlagsMask :: ContextFlag -- | Create a new CUDA context and associate it with the calling thread. -- The context is created with a usage count of one, and the caller of -- create must call destroy when done using the context. If -- a context is already current to the thread, it is supplanted by the -- newly created context and must be restored by a subsequent call to -- pop. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf create :: Device -> [ContextFlag] -> IO Context -- | Destroy the specified context, regardless of how many threads it is -- current to. The context will be popped from the current thread's -- context stack, but if it is current on any other threads it will -- remain current to those threads, and attempts to access it will result -- in an error.
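--
-- For example, a sketch that pairs create and destroy with bracket from
-- Control.Exception, so the context is destroyed even if the enclosed
-- action throws:
--
-- > withContext :: Device -> [ContextFlag] -> (Context -> IO a) -> IO a
-- > withContext dev flags = bracket (create dev flags) destroy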
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e destroy :: Context -> IO () -- | Return the device of the currently active context -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e device :: IO Device -- | Pop the current CUDA context from the CPU thread. The context may then -- be attached to a different CPU thread by calling push. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2fac188026a062d92e91a8687d0a7902 pop :: IO Context -- | Push the given context onto the CPU's thread stack of current -- contexts. The specified context becomes the CPU thread's current -- context, so all operations that operate on the current context are -- affected. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gb02d4c850eb16f861fe5a29682cc90ba push :: Context -> IO () -- | Block until the device has completed all preceding requests. If the -- context was created with the SchedBlockingSync flag, the CPU -- thread will block until the GPU has finished its work. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616 sync :: IO () -- | Return the context bound to the calling CPU thread. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g8f13165846b73750693640fb3e8380d0 get :: IO (Maybe Context) -- | Bind the specified context to the calling thread. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7 set :: Context -> IO () -- | Increments the usage count of the context. API: no context flags are -- currently supported, so this parameter must be empty. -- | Deprecated: as of CUDA-4.0 attach :: Context -> [ContextFlag] -> IO () -- | Detach the context, and destroy if no longer used -- | Deprecated: as of CUDA-4.0 detach :: Context -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Context.Base.Context instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Base.Context instance GHC.Enum.Bounded Foreign.CUDA.Driver.Context.Base.ContextFlag instance GHC.Show.Show Foreign.CUDA.Driver.Context.Base.ContextFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Base.ContextFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Base.ContextFlag -- | Stream management for low-level driver interface module Foreign.CUDA.Driver.Stream -- | A processing stream. All operations in a stream are synchronous and -- executed in sequence, but operations in different non-default streams -- may happen out-of-order or concurrently with one another. -- -- Use Events to synchronise operations between streams. newtype Stream Stream :: Ptr () -> Stream [useStream] :: Stream -> Ptr () -- | Priority of an execution stream. Work submitted to a higher priority -- stream may preempt execution of work already executing in a lower -- priority stream. Lower numbers represent higher priorities. 
type StreamPriority = Int type StreamCallback = FunPtr (Ptr () -> CInt -> Ptr () -> IO ()) -- | Execution stream creation flags data StreamFlag Default :: StreamFlag NonBlocking :: StreamFlag data StreamWriteFlag WriteValueDefault :: StreamWriteFlag WriteValueNoMemoryBarrier :: StreamWriteFlag data StreamWaitFlag WaitValueGeq :: StreamWaitFlag WaitValueEq :: StreamWaitFlag WaitValueAnd :: StreamWaitFlag WaitValueNor :: StreamWaitFlag WaitValueFlush :: StreamWaitFlag data StreamCallbackFlag -- | Create a new stream. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4 create :: [StreamFlag] -> IO Stream -- | Create a stream with the given priority. Work submitted to a -- higher-priority stream may preempt work already executing in a lower -- priority stream. -- -- The convention is that lower numbers represent higher priorities. The -- default priority is zero. The range of meaningful numeric priorities -- can be queried using getStreamPriorityRange. If the specified -- priority is outside the supported numerical range, it will -- automatically be clamped to the highest or lowest number in the range. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g95c1a8c7c3dacb13091692dd9c7f7471 createWithPriority :: StreamPriority -> [StreamFlag] -> IO Stream -- | Destroy a stream. If the device is still doing work in the stream when -- destroy is called, the function returns immediately and the -- resources associated with the stream will be released automatically -- once the device has completed all work. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758 destroy :: Stream -> IO () -- | Check if all operations in the stream have completed. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g1b0d24bbe97fa68e4bc511fb6adfeb0b finished :: Stream -> IO Bool -- | Wait until the device has completed all operations in the Stream. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad block :: Stream -> IO () -- | Add a callback to a compute stream. This function will be executed on -- the host after all currently queued items in the stream have -- completed. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483 callback :: Stream -> StreamCallback -> Ptr () -> [StreamCallbackFlag] -> IO () -- | Query the flags of a given stream -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g4d39786855a6bed01215c1907fbbfbb7 getFlags :: Stream -> IO [StreamFlag] -- | Query the priority of a stream. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g5bd5cb26915a2ecf1921807339488484 getPriority :: Stream -> IO StreamPriority -- | Query the context associated with a stream -- -- Requires CUDA-9.2. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g5bd5cb26915a2ecf1921807339488484 getContext :: Stream -> IO Context -- | Write a value to memory, (presumably) after all preceding work in the -- stream has completed.
-- | Write a value to memory, (presumably) after all preceding work in the -- stream has completed. Unless the option -- WriteValueNoMemoryBarrier is supplied, the write is preceded by -- a system-wide memory fence. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g091455366d56dc2f1f69726aafa369b0 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gc8af1e8b96d7561840affd5217dd6830 -- -- Requires CUDA-8.0 for 32-bit values. -- -- Requires CUDA-9.0 for 64-bit values. write :: Storable a => DevicePtr a -> a -> Stream -> [StreamWriteFlag] -> IO () -- | Wait on a memory location. Work ordered after the operation will block -- until the given condition on the memory is satisfied. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g629856339de7bc6606047385addbb398 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6910c1258c5f15aa5d699f0fd60d6933 -- -- Requires CUDA-8.0 for 32-bit values. -- -- Requires CUDA-9.0 for 64-bit values. wait :: Storable a => DevicePtr a -> a -> Stream -> [StreamWaitFlag] -> IO () -- | The default execution stream. This can be configured to have either -- defaultStreamLegacy or defaultStreamPerThread -- synchronisation behaviour. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStream :: Stream -- | The legacy default stream is an implicit stream which synchronises -- with all other streams in the same Context, except for -- non-blocking streams. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStreamLegacy :: Stream -- | The per-thread default stream is an implicit stream local to both the -- thread and the calling Context, and which does not synchronise -- with other streams (just like explicitly created streams). The -- per-thread default stream is not a non-blocking stream and will -- synchronise with the legacy default stream if both are used in the -- same program. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStreamPerThread :: Stream instance GHC.Show.Show Foreign.CUDA.Driver.Stream.Stream instance GHC.Classes.Eq Foreign.CUDA.Driver.Stream.Stream instance GHC.Enum.Bounded Foreign.CUDA.Driver.Stream.StreamFlag instance GHC.Show.Show Foreign.CUDA.Driver.Stream.StreamFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Stream.StreamFlag instance GHC.Enum.Bounded Foreign.CUDA.Driver.Stream.StreamWriteFlag instance GHC.Show.Show Foreign.CUDA.Driver.Stream.StreamWriteFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Stream.StreamWriteFlag instance GHC.Enum.Bounded Foreign.CUDA.Driver.Stream.StreamWaitFlag instance GHC.Show.Show Foreign.CUDA.Driver.Stream.StreamWaitFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Stream.StreamWaitFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Stream.StreamCallbackFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Stream.StreamWaitFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Stream.StreamWriteFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Stream.StreamFlag
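-- A sketch of device-side synchronisation with the stream memory
-- operations above: one stream publishes a 32-bit flag, another blocks
-- until it observes the expected value. The flag is assumed to be a device
-- allocation of a single Word32 (e.g. from
-- Foreign.CUDA.Driver.Marshal.mallocArray, listed later in this document).
--
-- > import Data.Word
-- > import Foreign.CUDA.Ptr (DevicePtr)
-- > import Foreign.CUDA.Driver.Stream
-- >
-- > gate :: DevicePtr Word32 -> Stream -> Stream -> IO ()
-- > gate flag producer consumer = do
-- >   wait  flag 1 consumer [WaitValueEq]   -- consumer stalls here ...
-- >   write flag 1 producer []              -- ... until producer releases it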
-- | Stream management routines module Foreign.CUDA.Runtime.Stream -- | A processing stream. All operations in a stream are synchronous and -- executed in sequence, but operations in different non-default streams -- may happen out-of-order or concurrently with one another. -- -- Use Events to synchronise operations between streams. newtype Stream Stream :: Ptr () -> Stream [useStream] :: Stream -> Ptr () -- | Create a new asynchronous stream create :: IO Stream -- | Destroy and clean up an asynchronous stream destroy :: Stream -> IO () -- | Determine if all operations in a stream have completed finished :: Stream -> IO Bool -- | Block until all operations in a Stream have been completed block :: Stream -> IO () -- | The default execution stream. This can be configured to have either -- defaultStreamLegacy or defaultStreamPerThread -- synchronisation behaviour. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStream :: Stream -- | The legacy default stream is an implicit stream which synchronises -- with all other streams in the same Context, except for -- non-blocking streams. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStreamLegacy :: Stream -- | The per-thread default stream is an implicit stream local to both the -- thread and the calling Context, and which does not synchronise -- with other streams (just like explicitly created streams). The -- per-thread default stream is not a non-blocking stream and will -- synchronise with the legacy default stream if both are used in the -- same program. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStreamPerThread :: Stream -- | Memory management for CUDA devices module Foreign.CUDA.Runtime.Marshal -- | Options for host allocation data AllocFlag Portable :: AllocFlag DeviceMapped :: AllocFlag WriteCombined :: AllocFlag -- | Allocate a section of linear memory on the host which is page-locked -- and directly accessible from the device. The storage is sufficient to -- hold the given number of elements of a storable type. The runtime -- system automatically accelerates calls to functions such as -- peekArrayAsync and pokeArrayAsync that refer to -- page-locked memory. -- -- Note that since the amount of pageable memory is thusly reduced, -- overall system performance may suffer. This is best used sparingly to -- allocate staging areas for data exchange. mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a) -- | Free page-locked host memory previously allocated with -- mallocHostArray freeHost :: HostPtr a -> IO () -- | Allocate a section of linear memory on the device, and return a -- reference to it. The memory is sufficient to hold the given number of -- elements of storable type. It is suitably aligned, and not cleared. mallocArray :: Storable a => Int -> IO (DevicePtr a) -- | Execute a computation, passing a pointer to a temporarily allocated -- block of memory sufficient to hold the given number of elements of -- storable type. The memory is freed when the computation terminates -- (normally or via an exception), so the pointer must not be used after -- this. -- -- Note that kernel launches can be asynchronous, so you may need to add -- a synchronisation point at the end of the computation.
allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b -- | Free previously allocated memory on the device free :: DevicePtr a -> IO () -- | Options for unified memory allocations data AttachFlag Global :: AttachFlag Host :: AttachFlag Single :: AttachFlag -- | Allocates memory that will be automatically managed by the Unified -- Memory system mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a) -- | Copy a number of elements from the device to host memory. This is a -- synchronous operation. peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO () -- | Copy memory from the device asynchronously, possibly associated with a -- particular stream. The destination memory must be page locked. peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO () -- | Copy a 2D memory area from the device to the host. This is a -- synchronous operation. peekArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Ptr a -> Int -> IO () -- | Copy a 2D memory area from the device to the host asynchronously, -- possibly associated with a particular stream. The destination array -- must be page locked. peekArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> HostPtr a -> Int -> Maybe Stream -> IO () -- | Copy a number of elements from the device into a new Haskell list. -- Note that this requires two memory copies: firstly from the device -- into a heap allocated array, and from there marshalled into a list. peekListArray :: Storable a => Int -> DevicePtr a -> IO [a] -- | Copy a number of elements onto the device. This is a synchronous -- operation. pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO () -- | Copy memory onto the device asynchronously, possibly associated with a -- particular stream. The source memory must be page-locked. pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D memory area onto the device. This is a synchronous -- operation. pokeArray2D :: Storable a => Int -> Int -> Ptr a -> Int -> DevicePtr a -> Int -> IO () -- | Copy a 2D memory area onto the device asynchronously, possibly -- associated with a particular stream. The source array must be page -- locked. pokeArray2DAsync :: Storable a => Int -> Int -> HostPtr a -> Int -> DevicePtr a -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a device array. The array must -- be sufficiently large to hold the entire list. This requires two -- marshalling operations. pokeListArray :: Storable a => [a] -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second (destination). The copied areas may not overlap. This -- operation is asynchronous with respect to the host, but will not overlap -- other device operations. copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second (destination). The copied areas may not overlap. This -- operation is asynchronous with respect to the host, and may be -- associated with a particular stream. copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO ()
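-- A minimal usage sketch: round-trip a list through device memory with
-- the synchronous marshalling operations above.
--
-- > import Foreign.CUDA.Runtime.Marshal
-- >
-- > roundTrip :: [Float] -> IO [Float]
-- > roundTrip xs = do
-- >   let n = length xs
-- >   dp <- mallocArray n          -- device allocation for n elements
-- >   pokeListArray xs dp          -- host -> device
-- >   ys <- peekListArray n dp     -- device -> host
-- >   free dp
-- >   return ys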
-- | Copy a 2D memory area from the first device array (source) to the -- second (destination). The copied areas may not overlap. This operation -- is asynchronous with respect to the host, but will not overlap other -- device operations. copyArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> DevicePtr a -> Int -> IO () -- | Copy a 2D memory area from the first device array (source) to the -- second device array (destination). The copied areas may not overlap. -- This operation is asynchronous with respect to the host, and may be -- associated with a particular stream. copyArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> DevicePtr a -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a newly allocated device array. -- This is newListArrayLen composed with fst. newListArray :: Storable a => [a] -> IO (DevicePtr a) -- | Write a list of storable elements into a newly allocated device array, -- returning the device pointer together with the number of elements that -- were written. Note that this requires two copy operations: firstly -- from a Haskell list into a heap-allocated array, and from there into -- device memory. The array should be freed when no longer -- required. newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int) -- | Temporarily store a list of elements into a newly allocated device -- array. An IO action is applied to the array, the result of which is -- returned. Similar to newListArray, this requires two -- marshalling operations of the data. -- -- As with allocaArray, the memory is freed once the action -- completes, so you should not return the pointer from the action, and -- be sure that any asynchronous operations (such as kernel execution) -- have completed. withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b -- | A variant of withListArray which also supplies the number of -- elements in the array to the applied function. withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b -- | Initialise device memory to a given 8-bit value memset :: DevicePtr a -> Int64 -> Int8 -> IO () instance GHC.Enum.Bounded Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Enum.Bounded Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.CopyDirection instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.CopyDirection instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.CopyDirection instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.AllocFlag
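-- A usage sketch: a zero-initialised scratch buffer that lives only for
-- the duration of an action. memset works byte-wise, so the element size
-- is taken from Storable's sizeOf rather than hard-coded.
--
-- > import Data.Int
-- > import Foreign.Storable (sizeOf)
-- > import Foreign.CUDA.Ptr (DevicePtr)
-- > import Foreign.CUDA.Runtime.Marshal
-- >
-- > zeroScratch :: Int -> (DevicePtr Int32 -> IO a) -> IO a
-- > zeroScratch n go =
-- >   allocaArray n $ \dp -> do
-- >     let bytes = fromIntegral (n * sizeOf (undefined :: Int32))
-- >     memset dp bytes 0    -- byte-wise fill, so 0 is safe for any type
-- >     go dp                -- remember to synchronise before returning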
-- | Kernel execution control for C-for-CUDA runtime interface module Foreign.CUDA.Runtime.Exec -- | A global device function. -- -- Note that the use of a string naming a function was deprecated in CUDA -- 4.1 and removed in CUDA 5.0. type Fun = FunPtr () data FunAttributes FunAttributes :: !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> FunAttributes [constSizeBytes] :: FunAttributes -> !Int64 [localSizeBytes] :: FunAttributes -> !Int64 [sharedSizeBytes] :: FunAttributes -> !Int64 -- | maximum block size that can be successfully launched (based on -- register usage) [maxKernelThreadsPerBlock] :: FunAttributes -> !Int -- | number of registers required for each thread [numRegs] :: FunAttributes -> !Int -- | Kernel function parameters. Doubles will be converted to an internal -- float representation on devices that do not support doubles natively. data FunParam [IArg] :: !Int -> FunParam [FArg] :: !Float -> FunParam [DArg] :: !Double -> FunParam [VArg] :: Storable a => !a -> FunParam -- | Cache configuration preference data CacheConfig None :: CacheConfig Shared :: CacheConfig L1 :: CacheConfig Equal :: CacheConfig -- | Obtain the attributes of the named global device -- function. This itemises the requirements to successfully launch the -- given kernel. attributes :: Fun -> IO FunAttributes -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the given -- device function. This is only a preference; the driver is free to -- choose a different configuration as required to execute the function. -- -- Switching between configuration modes may insert a device-side -- synchronisation point for streamed kernel launches. setCacheConfig :: Fun -> CacheConfig -> IO () -- | Invoke a kernel on a (gx * gy) grid of blocks, where each -- block contains (tx * ty * tz) threads and has access to a -- given number of bytes of shared memory. The launch may also be -- associated with a specific Stream. launchKernel :: Fun -> (Int, Int) -> (Int, Int, Int) -> Int64 -> Maybe Stream -> [FunParam] -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Exec.FunAttributes instance GHC.Show.Show Foreign.CUDA.Runtime.Exec.CacheConfig instance GHC.Classes.Eq Foreign.CUDA.Runtime.Exec.CacheConfig instance GHC.Enum.Enum Foreign.CUDA.Runtime.Exec.CacheConfig instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Exec.FunAttributes -- | Top level bindings to the C-for-CUDA runtime API module Foreign.CUDA.Runtime -- | Top level bindings. By default, expose the C-for-CUDA runtime API -- bindings, as they are slightly more user friendly. module Foreign.CUDA -- | Requires CUDA-10 module Foreign.CUDA.Driver.Graph.Capture data Status None :: Status Active :: Status Invalidated :: Status data Mode Global :: Mode ThreadLocal :: Mode Relaxed :: Mode -- | Begin graph capture on a stream -- -- Requires CUDA-10.0 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1gea22d4496b1c8d02d0607bb05743532f start :: Stream -> Mode -> IO () -- | End graph capture on a stream -- -- Requires CUDA-10.0 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g03dab8b2ba76b00718955177a929970c stop :: Stream -> IO Graph -- | Return a stream's capture status -- -- Requires CUDA-10.0 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g37823c49206e3704ae23c7ad78560bca status :: Stream -> IO Status -- | Query the capture status of a stream and get an id for the capture -- sequence, which is unique over the lifetime of the process. -- -- Requires CUDA-10.1 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g13145ece1d79a1d79a1d22abb9663216 info :: Stream -> IO (Status, Int64)
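-- A usage sketch: record the asynchronous work submitted to a stream as a
-- task graph, which can later be instantiated and launched via
-- Foreign.CUDA.Driver.Graph.Exec (listed later in this document). The
-- Graph type is assumed to be importable from
-- Foreign.CUDA.Driver.Graph.Build.
--
-- > import Foreign.CUDA.Driver.Graph.Build   (Graph)
-- > import Foreign.CUDA.Driver.Graph.Capture
-- > import Foreign.CUDA.Driver.Stream        (Stream)
-- >
-- > captureWork :: Stream -> IO () -> IO Graph
-- > captureWork s submit = do
-- >   start s Global    -- the stream now records work instead of running it
-- >   submit            -- enqueue asynchronous operations into s
-- >   stop s            -- leave capture mode; returns the recorded graph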
-- | Set the stream capture interaction mode for this thread. Return the -- previous value. -- -- Requires CUDA-10.1 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g378135b262f02a43a7caeab239ae493d mode :: Mode -> IO Mode instance GHC.Enum.Bounded Foreign.CUDA.Driver.Graph.Capture.Status instance GHC.Show.Show Foreign.CUDA.Driver.Graph.Capture.Status instance GHC.Classes.Eq Foreign.CUDA.Driver.Graph.Capture.Status instance GHC.Enum.Bounded Foreign.CUDA.Driver.Graph.Capture.Mode instance GHC.Show.Show Foreign.CUDA.Driver.Graph.Capture.Mode instance GHC.Classes.Eq Foreign.CUDA.Driver.Graph.Capture.Mode instance GHC.Enum.Enum Foreign.CUDA.Driver.Graph.Capture.Mode instance GHC.Enum.Enum Foreign.CUDA.Driver.Graph.Capture.Status -- | Event management for low-level driver interface module Foreign.CUDA.Driver.Event -- | Events are markers that can be inserted into the CUDA execution stream -- and later queried. newtype Event Event :: Ptr () -> Event [useEvent] :: Event -> Ptr () -- | Event creation flags data EventFlag Default :: EventFlag BlockingSync :: EventFlag DisableTiming :: EventFlag Interprocess :: EventFlag data WaitFlag -- | Create a new event -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db create :: [EventFlag] -> IO Event -- | Destroy an event -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef destroy :: Event -> IO () -- | Determine the elapsed time (in milliseconds) between two events -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97 elapsedTime :: Event -> Event -> IO Float -- | Determines if an event has actually been recorded -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef query :: Event -> IO Bool -- | Record an event once all operations in the current context (or -- optionally specified stream) have completed. This operation is -- asynchronous. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1 record :: Event -> Maybe Stream -> IO () -- | Makes all future work submitted to the (optional) stream wait until -- the given event reports completion before beginning execution. -- Synchronisation is performed on the device, including when the event -- and stream are from different device contexts. -- -- Requires CUDA-3.2. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g6a898b652dfc6aa1d5c8d97062618b2f wait :: Event -> Maybe Stream -> [WaitFlag] -> IO () -- | Wait until the event has been recorded -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g9e520d34e51af7f5375610bca4add99c block :: Event -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Event.Event instance GHC.Classes.Eq Foreign.CUDA.Driver.Event.Event instance GHC.Enum.Bounded Foreign.CUDA.Driver.Event.EventFlag instance GHC.Show.Show Foreign.CUDA.Driver.Event.EventFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Event.EventFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Event.WaitFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Event.EventFlag
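-- A usage sketch: time a queued operation with a pair of events on the
-- default stream.
--
-- > import Foreign.CUDA.Driver.Event
-- >
-- > timeIt :: IO () -> IO Float
-- > timeIt submit = do
-- >   before <- create []
-- >   after  <- create []
-- >   record before Nothing        -- mark the default stream
-- >   submit                       -- enqueue asynchronous work
-- >   record after Nothing
-- >   block after                  -- wait until 'after' has been recorded
-- >   dt <- elapsedTime before after   -- in milliseconds
-- >   destroy before
-- >   destroy after
-- >   return dt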
-- | Event management for C-for-CUDA runtime environment module Foreign.CUDA.Runtime.Event -- | Events are markers that can be inserted into the CUDA execution stream -- and later queried. data Event -- | Event creation flags data EventFlag Default :: EventFlag BlockingSync :: EventFlag DisableTiming :: EventFlag Interprocess :: EventFlag data WaitFlag -- | Create a new event create :: [EventFlag] -> IO Event -- | Destroy an event destroy :: Event -> IO () -- | Determine the elapsed time (in milliseconds) between two events elapsedTime :: Event -> Event -> IO Float -- | Determines if an event has actually been recorded query :: Event -> IO Bool -- | Record an event once all operations in the current context (or -- optionally specified stream) have completed. This operation is -- asynchronous. record :: Event -> Maybe Stream -> IO () -- | Makes all future work submitted to the (optional) stream wait until -- the given event reports completion before beginning execution. -- Synchronisation is performed on the device, including when the event -- and stream are from different device contexts. Requires CUDA-3.2. wait :: Event -> Maybe Stream -> [WaitFlag] -> IO () -- | Wait until the event has been recorded block :: Event -> IO () -- | IPC event management for low-level driver interface. -- -- Restricted to devices which support unified addressing on Linux -- operating systems. -- -- Since CUDA-4.1. module Foreign.CUDA.Driver.IPC.Event -- | A CUDA inter-process event handle. data IPCEvent -- | Create an inter-process event handle for a previously allocated event. -- The event must be created with the Interprocess and -- DisableTiming event flags. The returned handle may then be sent -- to another process and opened to allow efficient hardware -- synchronisation between GPU work in other processes. -- -- After the event has been opened in the importing process, -- record, block, wait, query may be used in -- either process. -- -- Performing operations on the imported event after the event has been -- destroyed in the exporting process is undefined. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gea02eadd12483de5305878b13288a86c export :: Event -> IO IPCEvent -- | Open an inter-process event handle for use in the current process, -- returning an event that can be used in the current process and -- behaving as a locally created event with the DisableTiming flag -- specified. -- -- The event must be freed with destroy. Performing operations on -- the imported event after the exported event has been destroyed -- in the exporting process is undefined. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf1d525918b6c643b99ca8c8e42e36c2e open :: IPCEvent -> IO Event instance GHC.Show.Show Foreign.CUDA.Driver.IPC.Event.IPCEvent instance GHC.Classes.Eq Foreign.CUDA.Driver.IPC.Event.IPCEvent -- | Memory management for low-level driver interface module Foreign.CUDA.Driver.Marshal -- | Options for host allocation data AllocFlag Portable :: AllocFlag DeviceMapped :: AllocFlag WriteCombined :: AllocFlag -- | Allocate a section of linear memory on the host which is page-locked -- and directly accessible from the device. The storage is sufficient to -- hold the given number of elements of a storable type. -- -- Note that since the amount of pageable memory is thusly reduced, -- overall system performance may suffer. This is best used sparingly to -- allocate staging areas for data exchange. -- -- Host memory allocated in this way is automatically and immediately -- accessible to all contexts on all devices which support unified -- addressing.
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0 mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a) -- | As mallocHostArray, but return a ForeignPtr instead. The -- array will be deallocated automatically once the last reference to the -- ForeignPtr is dropped. mallocHostForeignPtr :: Storable a => [AllocFlag] -> Int -> IO (ForeignPtr a) -- | Free a section of page-locked host memory. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c freeHost :: HostPtr a -> IO () -- | Page-locks the specified array (on the host) and maps it for the -- device(s) as specified by the given allocation flags. Subsequently, -- the memory is accessed directly by the device so can be read and -- written with much higher bandwidth than pageable memory that has not -- been registered. The memory range is added to the same tracking -- mechanism as mallocHostArray to automatically accelerate calls -- to functions such as pokeArray. -- -- Note that page-locking excessive amounts of memory may degrade system -- performance, since it reduces the amount of pageable memory available. -- This is best used sparingly to allocate staging areas for data -- exchange. -- -- This function has limited support on Mac OS X. OS 10.7 or later is -- required. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223 registerArray :: Storable a => [AllocFlag] -> Int -> Ptr a -> IO (HostPtr a) -- | Unmaps the memory from the given pointer, and makes it pageable again. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14 unregisterArray :: HostPtr a -> IO (Ptr a) -- | Allocate a section of linear memory on the device, and return a -- reference to it. The memory is sufficient to hold the given number of -- elements of storable type. It is suitably aligned for any type, and is -- not cleared. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467 mallocArray :: Storable a => Int -> IO (DevicePtr a) -- | Execute a computation on the device, passing a pointer to a -- temporarily allocated block of memory sufficient to hold the given -- number of elements of storable type. The memory is freed when the -- computation terminates (normally or via an exception), so the pointer -- must not be used after this. -- -- Note that kernel launches can be asynchronous, so you may want to add -- a synchronisation point using sync as part of the continuation. allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b -- | Release a section of device memory. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a free :: DevicePtr a -> IO () -- | Options for unified memory allocations data AttachFlag CuMemAttachGlobal :: AttachFlag CuMemAttachHost :: AttachFlag CuMemAttachSingle :: AttachFlag
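-- A usage sketch: a reusable page-locked staging buffer, as required on
-- the host side by the asynchronous copies below. A production version
-- would use Control.Exception.bracket so the buffer is freed even if the
-- action throws.
--
-- > import Foreign.CUDA.Ptr (HostPtr)
-- > import Foreign.CUDA.Driver.Marshal
-- >
-- > withStaging :: Int -> (HostPtr Float -> IO a) -> IO a
-- > withStaging n go = do
-- >   hp <- mallocHostArray [] n   -- page-locked, DMA-capable host memory
-- >   r  <- go hp                  -- e.g. fill hp, then pokeArrayAsync
-- >   freeHost hp
-- >   return r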
-- | Allocates memory that will be automatically managed by the Unified -- Memory system. The returned pointer is valid on the CPU and on all -- GPUs which support managed memory. All accesses to this pointer must -- obey the Unified Memory programming model. -- -- On a multi-GPU system with peer-to-peer support, where multiple GPUs -- support managed memory, the physical storage is created on the GPU -- which is active at the time mallocManagedArray is called. All -- other GPUs will access the array at reduced bandwidth via peer mapping -- over the PCIe bus. The Unified Memory system does not migrate memory -- between GPUs. -- -- On a multi-GPU system where multiple GPUs support managed memory, but -- not all pairs of such GPUs have peer-to-peer support between them, the -- physical storage is allocated in system memory (zero-copy memory) and -- all GPUs will access the data at reduced bandwidth over the PCIe bus. -- -- Requires CUDA-6.0 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32 mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a) -- | Pre-fetches the given number of elements to the specified destination -- device. If the specified device is Nothing, the data is pre-fetched to -- host memory. The pointer must refer to a memory range allocated with -- mallocManagedArray. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html#group__CUDA__UNIFIED_1gfe94f8b7fb56291ebcea44261aa4cb84 -- -- Requires CUDA-8.0. prefetchArrayAsync :: Storable a => DevicePtr a -> Int -> Maybe Device -> Maybe Stream -> IO () -- | Attach an array of the given number of elements to a stream -- asynchronously -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g6e468d680e263e7eba02a56643c50533 attachArrayAsync :: forall a. Storable a => [AttachFlag] -> Stream -> DevicePtr a -> Int -> IO () -- | Copy a number of elements from the device to host memory. This is a -- synchronous operation. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g3480368ee0208a98f75019c9a8450893 peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO () -- | Copy memory from the device asynchronously, possibly associated with a -- particular stream. The destination host memory must be page-locked. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362 peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the device to the host. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g27f885b30c34cc20a663a671dbf6fc27 peekArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Ptr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the device to the host asynchronously, possibly -- associated with a particular execution stream. The destination host -- memory must be page-locked. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4acf155faeb969d9d21f5433d3d0f274 peekArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> HostPtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Copy a number of elements from the device into a new Haskell list. -- Note that this requires two memory copies: firstly from the device -- into a heap allocated array, and from there marshalled into a list. peekListArray :: Storable a => Int -> DevicePtr a -> IO [a] -- | Copy a number of elements onto the device. This is a synchronous -- operation.
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169 pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO () -- | Copy memory onto the device asynchronously, possibly associated with a -- particular stream. The source host memory must be page-locked. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g1572263fe2597d7ba4f6964597a354a3 pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the host to the device. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g27f885b30c34cc20a663a671dbf6fc27 pokeArray2D :: Storable a => Int -> Int -> Ptr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the host to the device asynchronously, possibly -- associated with a particular execution stream. The source host memory -- must be page-locked. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4acf155faeb969d9d21f5433d3d0f274 pokeArray2DAsync :: Storable a => Int -> Int -> HostPtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a device array. The device -- array must be sufficiently large to hold the entire list. This -- requires two marshalling operations. pokeListArray :: Storable a => [a] -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second device (destination). The copied areas may not overlap. -- This operation is asynchronous with respect to the host, but will -- never overlap with kernel execution. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g1725774abf8b51b91945f3336b778c8b copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second device array (destination). The copied areas may not -- overlap. The operation is asynchronous with respect to the host, and -- can be asynchronous to other device operations by associating it with -- a particular stream. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g39ea09ba682b8eccc9c3e0c04319b5c8 copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the first device array (source) to the second -- device array (destination). The copied areas must not overlap. This -- operation is asynchronous with respect to the host, but will never -- overlap with kernel execution. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g27f885b30c34cc20a663a671dbf6fc27 copyArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the first device array (source) to the second -- device array (destination). The copied areas may not overlap. The -- operation is asynchronous with respect to the host, and can be -- asynchronous to other device operations by associating it with a -- particular execution stream. 
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4acf155faeb969d9d21f5433d3d0f274 copyArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Copies an array from device memory in one context to device memory in -- another context. Note that this function is asynchronous with respect -- to the host, but serialised with respect to all pending and future -- asynchronous work in the source and destination contexts. To avoid -- this synchronisation, use copyArrayPeerAsync instead. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1ge1f5c7771544fee150ada8853c7cbf4a copyArrayPeer :: Storable a => Int -> DevicePtr a -> Context -> DevicePtr a -> Context -> IO () -- | Copies from device memory in one context to device memory in another -- context. Note that this function is asynchronous with respect to the -- host and all work in other streams and devices. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g82fcecb38018e64b98616a8ac30112f2 copyArrayPeerAsync :: Storable a => Int -> DevicePtr a -> Context -> DevicePtr a -> Context -> Maybe Stream -> IO () -- | Write a list of storable elements into a newly allocated device array. -- This is newListArrayLen composed with fst. newListArray :: Storable a => [a] -> IO (DevicePtr a) -- | Write a list of storable elements into a newly allocated device array, -- returning the device pointer together with the number of elements that -- were written. Note that this requires two memory copies: firstly from -- a Haskell list to a heap allocated array, and from there onto the -- graphics device. The memory should be freed when no longer -- required. newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int) -- | Temporarily store a list of elements into a newly allocated device -- array. An IO action is applied to the array, the result of which is -- returned. Similar to newListArray, this requires copying the -- data twice. -- -- As with allocaArray, the memory is freed once the action -- completes, so you should not return the pointer from the action, and -- be wary of asynchronous kernel execution. withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b -- | A variant of withListArray which also supplies the number of -- elements in the array to the applied function. withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b -- | Set a number of data elements to the specified value, which may be -- either 8-, 16-, or 32-bits wide. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g7d805e610054392a4d11e8a8bf5eb35c -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132 memset :: Storable a => DevicePtr a -> Int -> a -> IO () -- | Set the number of data elements to the specified value, which may be -- either 8-, 16-, or 32-bits wide. The operation is asynchronous and may -- optionally be associated with a stream. -- -- Requires CUDA-3.2.
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf731438877dd8ec875e4c43d848c878c -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5 memsetAsync :: Storable a => DevicePtr a -> Int -> a -> Maybe Stream -> IO () -- | Return the device pointer associated with a mapped, pinned host -- buffer, which was allocated with the DeviceMapped option by -- mallocHostArray. -- -- Currently, no options are supported and this must be empty. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g57a39e5cba26af4d06be67fc77cc62f0 getDevicePtr :: [AllocFlag] -> HostPtr a -> IO (DevicePtr a) -- | Return the base address and allocation size of the given device -- pointer. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g64fee5711274a2a0573a789c94d8299b getBasePtr :: DevicePtr a -> IO (DevicePtr a, Int64) -- | Return the amount of free and total memory respectively available to -- the current context (bytes). -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0 getMemInfo :: IO (Int64, Int64) instance GHC.Enum.Bounded Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Show.Show Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Enum.Bounded Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Show.Show Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Marshal.AllocFlag -- | Texture management for low-level driver interface module Foreign.CUDA.Driver.Texture -- | A texture reference newtype Texture Texture :: Ptr () -> Texture [useTexture] :: Texture -> Ptr () -- | Texture data formats data Format Word8 :: Format Word16 :: Format Word32 :: Format Int8 :: Format Int16 :: Format Int32 :: Format Half :: Format Float :: Format -- | Texture reference addressing modes data AddressMode Wrap :: AddressMode Clamp :: AddressMode Mirror :: AddressMode Border :: AddressMode -- | Texture reference filtering mode data FilterMode Point :: FilterMode Linear :: FilterMode -- | Texture read mode options data ReadMode ReadAsInteger :: ReadMode NormalizedCoordinates :: ReadMode SRGB :: ReadMode -- | Bind a linear array address of the given size (bytes) as a texture -- reference. Any previously bound references are unbound. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g44ef7e5055192d52b3d43456602b50a8 bind :: Texture -> DevicePtr a -> Int64 -> IO () -- | Bind a linear address range to the given texture reference as a -- two-dimensional arena. Any previously bound reference is unbound. Note -- that calls to setFormat can not follow a call to bind2D -- for the same texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g26f709bbe10516681913d1ffe8756ee2 bind2D :: Texture -> Format -> Int -> DevicePtr a -> (Int, Int) -> Int64 -> IO () -- | Get the addressing mode used by a texture reference, corresponding to -- the given dimension (currently the only supported dimension values are -- 0 or 1). 
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1gfb367d93dc1d20aab0cf8ce70d543b33 getAddressMode :: Texture -> Int -> IO AddressMode -- | Get the filtering mode used by a texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g2439e069746f69b940f2f4dbc78cdf87 getFilterMode :: Texture -> IO FilterMode -- | Get the data format and number of channel components of the bound -- texture. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g90936eb6c7c4434a609e1160c278ae53 getFormat :: Texture -> IO (Format, Int) -- | Specify the addressing mode for the given dimension of a texture -- reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g85f4a13eeb94c8072f61091489349bcb setAddressMode :: Texture -> Int -> AddressMode -> IO () -- | Specify the filtering mode to be used when reading memory through a -- texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g595d0af02c55576f8c835e4efd1f39c0 setFilterMode :: Texture -> FilterMode -> IO () -- | Specify the format of the data and number of packed components per -- element to be read by the texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g05585ef8ea2fec728a03c6c8f87cf07a setFormat :: Texture -> Format -> Int -> IO () -- | Specify additional characteristics for reading and indexing the -- texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g554ffd896487533c36810f2e45bb7a28 setReadMode :: Texture -> ReadMode -> IO () -- | Create a new texture reference. Once created, the application must -- call setPtr to associate the reference with allocated memory. -- Other texture reference functions are used to specify the format and -- interpretation to be used when the memory is read through this -- reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF__DEPRECATED.html#group__CUDA__TEXREF__DEPRECATED_1g0084fabe2c6d28ffcf9d9f5c7164f16c -- | Deprecated: as of CUDA version 3.2 create :: IO Texture -- | Destroy a texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF__DEPRECATED.html#group__CUDA__TEXREF__DEPRECATED_1gea8edbd6cf9f97e6ab2b41fc6785519d -- | Deprecated: as of CUDA version 3.2 destroy :: Texture -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Texture.Texture instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.Texture instance GHC.Show.Show Foreign.CUDA.Driver.Texture.AddressMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.AddressMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.Format instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.Format instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.Format instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.AddressMode instance Foreign.Storable.Storable Foreign.CUDA.Driver.Texture.Texture
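-- A usage sketch of the (deprecated, pre CUDA-3.2) texture reference
-- workflow above: create a reference, describe the data, and bind it to a
-- linear device allocation of n single-precision floats. Note that Float
-- in the signature is the Haskell type, while the Float passed to
-- setFormat is the Format constructor from this module.
--
-- > import Foreign.Storable (sizeOf)
-- > import Foreign.CUDA.Ptr (DevicePtr)
-- > import Foreign.CUDA.Driver.Texture
-- >
-- > setupTex :: DevicePtr Float -> Int -> IO Texture
-- > setupTex dp n = do
-- >   tex <- create
-- >   setFormat tex Float 1        -- one 32-bit float channel
-- >   setAddressMode tex 0 Clamp
-- >   setFilterMode tex Point
-- >   bind tex dp (fromIntegral (n * sizeOf (undefined :: Float)))
-- >   return tex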
-- | IPC memory management for low-level driver interface. -- -- Restricted to devices which support unified addressing on Linux -- operating systems. -- -- Since CUDA-4.0. module Foreign.CUDA.Driver.IPC.Marshal -- | A CUDA memory handle used for inter-process communication. data IPCDevicePtr a -- | Flags for controlling IPC memory access data IPCFlag LazyEnablePeerAccess :: IPCFlag -- | Create an inter-process memory handle for an existing device memory -- allocation. The handle can then be sent to another process and made -- available to that process via open. -- -- Requires CUDA-4.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6f1b5be767b275f016523b2ac49ebec1 export :: DevicePtr a -> IO (IPCDevicePtr a) -- | Open an inter-process memory handle exported from another process, -- returning a device pointer usable in the current process. -- -- Maps memory exported by another process with export into the current -- device address space. For contexts on different devices, open -- can attempt to enable peer access if the user called add, and -- is controlled by the LazyEnablePeerAccess flag. -- -- Each handle from a given device and context may only be opened -- by one context per device per other process. Memory returned by -- open must be freed via close. -- -- Requires CUDA-4.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1ga8bd126fcff919a0c996b7640f197b79 open :: IPCDevicePtr a -> [IPCFlag] -> IO (DevicePtr a) -- | Close and unmap memory returned by open. The original -- allocation in the exporting process as well as imported mappings in -- other processes are unaffected. -- -- Any resources used to enable peer access will be freed if this is the -- last mapping using them. -- -- Requires CUDA-4.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gd6f5d5bcf6376c6853b64635b0157b9e close :: DevicePtr a -> IO () instance GHC.Enum.Bounded Foreign.CUDA.Driver.IPC.Marshal.IPCFlag instance GHC.Show.Show Foreign.CUDA.Driver.IPC.Marshal.IPCFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.IPC.Marshal.IPCFlag instance GHC.Show.Show (Foreign.CUDA.Driver.IPC.Marshal.IPCDevicePtr a) instance GHC.Classes.Eq (Foreign.CUDA.Driver.IPC.Marshal.IPCDevicePtr a) instance GHC.Enum.Enum Foreign.CUDA.Driver.IPC.Marshal.IPCFlag
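-- A usage sketch: share a device allocation between two processes. How
-- the opaque handle travels between them (pipe, socket, shared file, ...)
-- is up to the application and elided here.
--
-- > import Foreign.CUDA.Ptr (DevicePtr)
-- > import Foreign.CUDA.Driver.IPC.Marshal
-- >
-- > -- in the exporting process:
-- > shareOut :: DevicePtr Float -> IO (IPCDevicePtr Float)
-- > shareOut = export
-- >
-- > -- in the importing process, after receiving the handle:
-- > shareIn :: IPCDevicePtr Float -> (DevicePtr Float -> IO a) -> IO a
-- > shareIn h go = do
-- >   dp <- open h [LazyEnablePeerAccess]
-- >   r  <- go dp                -- use dp like a normal device pointer
-- >   close dp
-- >   return r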
-- | Primary context management for low-level driver interface. The primary -- context is unique per device and shared with the Runtime API. This -- allows integration with other libraries using CUDA. -- -- Since: CUDA-7.0 module Foreign.CUDA.Driver.Context.Primary -- | Get the status of the primary context. Returns whether the current -- context is active, and the flags it was (or will be) created with. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1g65f3e018721b6d90aa05cfb56250f469 status :: Device -> IO (Bool, [ContextFlag]) -- | Specify the flags that the primary context should be created with. -- Note that this is an error if the primary context is already active. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1gd779a84f17acdad0d9143d9fe719cfdf setup :: Device -> [ContextFlag] -> IO () -- | Destroy all allocations and reset all state on the primary context of -- the given device in the current process. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1g5d38802e8600340283958a117466ce12 reset :: Device -> IO () -- | Retain the primary context for the given device, creating it if -- necessary, and increasing its usage count. The caller must call -- release when done using the context. Unlike create the -- newly retained context is not pushed onto the stack. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1g9051f2d5c31501997a6cb0530290a300 retain :: Device -> IO Context -- | Release the primary context on the given device. If there are no more -- references to the primary context it will be destroyed, regardless of -- how many threads it is current to. -- -- Unlike pop this does not pop the context from the stack in any -- circumstances. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1gf2a8bc16f8df0c88031f6a1ba3d6e8ad release :: Device -> IO () -- | Direct peer context access functions for the low-level driver -- interface. -- -- Since: CUDA-4.0 module Foreign.CUDA.Driver.Context.Peer -- | Possible option values for direct peer memory access data PeerFlag -- | Peer-to-peer attributes data PeerAttribute PerformanceRank :: PeerAttribute AccessSupported :: PeerAttribute NativeAtomicSupported :: PeerAttribute AccessAccessSupported :: PeerAttribute CudaArrayAccessSupported :: PeerAttribute -- | Queries if the first device can directly access the memory of the -- second. If direct access is possible, it can then be enabled with -- add. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e accessible :: Device -> Device -> IO Bool -- | If the devices of both the current and supplied contexts support -- unified addressing, then enable allocations in the supplied context to -- be accessible by the current context. -- -- Note that access is unidirectional, and in order to access memory in -- the current context from the peer context, a separate symmetric call -- to add is required. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a add :: Context -> [PeerFlag] -> IO () -- | Disable direct memory access from the current context to the supplied -- peer context, and unregisters any registered allocations. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g5b4b6936ea868d4954ce4d841a3b4810 remove :: Context -> IO () -- | Queries attributes of the link between two devices -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g4c55c60508f8eba4546b51f2ee545393 -- -- Requires CUDA-8.0 -- --
-- since 0.9.0.0 --
getAttribute :: PeerAttribute -> Device -> Device -> IO Int instance GHC.Show.Show Foreign.CUDA.Driver.Context.Peer.PeerAttribute instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Peer.PeerAttribute instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Peer.PeerAttribute instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Peer.PeerFlag -- | Context configuration for the low-level driver interface module Foreign.CUDA.Driver.Context.Config -- | Return the flags that were used to create the current context. -- -- Requires CUDA-7.0 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gf81eef983c1e3b2ef4f166d7a930c86d getFlags :: IO [ContextFlag] -- | Device limits flags data Limit StackSize :: Limit PrintfFifoSize :: Limit MallocHeapSize :: Limit DevRuntimeSyncDepth :: Limit DevRuntimePendingLaunchCount :: Limit MaxL2FetchGranularity :: Limit Max :: Limit -- | Query compute 2.0 call stack limits. -- -- Requires CUDA-3.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g9f2d47d1745752aa16da7ed0d111b6a8 getLimit :: Limit -> IO Int -- | Specify the size of the call stack, for compute 2.0 devices. -- -- Requires CUDA-3.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g0651954dfb9788173e60a9af7201e65a setLimit :: Limit -> Int -> IO () -- | Device cache configuration preference data Cache PreferNone :: Cache PreferShared :: Cache PreferL1 :: Cache PreferEqual :: Cache -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this function returns the preferred cache configuration for -- the current context. -- -- Requires CUDA-3.2. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g40b6b141698f76744dea6e39b9a25360 getCache :: IO Cache -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the current -- context. This is only a preference. -- -- Any function configuration set via setCacheConfigFun will be -- preferred over this context-wide setting. -- -- Requires CUDA-3.2. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g54699acf7e2ef27279d013ca2095f4a3 setCache :: Cache -> IO () -- | Device shared memory configuration preference data SharedMem DefaultBankSize :: SharedMem FourByteBankSize :: SharedMem EightByteBankSize :: SharedMem -- | Return the current size of the shared memory banks in the current -- context. On devices with configurable shared memory banks, -- setSharedMem can be used to change the configuration, so that -- subsequent kernel launches will by default use the new bank size. On -- devices without configurable shared memory, this function returns the -- fixed bank size of the hardware. -- -- Requires CUDA-4.2 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74 getSharedMem :: IO SharedMem
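-- A minimal usage sketch of the limit accessors above: kernels that call
-- malloc on the device are bounded by MallocHeapSize, so double it before
-- launching them.
--
-- > import Foreign.CUDA.Driver.Context.Config
-- >
-- > growDeviceHeap :: IO ()
-- > growDeviceHeap = do
-- >   n <- getLimit MallocHeapSize
-- >   setLimit MallocHeapSize (n * 2)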
-- | On devices with configurable shared memory banks, this function will -- set the context's shared memory bank size that will be used by default -- for subsequent kernel launches. -- -- Changing the shared memory configuration between launches may insert a -- device synchronisation. -- -- Shared memory bank size does not affect shared memory usage or kernel -- occupancy, but may have major effects on performance. Larger bank -- sizes allow for greater potential bandwidth to shared memory, but -- change the kinds of accesses which result in bank conflicts. -- -- Requires CUDA-4.2 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692 setSharedMem :: SharedMem -> IO () -- | Priority of an execution stream. Work submitted to a higher priority -- stream may preempt execution of work already executing in a lower -- priority stream. Lower numbers represent higher priorities. type StreamPriority = Int -- | Returns the numerical values that correspond to the greatest and least -- priority execution streams in the current context respectively. Stream -- priorities follow the convention that lower numbers -- correspond to higher priorities. The range of meaningful stream -- priorities is given by the inclusive range -- [greatestPriority,leastPriority]. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g137920ab61a71be6ce67605b9f294091 getStreamPriorityRange :: IO (StreamPriority, StreamPriority) instance GHC.Show.Show Foreign.CUDA.Driver.Context.Config.Limit instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Config.Limit instance GHC.Show.Show Foreign.CUDA.Driver.Context.Config.Cache instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Config.Cache instance GHC.Show.Show Foreign.CUDA.Driver.Context.Config.SharedMem instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Config.SharedMem instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Config.SharedMem instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Config.Cache instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Config.Limit -- | Context management for low-level driver interface module Foreign.CUDA.Driver.Context -- | Unified addressing functions for the low-level driver interface -- --
-- since 0.9.0.0 --
launchKernelCooperative :: Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> Maybe Stream -> [FunParam] -> IO () -- | Specify the (x,y,z) dimensions of the thread blocks that are -- created when the given kernel function is launched. -- | Deprecated: use launchKernel instead setBlockShape :: Fun -> (Int, Int, Int) -> IO () -- | Set the number of bytes of dynamic shared memory to be available to -- each thread block when the function is launched -- | Deprecated: use launchKernel instead setSharedSize :: Fun -> Integer -> IO () -- | Set the parameters that will be specified next time the kernel is invoked -- | Deprecated: use launchKernel instead setParams :: Fun -> [FunParam] -> IO () -- | Invoke the kernel on a size (w,h) grid of blocks. Each block -- contains the number of threads specified by a previous call to -- setBlockShape. The launch may also be associated with a -- specific Stream. -- | Deprecated: use launchKernel instead launch :: Fun -> (Int, Int) -> Maybe Stream -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Exec.FunAttribute instance GHC.Classes.Eq Foreign.CUDA.Driver.Exec.FunAttribute instance Foreign.Storable.Storable Foreign.CUDA.Driver.Exec.FunParam instance GHC.Enum.Enum Foreign.CUDA.Driver.Exec.FunAttribute -- | Querying module attributes for low-level driver interface module Foreign.CUDA.Driver.Module.Query -- | Returns a function handle. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1ga52be009b0d4045811b30c965e1cb2cf getFun :: Module -> ShortByteString -> IO Fun -- | Return a global pointer, and size of the global (in bytes). -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1gf3e43672e26073b1081476dbf47a86ab getPtr :: Module -> ShortByteString -> IO (DevicePtr a, Int) -- | Return a handle to a texture reference. This texture reference handle -- should not be destroyed, as the texture will be destroyed -- automatically when the module is unloaded.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g9607dcbf911c16420d5264273f2b5608
getTex :: Module -> ShortByteString -> IO Texture

-- | Module management for low-level driver interface
module Foreign.CUDA.Driver.Module

-- | Device code formats that can be used for online linking
data JITInputType
Cubin :: JITInputType
PTX :: JITInputType
Fatbinary :: JITInputType
Object :: JITInputType
Library :: JITInputType
CuJitNumInputTypes :: JITInputType

-- | Online compilation fallback strategy
data JITFallback
PreferPTX :: JITFallback
PreferBinary :: JITFallback

-- | Online compilation target architecture
data JITTarget
Compute20 :: JITTarget
Compute21 :: JITTarget
Compute30 :: JITTarget
Compute32 :: JITTarget
Compute35 :: JITTarget
Compute37 :: JITTarget
Compute50 :: JITTarget
Compute52 :: JITTarget
Compute53 :: JITTarget
Compute60 :: JITTarget
Compute61 :: JITTarget
Compute62 :: JITTarget
Compute70 :: JITTarget
Compute72 :: JITTarget
Compute75 :: JITTarget

-- | Results of online compilation
data JITResult
JITResult :: !Float -> !ByteString -> !Module -> JITResult

-- | milliseconds spent compiling PTX
[jitTime] :: JITResult -> !Float

-- | information about PTX assembly
[jitInfoLog] :: JITResult -> !ByteString

-- | the compiled module
[jitModule] :: JITResult -> !Module

-- | Just-in-time compilation and linking options
data JITOption

-- | maximum number of registers per thread
MaxRegisters :: !Int -> JITOption

-- | number of threads per block to target compilation for
ThreadsPerBlock :: !Int -> JITOption

-- | level of optimisation to apply (1-4, default 4)
OptimisationLevel :: !Int -> JITOption

-- | compilation target, otherwise determined from context
Target :: !Compute -> JITOption

-- | fallback strategy if matching cubin not found
FallbackStrategy :: !JITFallback -> JITOption

-- | generate debug info (-g) (requires cuda >= 5.5)
GenerateDebugInfo :: JITOption

-- | generate line number information (-lineinfo) (requires cuda >= 5.5)
GenerateLineInfo :: JITOption

-- | verbose log messages (requires cuda >= 5.5)
Verbose :: JITOption

-- | A reference to a Module object, containing collections of device
-- functions
newtype Module
Module :: Ptr () -> Module

-- | Load the contents of the specified file (either a .ptx or .cubin file)
-- to create a new module, and load that module into the current context.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g366093bd269dafd0af21f1c7d18115d3
loadFile :: FilePath -> IO Module

-- | Load the contents of the given image into a new module, and load that
-- module into the current context. The image is (typically) the contents
-- of a cubin or PTX file.
--
-- Note that the ByteString will be copied into a temporary
-- staging area so that it can be passed to C.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g04ce266ce03720f479eab76136b90c0b
loadData :: ByteString -> IO Module

-- | As loadData, but read the image data from the given pointer.
-- The image is a NULL-terminated sequence of bytes.
loadDataFromPtr :: Ptr Word8 -> IO Module

-- | Load the contents of the given image into a module with online
-- compiler options, and load the module into the current context. The
-- image is (typically) the contents of a cubin or PTX file. The actual
-- attributes of the compiled kernel can be probed using
-- requires.
--
-- Note that the ByteString will be copied into a temporary
-- staging area so that it can be passed to C.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g9e8047e9dbf725f0cd7cafd18bfd4d12
loadDataEx :: ByteString -> [JITOption] -> IO JITResult

-- | As loadDataEx, but read the image data from the given pointer.
-- The image is a NULL-terminated sequence of bytes.
loadDataFromPtrEx :: Ptr Word8 -> [JITOption] -> IO JITResult

-- | Unload a module from the current context.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
unload :: Module -> IO ()

-- | Graph execution functions for the low-level driver interface
--
-- Requires CUDA-10
module Foreign.CUDA.Driver.Graph.Exec
newtype Executable
Executable :: Ptr () -> Executable
[useExecutable] :: Executable -> Ptr ()

-- | Execute a graph in the given stream. Only one instance may execute at
-- a time; to execute a graph concurrently, it must be
-- instantiated into multiple executables.
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g6b2dceb3901e71a390d2bd8b0491e471
launch :: Executable -> Stream -> IO ()

-- | Instantiate the task graph description of a program into an executable
-- graph.
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g433ae118a751c9f2087f53d7add7bc2c
instantiate :: Graph -> IO Executable

-- | Destroy an executable graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1ga32ad4944cc5d408158207c978bc43a7
destroy :: Executable -> IO ()

-- | Update the parameters for a kernel node in the given executable graph
--
-- Requires CUDA-10.1
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gd84243569e4c3d6356b9f2eea20ed48c
setKernel :: Executable -> Node -> Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> [FunParam] -> IO ()

-- | Graph construction functions for the low-level driver interface
--
-- Requires CUDA-10
module Foreign.CUDA.Driver.Graph.Build
newtype Graph
Graph :: Ptr () -> Graph
[useGraph] :: Graph -> Ptr ()
newtype Node
Node :: Ptr () -> Node
[useNode] :: Node -> Ptr ()
data NodeType
Kernel :: NodeType
Memcpy :: NodeType
Memset :: NodeType
Host :: NodeType
Subgraph :: NodeType
Empty :: NodeType
Count :: NodeType

-- | Callback function executed on the host
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1g262cd3570ff5d396db4e3dabede3c355
type HostCallback = FunPtr (Ptr () -> IO ())

-- | Create an empty task graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gd885f719186010727b75c3315f865fdf
create :: [GraphFlag] -> IO Graph

-- | Destroy a graph, as well as all of its nodes
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g718cfd9681f078693d4be2426fd689c8
destroy :: Graph -> IO ()

-- | Clone a graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g3603974654e463f2231c71d9b9d1517e
clone :: Graph -> IO Graph

-- | Remove a node from the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g00ed16434d983d8f0011683eacaf19b9
remove :: Node -> IO ()
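
-- A minimal sketch of the intended workflow, combining the construction
-- operations of this module (addKernel is described below) with
-- instantiation and launch from Foreign.CUDA.Driver.Graph.Exec. The
-- kernel handles step1 and step2 and the stream are assumed to be set up
-- elsewhere, and Fun and Stream are assumed to be re-exported by the
-- umbrella Foreign.CUDA.Driver module, as in the tutorial below.
-- Dependencies are expressed here through the [Node] argument of
-- addKernel:
--
-- import Foreign.CUDA.Driver (Fun, Stream)
-- import qualified Foreign.CUDA.Driver.Graph.Build as Build
-- import qualified Foreign.CUDA.Driver.Graph.Exec as Exec
--
-- buildAndRun :: Fun -> Fun -> Stream -> IO ()
-- buildAndRun step1 step2 stream = do
--   g   <- Build.create []                                      -- empty task graph
--   n1  <- Build.addKernel g []   step1 (1,1,1) (256,1,1) 0 []  -- root node
--   _n2 <- Build.addKernel g [n1] step2 (1,1,1) (256,1,1) 0 []  -- runs after n1
--   e   <- Exec.instantiate g                                   -- compile to executable form
--   Exec.launch e stream
--   Exec.destroy e
--   Build.destroy g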

-- | Create a child graph node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g3f27c2e56e3d568b09f00d438e61ceb1
addChild :: Graph -> Graph -> [Node] -> IO Node

-- | Create an empty node and add it to the graph.
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g8a8681dbe97dbbb236ea5ebf3abe2ada
addEmpty :: Graph -> [Node] -> IO Node

-- | Create a host execution node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g1ba15c2fe1afb8897091ecec4202b597
addHost :: Graph -> [Node] -> HostCallback -> Ptr () -> IO Node

-- | Create a kernel execution node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g886a9096293238937f2f3bc7f2d57635
addKernel :: Graph -> [Node] -> Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> [FunParam] -> IO Node

-- | Create a memcpy node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gdd521e1437c1c3ea8822f66a32ff1f94
addMemcpy :: Graph -> [Node] -> Context -> Int -> Int -> Int -> Int -> MemoryType -> Ptr a -> Int -> Int -> Int -> Int -> Int -> Int -> MemoryType -> Ptr a -> Int -> Int -> Int -> Int -> Int -> IO Node

-- | Create a memset node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gac7f59961798f14a9f94f9f6b53cc3b7
addMemset :: Storable a => Graph -> [Node] -> Context -> DevicePtr a -> a -> Int -> Int -> Int -> IO Node

-- | Add dependency edges to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g81bf1a6965f881be6ad8d21cfe0ee44f
addDependencies :: Graph -> [(Node, Node)] -> IO ()

-- | Remove dependency edges from the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g8ab696a6b3ccd99db47feba7e97fb579
removeDependencies :: Graph -> [(Node, Node)] -> IO ()

-- | Return the type of a node
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gdb1776d97aa1c9d5144774b29e4b8c3e
getType :: Node -> IO NodeType

-- | Retrieve the embedded graph of a child sub-graph node
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gbe9fc9267316b3778ef0db507917b4fd
getChildGraph :: Node -> IO Graph

-- | Return a graph's dependency edges
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g2b7bd71b0b2b8521f141996e0975a0d7
getEdges :: Graph -> IO [(Node, Node)]

-- | Return a graph's nodes
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gfa35a8e2d2fc32f48dbd67ba27cf27e5
getNodes :: Graph -> IO [Node]

-- | Return the root nodes of a graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gf8517646bd8b39ab6359f8e7f0edffbd
getRootNodes :: Graph -> IO [Node]

-- | Return the dependencies of a node
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g048f4c0babcbba64a933fc277cd45083
getDependencies :: Node -> IO [Node]

-- | Return a node's dependent nodes
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g4b73d9e3b386a9c0b094a452b8431f59
getDependents :: Node -> IO [Node]

-- | Find a cloned version of a node
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gf21f6c968e346f028737c1118bfd41c2
findInClone :: Node -> Graph -> IO Node

-- | This module defines an interface to the CUDA driver API. The Driver
-- API is a lower-level interface to CUDA devices than that provided by
-- the Runtime API. Using the Driver API, the programmer must deal
-- explicitly with operations such as initialisation, context management,
-- and loading (kernel) modules. Although more difficult to use
-- initially, the Driver API provides more control over how CUDA is used.
-- Furthermore, since it does not require compiling and linking the
-- program with nvcc, the Driver API provides better
-- inter-language compatibility.
--
-- The following is a short tutorial on using the Driver API. The steps
-- can be copied into a file, or run directly in ghci, in which
-- case ghci should be launched with the option
-- -fno-ghci-sandbox. This is because CUDA maintains CPU-local
-- state, so operations should always be run from a bound thread.
--
-- First, we must import the library and initialise the CUDA environment:
--
-- >>> import Foreign.CUDA.Driver
--
-- >>> initialise []
--
-- Next, we must select a GPU that we will execute operations on. Each
-- GPU is assigned a unique identifier (beginning at zero). We can get a
-- handle to a compute device at a given ordinal using the device
-- operation. Given a device handle, we can query the properties of that
-- device using props. The number of available CUDA-capable
-- devices is given via count. For example:
--
-- >>> count
-- 1
--
-- >>> dev0 <- device 0
--
-- >>> props dev0
-- DeviceProperties {deviceName = "GeForce GT 650M", computeCapability = 3.0, ...}
--
--
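-- The fields of the DeviceProperties record can also be inspected
-- directly; for example (a sketch: the values shown are illustrative,
-- and the totalGlobalMem field name is an assumption about the record):
--
-- >>> computeCapability <$> props dev0
-- 3.0
--
-- >>> totalGlobalMem <$> props dev0
-- 1073741824
--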
-- This package also includes the executable 'nvidia-device-query', which
-- when executed displays the key properties of all available devices.
-- See Foreign.CUDA.Driver.Device for additional operations to
-- query the capabilities or status of a device.
--
-- Once you have chosen a device to use, the next step is to create a
-- CUDA context. A context is associated with a particular device, and
-- all operations, such as memory allocation and kernel execution, take
-- place with respect to that context. For example, to create a
-- new execution context on CUDA device 0:
--
-- >>> ctx <- create dev0 []
--
-- The second argument is a set of ContextFlags which control how
-- the context behaves in various situations, for example, whether or not
-- the CPU should actively spin when waiting for results from the GPU
-- (SchedSpin), or yield control to other threads instead
-- (SchedYield).
--
-- The newly created context is now the active context, and all
-- subsequent operations take place within that context. More than one
-- context can be created per device, but resources, such as memory
-- allocated on the GPU, are unique to each context. The module
-- Foreign.CUDA.Driver.Context contains operations for managing
-- multiple contexts. Some devices allow data to be shared between
-- contexts without copying; see Foreign.CUDA.Driver.Context.Peer
-- for more information.
--
-- Once the context is no longer needed, it should be destroyed in
-- order to free up any resources that were allocated to it.
--
-- >>> destroy ctx
--
-- Each device also has a unique context which is used by the Runtime
-- API. This context can be accessed with the module
-- Foreign.CUDA.Driver.Context.Primary.
--
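-- Since every create should be paired with a matching destroy, it can
-- be convenient to bracket the lifetime of a context. A minimal sketch
-- using Control.Exception from base (the withContext helper is
-- hypothetical, not part of this package):
--
-- import Foreign.CUDA.Driver
-- import Control.Exception (bracket)
--
-- withContext :: Device -> [ContextFlag] -> (Context -> IO a) -> IO a
-- withContext dev flags = bracket (create dev flags) destroy
--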
-- Suppose we wish to execute the following simple function, which
-- element-wise adds the values of two arrays, on the GPU:
--
-- >>> vecAdd xs ys = zipWith (+) xs ys
--
-- The following CUDA kernel can be used to implement this on the GPU:
--
-- extern "C" __global__ void vecAdd(float *xs, float *ys, float *zs, int N)
-- {
-- int ix = blockIdx.x * blockDim.x + threadIdx.x;
--
-- if ( ix < N ) {
-- zs[ix] = xs[ix] + ys[ix];
-- }
-- }
--
--
-- Here, the __global__ keyword marks the function as a
-- kernel that should be computed on the GPU in data-parallel fashion. When we
-- execute this function on the GPU, (at least) N threads will
-- execute N individual instances of the kernel function
-- vecAdd. Each thread will operate on a single element of each
-- input array to create a single value in the result. See the CUDA
-- programming guide for more details.
--
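-- In general, for n elements and a fixed number of threads per block,
-- the number of thread blocks required is a rounded-up division; for
-- example, for the 1024-element arrays used below:
--
-- >>> let numBlocks n threadsPerBlock = (n + threadsPerBlock - 1) `div` threadsPerBlock
-- >>> numBlocks 1024 256
-- 4
--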
-- We can save this to a file vector_add.cu, and compile it
-- using nvcc into a form that we can then load onto the GPU and
-- execute:
--
-- $ nvcc --ptx vector_add.cu
--
-- The module Foreign.CUDA.Driver.Module contains functions for
-- loading the resulting .ptx (or .cubin) files into the running
-- program.
--
-- >>> mdl <- loadFile "vector_add.ptx"
--
-- Once finished with the module, it is also a good idea to unload
-- it.
--
-- Modules may export kernel functions, global variables, and texture
-- references. Before we can execute our function, we need to look it up
-- in the module by name:
--
-- >>> vecAdd <- getFun mdl "vecAdd"
--
-- Given this reference to our kernel function, we are almost ready to
-- execute it on the device using launchKernel, but first we must
-- create some data for the function to operate on:
--
-- >>> let xs = [1..1024] :: [Float]
--
-- >>> let ys = [2,4..2048] :: [Float]
--
-- In CUDA, as in C, all memory management is explicit: arrays on the
-- device must be explicitly allocated and freed, and, as mentioned
-- previously, data transfer between host and device is also explicit.
-- However, we do provide convenience functions for combined allocation
-- and marshalling, as well as bracketed operations:
--
-- >>> xs_dev <- newListArray xs
--
-- >>> ys_dev <- newListArray ys
--
-- >>> zs_dev <- mallocArray 1024 :: IO (DevicePtr Float)
--
-- After executing the kernel (shown below), we transfer the result
-- back to the host, and free the memory that was allocated on the GPU:
--
-- >>> zs <- peekListArray 1024 zs_dev
--
-- >>> free xs_dev
--
-- >>> free ys_dev
--
-- >>> free zs_dev
--
-- To launch the kernel, we supply the function handle, the grid
-- dimensions (number of thread blocks), the thread block dimensions
-- (number of threads per block), the amount of dynamic shared memory, an
-- optional execution stream, and the function arguments. Here, a grid of
-- 4 blocks of 256 threads each covers all 1024 elements of the input:
--
-- >>> launchKernel vecAdd (4,1,1) (256,1,1) 0 Nothing [VArg xs_dev, VArg ys_dev, VArg zs_dev, IArg 1024]
--
-- Note that kernel execution is asynchronous, so we should also wait for
-- the operation to complete before attempting to read the results back:
--
-- >>> sync
--
-- And that's it!
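--
-- For reference, the whole tutorial as a standalone program might look
-- as follows. This is a sketch only: error handling is omitted, and the
-- kernel is assumed to have been compiled to vector_add.ptx as above.
--
-- import Foreign.CUDA.Driver
--
-- main :: IO ()
-- main = do
--   initialise []                                 -- set up the CUDA environment
--   dev    <- device 0                            -- first available device
--   ctx    <- create dev []                       -- new context on that device
--   mdl    <- loadFile "vector_add.ptx"
--   vecAdd <- getFun mdl "vecAdd"
--   let xs = [1..1024]   :: [Float]
--       ys = [2,4..2048] :: [Float]
--   xs_dev <- newListArray xs                     -- allocate and upload the inputs
--   ys_dev <- newListArray ys
--   zs_dev <- mallocArray 1024 :: IO (DevicePtr Float)
--   launchKernel vecAdd (4,1,1) (256,1,1) 0 Nothing
--     [VArg xs_dev, VArg ys_dev, VArg zs_dev, IArg 1024]
--   sync                                          -- kernel launch is asynchronous
--   zs <- peekListArray 1024 zs_dev               -- copy the result back
--   print (take 10 zs)
--   mapM_ free [xs_dev, ys_dev, zs_dev]           -- release device memory
--   unload mdl
--   destroy ctx
--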
-- since 0.9.0.0
getAttribute :: PeerAttribute -> Device -> Device -> IO Int

-- | Device shared memory configuration preference
data SharedMem
DefaultBankSize :: SharedMem
FourByteBankSize :: SharedMem
EightByteBankSize :: SharedMem

-- | Device cache configuration preference
data Cache
PreferNone :: Cache
PreferShared :: Cache
PreferL1 :: Cache
PreferEqual :: Cache

-- | Device limits flags
data Limit
StackSize :: Limit
PrintfFifoSize :: Limit
MallocHeapSize :: Limit
DevRuntimeSyncDepth :: Limit
DevRuntimePendingLaunchCount :: Limit
MaxL2FetchGranularity :: Limit
Max :: Limit

-- | Return the flags that were used to create the current context.
--
-- Requires CUDA-7.0
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gf81eef983c1e3b2ef4f166d7a930c86d
getFlags :: IO [ContextFlag]

-- | Query compute 2.0 call stack limits.
--
-- Requires CUDA-3.1.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g9f2d47d1745752aa16da7ed0d111b6a8
getLimit :: Limit -> IO Int

-- | Specify the size of the call stack, for compute 2.0 devices.
--
-- Requires CUDA-3.1.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g0651954dfb9788173e60a9af7201e65a
setLimit :: Limit -> Int -> IO ()

-- | On devices where the L1 cache and shared memory use the same hardware
-- resources, this function returns the preferred cache configuration for
-- the current context.
--
-- Requires CUDA-3.2.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g40b6b141698f76744dea6e39b9a25360
getCache :: IO Cache

-- | On devices where the L1 cache and shared memory use the same hardware
-- resources, this sets the preferred cache configuration for the current
-- context. This is only a preference.
--
-- Any function configuration set via setCacheConfigFun will be
-- preferred over this context-wide setting.
--
-- Requires CUDA-3.2.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g54699acf7e2ef27279d013ca2095f4a3
setCache :: Cache -> IO ()

-- | Return the current size of the shared memory banks in the current
-- context. On devices with configurable shared memory banks,
-- setSharedMem can be used to change the configuration, so that
-- subsequent kernel launches will by default use the new bank size. On
-- devices without configurable shared memory, this function returns the
-- fixed bank size of the hardware.
--
-- Requires CUDA-4.2.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
getSharedMem :: IO SharedMem

-- | On devices with configurable shared memory banks, this function will
-- set the context's shared memory bank size that will be used by default
-- for subsequent kernel launches.
--
-- Changing the shared memory configuration between launches may insert a
-- device synchronisation.
--
-- Shared memory bank size does not affect shared memory usage or kernel
-- occupancy, but may have major effects on performance. Larger bank
-- sizes allow for greater potential bandwidth to shared memory, but
-- change the kinds of accesses which result in bank conflicts.
--
-- Requires CUDA-4.2.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
setSharedMem :: SharedMem -> IO ()

-- | Returns the numerical values that correspond to the greatest and least
-- priority execution streams in the current context respectively. Stream
-- priorities follow the convention that lower numerical values
-- correspond to higher priorities. The range of meaningful stream
-- priorities is given by the inclusive range
-- [greatestPriority,leastPriority].
--
-- Requires CUDA-5.5.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g137920ab61a71be6ce67605b9f294091
getStreamPriorityRange :: IO (StreamPriority, StreamPriority)

-- | Occupancy calculations for CUDA kernels
--
-- http://developer.download.nvidia.com/compute/cuda/3_0/sdk/docs/CUDA_Occupancy_calculator.xls
--
-- Determining Registers Per Thread and Shared Memory Per Block
--
-- To determine the number of registers used per thread in your kernel,
-- simply compile the kernel code using the option
--
--   --ptxas-options=-v
--
-- to nvcc. This will output information about register, local memory,
-- shared memory, and constant memory usage for each kernel in the
-- .cu file. Alternatively, you can compile with the
-- -cubin option to nvcc. This will generate a .cubin
-- file, which you can open in a text editor. Look for the code
-- section with your kernel's name. Within the curly braces ({ ... })
-- for that code block, you will see a line with reg = X, where
-- X is the number of registers used by your kernel. You can also
-- see the amount of shared memory used as smem = Y. However, if
-- your kernel declares any external shared memory that is allocated
-- dynamically, you will need to add the number in the .cubin
-- file to the amount you dynamically allocate at run time to get the
-- correct shared memory usage.
--
-- Notes About Occupancy
--
-- Higher occupancy does not necessarily mean higher performance. If a
-- kernel is not bandwidth bound, then increasing occupancy will not
-- necessarily increase performance. If a kernel invocation is already
-- running at least one thread block per multiprocessor in the GPU, and
-- it is bottlenecked by computation and not by global memory accesses,
-- then increasing occupancy may have no effect. In fact, making changes
-- just to increase occupancy can have other effects, such as additional
-- instructions, spills to local memory (which is off chip), divergent
-- branches, etc. As with any optimisation, you should experiment to see
-- how changes affect the *wall clock time* of the kernel execution. For
-- bandwidth bound applications, on the other hand, increasing occupancy
-- can help better hide the latency of memory accesses, and therefore
-- improve performance.
module Foreign.CUDA.Analysis.Occupancy
data Occupancy
Occupancy :: !Int -> !Int -> !Int -> !Double -> Occupancy

-- | Active threads per multiprocessor
[activeThreads] :: Occupancy -> !Int

-- | Active thread blocks per multiprocessor
[activeThreadBlocks] :: Occupancy -> !Int

-- | Active warps per multiprocessor
[activeWarps] :: Occupancy -> !Int

-- | Occupancy of each multiprocessor (percent)
[occupancy100] :: Occupancy -> !Double

-- | Calculate occupancy data for a given GPU and kernel resource usage
occupancy :: DeviceProperties -> Int -> Int -> Int -> Occupancy

-- | Optimise multiprocessor occupancy as a function of thread block size
-- and resource usage. This returns the smallest satisfying block size in
-- increments of a single warp.
optimalBlockSize :: DeviceProperties -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)

-- | As optimalBlockSize, but with a generator that produces the
-- specific thread block sizes that should be tested. The generated list
-- can produce values in any order, but the last satisfying block size
-- will be returned. Hence, values should be monotonically decreasing to
-- return the smallest block size yielding maximum occupancy, and
-- vice-versa.
optimalBlockSizeOf :: DeviceProperties -> [Int] -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)

-- | Determine the maximum number of CTAs that can be run simultaneously
-- for a given kernel / device combination.
maxResidentBlocks :: DeviceProperties -> Int -> Int -> Int -> Int

-- | Increments in powers-of-two, over the range of supported thread block
-- sizes for the given device.
incPow2 :: DeviceProperties -> [Int]

-- | Increments in the warp size of the device, over the range of supported
-- thread block sizes.
incWarp :: DeviceProperties -> [Int] -- | Decrements in powers-of-two, over the range of supported thread block -- sizes for the given device. decPow2 :: DeviceProperties -> [Int] -- | Decrements in the warp size of the device, over the range of supported -- thread block sizes. decWarp :: DeviceProperties -> [Int] instance GHC.Show.Show Foreign.CUDA.Analysis.Occupancy.Occupancy instance GHC.Classes.Ord Foreign.CUDA.Analysis.Occupancy.Occupancy instance GHC.Classes.Eq Foreign.CUDA.Analysis.Occupancy.Occupancy -- | Meta-module exporting CUDA analysis routines module Foreign.CUDA.Analysis
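
-- As an illustration of the occupancy calculator, the following sketch
-- chooses a thread block size for a kernel that uses 16 registers per
-- thread and no dynamically allocated shared memory. The resource
-- figures are illustrative only, and we assume the two function
-- arguments of optimalBlockSize map a candidate block size to registers
-- per thread and to shared memory per block, respectively:
--
-- import Foreign.CUDA.Analysis
--
-- pickBlockSize :: DeviceProperties -> (Int, Occupancy)
-- pickBlockSize dev = optimalBlockSize dev (const 16) (const 0)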