-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | FFI binding to the CUDA interface for programming NVIDIA GPUs -- -- The CUDA library provides a direct, general purpose C-like SPMD -- programming model for NVIDIA graphics cards (G8x series onwards). This -- is a collection of bindings to allow you to call and control, although -- not write, such functions from Haskell-land. You will need to install -- the CUDA driver and developer toolkit. -- -- http://developer.nvidia.com/cuda-downloads -- -- The configure script will look for your CUDA installation in the -- standard places, and if the nvcc compiler is found in your PATH, -- relative to that. -- -- This release is for version 6.5 of the CUDA toolkit. -- -- -- -- Due to a bug in nvcc, this package is not compatible with c2hs-0.18.* -- or c2hs-0.19.*. See tmcdonell/cuda#18. @package cuda @version 0.6.7.0 -- | Error handling module Foreign.CUDA.Driver.Error data Status Success :: Status InvalidValue :: Status OutOfMemory :: Status NotInitialized :: Status Deinitialized :: Status ProfilerDisabled :: Status ProfilerNotInitialized :: Status ProfilerAlreadyStarted :: Status ProfilerAlreadyStopped :: Status NoDevice :: Status InvalidDevice :: Status InvalidImage :: Status InvalidContext :: Status ContextAlreadyCurrent :: Status MapFailed :: Status UnmapFailed :: Status ArrayIsMapped :: Status AlreadyMapped :: Status NoBinaryForGPU :: Status AlreadyAcquired :: Status NotMapped :: Status NotMappedAsArray :: Status NotMappedAsPointer :: Status EccUncorrectable :: Status UnsupportedLimit :: Status ContextAlreadyInUse :: Status PeerAccessUnsupported :: Status InvalidPTX :: Status InvalidGraphicsContext :: Status InvalidSource :: Status FileNotFound :: Status SharedObjectSymbolNotFound :: Status SharedObjectInitFailed :: Status OperatingSystem :: Status InvalidHandle :: Status NotFound :: Status NotReady :: Status IllegalAddress :: Status LaunchOutOfResources :: Status 
LaunchTimeout :: Status LaunchIncompatibleTexturing :: Status PeerAccessAlreadyEnabled :: Status PeerAccessNotEnabled :: Status PrimaryContextActive :: Status ContextIsDestroyed :: Status Assert :: Status TooManyPeers :: Status HostMemoryAlreadyRegistered :: Status HostMemoryNotRegistered :: Status HardwareStackError :: Status IllegalInstruction :: Status MisalignedAddress :: Status InvalidAddressSpace :: Status InvalidPC :: Status LaunchFailed :: Status NotPermitted :: Status NotSupported :: Status Unknown :: Status -- | Return a descriptive error string associated with a particular error -- code describe :: Status -> String cuGetErrorString :: (Status) -> IO (Status, String) data CUDAException ExitCode :: Status -> CUDAException UserError :: String -> CUDAException -- | Raise a CUDAException in the IO Monad cudaError :: String -> IO a -- | A specially formatted error message requireSDK :: Double -> String -> IO a -- | Return the results of a function on successful execution, otherwise -- throw an exception with an error string associated with the return -- code resultIfOk :: (Status, a) -> IO a -- | Throw an exception with an error string associated with an -- unsuccessful return code, otherwise return unit. 
nothingIfOk :: Status -> IO () cuGetErrorString'_ :: CInt -> ((Ptr (Ptr CChar)) -> (IO CInt)) instance GHC.Show.Show Foreign.CUDA.Driver.Error.Status instance GHC.Classes.Eq Foreign.CUDA.Driver.Error.Status instance GHC.Enum.Enum Foreign.CUDA.Driver.Error.Status instance GHC.Exception.Exception Foreign.CUDA.Driver.Error.CUDAException instance GHC.Show.Show Foreign.CUDA.Driver.Error.CUDAException -- | Utility functions module Foreign.CUDA.Driver.Utils -- | Return the version number of the installed CUDA driver driverVersion :: IO Int -- | Error handling functions module Foreign.CUDA.Runtime.Error -- | Return codes from API functions data Status Success :: Status MissingConfiguration :: Status MemoryAllocation :: Status InitializationError :: Status LaunchFailure :: Status PriorLaunchFailure :: Status LaunchTimeout :: Status LaunchOutOfResources :: Status InvalidDeviceFunction :: Status InvalidConfiguration :: Status InvalidDevice :: Status InvalidValue :: Status InvalidPitchValue :: Status InvalidSymbol :: Status MapBufferObjectFailed :: Status UnmapBufferObjectFailed :: Status InvalidHostPointer :: Status InvalidDevicePointer :: Status InvalidTexture :: Status InvalidTextureBinding :: Status InvalidChannelDescriptor :: Status InvalidMemcpyDirection :: Status AddressOfConstant :: Status TextureFetchFailed :: Status TextureNotBound :: Status SynchronizationError :: Status InvalidFilterSetting :: Status InvalidNormSetting :: Status MixedDeviceExecution :: Status CudartUnloading :: Status Unknown :: Status NotYetImplemented :: Status MemoryValueTooLarge :: Status InvalidResourceHandle :: Status NotReady :: Status InsufficientDriver :: Status SetOnActiveProcess :: Status InvalidSurface :: Status NoDevice :: Status ECCUncorrectable :: Status SharedObjectSymbolNotFound :: Status SharedObjectInitFailed :: Status UnsupportedLimit :: Status DuplicateVariableName :: Status DuplicateTextureName :: Status DuplicateSurfaceName :: Status DevicesUnavailable :: Status 
InvalidKernelImage :: Status NoKernelImageForDevice :: Status IncompatibleDriverContext :: Status PeerAccessAlreadyEnabled :: Status PeerAccessNotEnabled :: Status DeviceAlreadyInUse :: Status ProfilerDisabled :: Status ProfilerNotInitialized :: Status ProfilerAlreadyStarted :: Status ProfilerAlreadyStopped :: Status Assert :: Status TooManyPeers :: Status HostMemoryAlreadyRegistered :: Status HostMemoryNotRegistered :: Status OperatingSystem :: Status PeerAccessUnsupported :: Status LaunchMaxDepthExceeded :: Status LaunchFileScopedTex :: Status LaunchFileScopedSurf :: Status SyncDepthExceeded :: Status LaunchPendingCountExceeded :: Status NotPermitted :: Status NotSupported :: Status HardwareStackError :: Status IllegalInstruction :: Status MisalignedAddress :: Status InvalidAddressSpace :: Status InvalidPc :: Status IllegalAddress :: Status InvalidPtx :: Status InvalidGraphicsContext :: Status StartupFailure :: Status ApiFailureBase :: Status data CUDAException ExitCode :: Status -> CUDAException UserError :: String -> CUDAException -- | Raise a CUDAException in the IO Monad cudaError :: String -> IO a -- | Return the descriptive string associated with a particular error code describe :: (Status) -> (String) -- | A specially formatted error message requireSDK :: Double -> String -> IO a -- | Return the results of a function on successful execution, otherwise -- throw an exception with an error string associated with the return -- code resultIfOk :: (Status, a) -> IO a -- | Throw an exception with an error string associated with an -- unsuccessful return code, otherwise return unit. nothingIfOk :: Status -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Error.Status instance GHC.Classes.Eq Foreign.CUDA.Runtime.Error.Status instance GHC.Enum.Enum Foreign.CUDA.Runtime.Error.Status instance GHC.Exception.Exception Foreign.CUDA.Runtime.Error.CUDAException instance GHC.Show.Show Foreign.CUDA.Runtime.Error.CUDAException -- | Utility functions module Foreign.CUDA.Runtime.Utils -- | Return the version number 
of the installed CUDA runtime runtimeVersion :: IO Int -- | Return the version number of the installed CUDA driver driverVersion :: IO Int -- | Common device functions module Foreign.CUDA.Analysis.Device data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode Default :: ComputeMode Exclusive :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !Bool -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Max number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Max number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Max size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, Int) -- | Max size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: 
DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Max pitch in bytes allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] :: DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines [asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete [integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool data DeviceResources DeviceResources :: !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Allocation -> DeviceResources -- | Warp size [threadsPerWarp] :: DeviceResources -> !Int -- | Maximum number of in-flight threads on a multiprocessor [threadsPerMP] :: DeviceResources -> !Int -- | Maximum number of thread blocks resident on a 
multiprocessor [threadBlocksPerMP] :: DeviceResources -> !Int -- | Maximum number of in-flight warps per multiprocessor [warpsPerMP] :: DeviceResources -> !Int -- | Number of SIMD arithmetic units per multiprocessor [coresPerMP] :: DeviceResources -> !Int -- | Total amount of shared memory per multiprocessor (bytes) [sharedMemPerMP] :: DeviceResources -> !Int -- | Shared memory allocation unit size (bytes) [sharedMemAllocUnit] :: DeviceResources -> !Int -- | Total number of registers in a multiprocessor [regFileSize] :: DeviceResources -> !Int -- | Register allocation unit size [regAllocUnit] :: DeviceResources -> !Int -- | Register allocation granularity for warps [regAllocWarp] :: DeviceResources -> !Int -- | Maximum number of registers per thread [regPerThread] :: DeviceResources -> !Int -- | How multiprocessor resources are divided [allocation] :: DeviceResources -> !Allocation data Allocation Warp :: Allocation Block :: Allocation data PCI PCI :: !Int -> !Int -> !Int -> PCI -- | PCI bus ID of the device [busID] :: PCI -> !Int -- | PCI device ID [deviceID] :: PCI -> !Int -- | PCI domain ID [domainID] :: PCI -> !Int -- | Extract some additional hardware resource limitations for a given -- device. 
deviceResources :: DeviceProperties -> DeviceResources instance GHC.Show.Show Foreign.CUDA.Analysis.Device.DeviceProperties instance GHC.Show.Show Foreign.CUDA.Analysis.Device.PCI instance GHC.Classes.Eq Foreign.CUDA.Analysis.Device.Compute instance GHC.Show.Show Foreign.CUDA.Analysis.Device.ComputeMode instance GHC.Classes.Eq Foreign.CUDA.Analysis.Device.ComputeMode instance GHC.Enum.Enum Foreign.CUDA.Analysis.Device.ComputeMode instance GHC.Show.Show Foreign.CUDA.Analysis.Device.Compute instance GHC.Classes.Ord Foreign.CUDA.Analysis.Device.Compute -- | Occupancy calculations for CUDA kernels -- -- -- http://developer.download.nvidia.com/compute/cuda/3_0/sdk/docs/CUDA_Occupancy_calculator.xls -- -- Determining Registers Per Thread and Shared Memory Per Block -- -- To determine the number of registers used per thread in your kernel, -- simply compile the kernel code using the option -- --
--   --ptxas-options=-v
--   
-- -- to nvcc. This will output information about register, local memory, -- shared memory, and constant memory usage for each kernel in the -- .cu file. Alternatively, you can compile with the -- -cubin option to nvcc. This will generate a .cubin -- file, which you can open in a text editor. Look for the code -- section with your kernel's name. Within the curly braces ({ ... -- }) for that code block, you will see a line with reg = -- X, where x is the number of registers used by your -- kernel. You can also see the amount of shared memory used as smem -- = Y. However, if your kernel declares any external shared memory -- that is allocated dynamically, you will need to add the number in the -- .cubin file to the amount you dynamically allocate at run -- time to get the correct shared memory usage. -- -- Notes About Occupancy -- -- Higher occupancy does not necessarily mean higher performance. If a -- kernel is not bandwidth bound, then increasing occupancy will not -- necessarily increase performance. If a kernel invocation is already -- running at least one thread block per multiprocessor in the GPU, and -- it is bottlenecked by computation and not by global memory accesses, -- then increasing occupancy may have no effect. In fact, making changes -- just to increase occupancy can have other effects, such as additional -- instructions, spills to local memory (which is off chip), divergent -- branches, etc. As with any optimization, you should experiment to see -- how changes affect the *wall clock time* of the kernel execution. For -- bandwidth bound applications, on the other hand, increasing occupancy -- can help better hide the latency of memory accesses, and therefore -- improve performance. 
module Foreign.CUDA.Analysis.Occupancy data Occupancy Occupancy :: !Int -> !Int -> !Int -> !Double -> Occupancy -- | Active threads per multiprocessor [activeThreads] :: Occupancy -> !Int -- | Active thread blocks per multiprocessor [activeThreadBlocks] :: Occupancy -> !Int -- | Active warps per multiprocessor [activeWarps] :: Occupancy -> !Int -- | Occupancy of each multiprocessor (percent) [occupancy100] :: Occupancy -> !Double -- | Calculate occupancy data for a given GPU and kernel resource usage occupancy :: DeviceProperties -> Int -> Int -> Int -> Occupancy -- | Optimise multiprocessor occupancy as a function of thread block size -- and resource usage. This returns the smallest satisfying block size in -- increments of a single warp. optimalBlockSize :: DeviceProperties -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy) -- | As optimalBlockSize, but with a generator that produces the -- specific thread block sizes that should be tested. The generated list -- can produce values in any order, but the last satisfying block size -- will be returned. Hence, values should be monotonically decreasing to -- return the smallest block size yielding maximum occupancy, and -- vice-versa. optimalBlockSizeBy :: DeviceProperties -> (DeviceProperties -> [Int]) -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy) -- | Determine the maximum number of CTAs that can be run simultaneously -- for a given kernel / device combination. maxResidentBlocks :: DeviceProperties -> Int -> Int -> Int -> Int -- | Increments in powers-of-two, over the range of supported thread block -- sizes for the given device. incPow2 :: DeviceProperties -> [Int] -- | Increments in the warp size of the device, over the range of supported -- thread block sizes. incWarp :: DeviceProperties -> [Int] -- | Decrements in powers-of-two, over the range of supported thread block -- sizes for the given device. 
decPow2 :: DeviceProperties -> [Int] -- | Decrements in the warp size of the device, over the range of supported -- thread block sizes. decWarp :: DeviceProperties -> [Int] instance GHC.Show.Show Foreign.CUDA.Analysis.Occupancy.Occupancy instance GHC.Classes.Ord Foreign.CUDA.Analysis.Occupancy.Occupancy instance GHC.Classes.Eq Foreign.CUDA.Analysis.Occupancy.Occupancy -- | Device management routines module Foreign.CUDA.Runtime.Device -- | A device identifier type Device = Int -- | Device execution flags data DeviceFlag ScheduleAuto :: DeviceFlag ScheduleSpin :: DeviceFlag ScheduleYield :: DeviceFlag BlockingSync :: DeviceFlag MapHost :: DeviceFlag LMemResizeToMax :: DeviceFlag -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !Bool -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Max number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Max number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Max size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, 
Int) -- | Max size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Max pitch in bytes allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] :: DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines [asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete [integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode 
Default :: ComputeMode Exclusive :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | Select the compute device which best matches the given criteria choose :: DeviceProperties -> IO Device -- | Returns which device is currently being used get :: IO Device -- | Returns the number of devices available for execution, with compute -- capability >= 1.0 count :: IO Int -- | Return information about the selected compute device props :: Device -> IO DeviceProperties -- | Set device to be used for GPU execution set :: Device -> IO () -- | Set flags to be used for device executions setFlags :: [DeviceFlag] -> IO () -- | Set list of devices for CUDA execution in priority order setOrder :: [Device] -> IO () -- | Explicitly destroys and cleans up all runtime resources associated -- with the current device in the current process. Any subsequent API -- call will reinitialise the device. -- -- Note that this function will reset the device immediately. It is the -- caller’s responsibility to ensure that the device is not being -- accessed by any other host threads from the process when this function -- is called. reset :: IO () -- | Block until the device has completed all preceding requested tasks. -- Returns an error if one of the tasks fails. sync :: IO () -- | Possible option values for direct peer memory access data PeerFlag -- | Queries if the first device can directly access the memory of the -- second. If direct access is possible, it can then be enabled with -- add. Requires cuda-4.0. accessible :: Device -> Device -> IO Bool -- | If the devices of both the current and supplied contexts support -- unified addressing, then enable allocations in the supplied context to -- be accessible by the current context. Requires cuda-4.0. add :: Device -> [PeerFlag] -> IO () -- | Disable direct memory access from the current context to the supplied -- context. Requires cuda-4.0. 
remove :: Device -> IO () -- | Device limit flags data Limit Stacksize :: Limit Printffifosize :: Limit Mallocheapsize :: Limit Devruntimesyncdepth :: Limit Devruntimependinglaunchcount :: Limit -- | Query compute 2.0 call stack limits. Requires cuda-3.1. getLimit :: Limit -> IO Int -- | Set compute 2.0 call stack limits. Requires cuda-3.1. setLimit :: Limit -> Int -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Device.Limit instance GHC.Classes.Eq Foreign.CUDA.Runtime.Device.Limit instance GHC.Show.Show Foreign.CUDA.Runtime.Device.DeviceFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Device.DeviceFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.DeviceFlag instance Foreign.Storable.Storable Foreign.CUDA.Analysis.Device.DeviceProperties instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.PeerFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.Limit -- | Device management for low-level driver interface module Foreign.CUDA.Driver.Device newtype Device Device :: (CInt) -> Device [useDevice] :: Device -> (CInt) -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !Bool -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp 
size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Max number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Max number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Max size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, Int) -- | Max size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Max pitch in bytes allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] :: DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines [asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete 
[integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool -- | Device attributes data DeviceAttribute MaxThreadsPerBlock :: DeviceAttribute MaxBlockDimX :: DeviceAttribute MaxBlockDimY :: DeviceAttribute MaxBlockDimZ :: DeviceAttribute MaxGridDimX :: DeviceAttribute MaxGridDimY :: DeviceAttribute MaxGridDimZ :: DeviceAttribute MaxSharedMemoryPerBlock :: DeviceAttribute SharedMemoryPerBlock :: DeviceAttribute TotalConstantMemory :: DeviceAttribute WarpSize :: DeviceAttribute MaxPitch :: DeviceAttribute MaxRegistersPerBlock :: DeviceAttribute RegistersPerBlock :: DeviceAttribute ClockRate :: DeviceAttribute TextureAlignment :: DeviceAttribute GpuOverlap :: DeviceAttribute MultiprocessorCount :: DeviceAttribute KernelExecTimeout :: DeviceAttribute Integrated :: DeviceAttribute CanMapHostMemory :: DeviceAttribute ComputeMode :: DeviceAttribute MaximumTexture1dWidth :: DeviceAttribute MaximumTexture2dWidth :: DeviceAttribute MaximumTexture2dHeight :: DeviceAttribute MaximumTexture3dWidth :: DeviceAttribute MaximumTexture3dHeight :: DeviceAttribute MaximumTexture3dDepth :: DeviceAttribute MaximumTexture2dLayeredWidth :: DeviceAttribute MaximumTexture2dArrayWidth :: DeviceAttribute MaximumTexture2dLayeredHeight :: DeviceAttribute MaximumTexture2dArrayHeight :: DeviceAttribute MaximumTexture2dLayeredLayers :: DeviceAttribute MaximumTexture2dArrayNumslices :: DeviceAttribute SurfaceAlignment :: DeviceAttribute ConcurrentKernels :: DeviceAttribute EccEnabled :: DeviceAttribute PciBusId :: DeviceAttribute PciDeviceId :: DeviceAttribute TccDriver :: DeviceAttribute MemoryClockRate :: DeviceAttribute GlobalMemoryBusWidth :: DeviceAttribute L2CacheSize :: DeviceAttribute MaxThreadsPerMultiprocessor :: DeviceAttribute AsyncEngineCount :: DeviceAttribute UnifiedAddressing :: DeviceAttribute 
MaximumTexture1dLayeredWidth :: DeviceAttribute MaximumTexture1dLayeredLayers :: DeviceAttribute CanTex2dGather :: DeviceAttribute MaximumTexture2dGatherWidth :: DeviceAttribute MaximumTexture2dGatherHeight :: DeviceAttribute MaximumTexture3dWidthAlternate :: DeviceAttribute MaximumTexture3dHeightAlternate :: DeviceAttribute MaximumTexture3dDepthAlternate :: DeviceAttribute PciDomainId :: DeviceAttribute TexturePitchAlignment :: DeviceAttribute MaximumTexturecubemapWidth :: DeviceAttribute MaximumTexturecubemapLayeredWidth :: DeviceAttribute MaximumTexturecubemapLayeredLayers :: DeviceAttribute MaximumSurface1dWidth :: DeviceAttribute MaximumSurface2dWidth :: DeviceAttribute MaximumSurface2dHeight :: DeviceAttribute MaximumSurface3dWidth :: DeviceAttribute MaximumSurface3dHeight :: DeviceAttribute MaximumSurface3dDepth :: DeviceAttribute MaximumSurface1dLayeredWidth :: DeviceAttribute MaximumSurface1dLayeredLayers :: DeviceAttribute MaximumSurface2dLayeredWidth :: DeviceAttribute MaximumSurface2dLayeredHeight :: DeviceAttribute MaximumSurface2dLayeredLayers :: DeviceAttribute MaximumSurfacecubemapWidth :: DeviceAttribute MaximumSurfacecubemapLayeredWidth :: DeviceAttribute MaximumSurfacecubemapLayeredLayers :: DeviceAttribute MaximumTexture1dLinearWidth :: DeviceAttribute MaximumTexture2dLinearWidth :: DeviceAttribute MaximumTexture2dLinearHeight :: DeviceAttribute MaximumTexture2dLinearPitch :: DeviceAttribute MaximumTexture2dMipmappedWidth :: DeviceAttribute MaximumTexture2dMipmappedHeight :: DeviceAttribute ComputeCapabilityMajor :: DeviceAttribute ComputeCapabilityMinor :: DeviceAttribute MaximumTexture1dMipmappedWidth :: DeviceAttribute StreamPrioritiesSupported :: DeviceAttribute GlobalL1CacheSupported :: DeviceAttribute LocalL1CacheSupported :: DeviceAttribute MaxSharedMemoryPerMultiprocessor :: DeviceAttribute MaxRegistersPerMultiprocessor :: DeviceAttribute ManagedMemory :: DeviceAttribute MultiGpuBoard :: DeviceAttribute MultiGpuBoardGroupId :: 
DeviceAttribute CU_DEVICE_ATTRIBUTE_MAX :: DeviceAttribute data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode Default :: ComputeMode Exclusive :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | Possible option flags for CUDA initialisation. Dummy instance until -- the API exports actual option values. data InitFlag -- | Initialise the CUDA driver API. Must be called before any other driver -- function. initialise :: [InitFlag] -> IO () -- | Return the compute compatibility revision supported by the device capability :: Device -> IO Compute -- | Return a device handle device :: Int -> IO Device -- | Return the selected attribute for the given device attribute :: Device -> DeviceAttribute -> IO Int -- | Return the number of devices with compute capability >= 1.0 count :: IO Int -- | Name of the device name :: Device -> IO String -- | Return the properties of the selected device props :: Device -> IO DeviceProperties -- | Total memory available on the device (bytes) totalMem :: Device -> IO Int64 instance GHC.Show.Show Foreign.CUDA.Driver.Device.CUDevProp instance GHC.Show.Show Foreign.CUDA.Driver.Device.DeviceAttribute instance GHC.Classes.Eq Foreign.CUDA.Driver.Device.DeviceAttribute instance GHC.Show.Show Foreign.CUDA.Driver.Device.Device instance GHC.Classes.Eq Foreign.CUDA.Driver.Device.Device instance GHC.Enum.Enum Foreign.CUDA.Driver.Device.DeviceAttribute instance Foreign.Storable.Storable Foreign.CUDA.Driver.Device.CUDevProp instance GHC.Enum.Enum Foreign.CUDA.Driver.Device.InitFlag -- | Context management for low-level driver interface module Foreign.CUDA.Driver.Context -- | A device context newtype Context Context :: ((Ptr ())) -> Context [useContext] :: Context -> ((Ptr ())) -- | Context creation flags data ContextFlag SchedAuto :: ContextFlag SchedSpin :: ContextFlag SchedYield :: ContextFlag SchedBlockingSync :: ContextFlag -- | Deprecated: use SchedBlockingSync 
instead BlockingSync :: ContextFlag SchedMask :: ContextFlag MapHost :: ContextFlag LmemResizeToMax :: ContextFlag FlagsMask :: ContextFlag -- | Create a new CUDA context and associate it with the calling thread create :: Device -> [ContextFlag] -> IO Context -- | Increments the usage count of the context. API: no context flags are -- currently supported, so this parameter must be empty. -- | Deprecated: deprecated as of CUDA-4.0 attach :: Context -> [ContextFlag] -> IO () -- | Detach the context, and destroy if no longer used -- | Deprecated: deprecated as of CUDA-4.0 detach :: Context -> IO () -- | Destroy the specified context. This fails if the context has more than -- a single attachment (including that from initial creation). destroy :: Context -> IO () -- | Return the device of the currently active context device :: IO Device -- | Pop the current CUDA context from the CPU thread. The context must -- have a single usage count (matching calls to attach and -- detach). If successful, the new context is returned, and the -- old may be attached to a different CPU. pop :: IO Context -- | Push the given context onto the CPU's thread stack of current -- contexts. The context must be floating (via pop), i.e. not -- attached to any thread. push :: Context -> IO () -- | Block until the device has completed all preceding requests sync :: IO () -- | Return the context bound to the calling CPU thread. Requires cuda-4.0. get :: IO Context -- | Bind the specified context to the calling thread. Requires cuda-4.0. set :: Context -> IO () -- | Possible option values for direct peer memory access data PeerFlag -- | Queries if the first device can directly access the memory of the -- second. If direct access is possible, it can then be enabled with -- add. Requires cuda-4.0. 
accessible :: Device -> Device -> IO Bool -- | If the devices of both the current and supplied contexts support -- unified addressing, then enable allocations in the supplied context to -- be accessible by the current context. Requires cuda-4.0. add :: Context -> [PeerFlag] -> IO () -- | Disable direct memory access from the current context to the supplied -- context. Requires cuda-4.0. remove :: Context -> IO () -- | Device cache configuration preference data Cache PreferNone :: Cache PreferShared :: Cache PreferL1 :: Cache PreferEqual :: Cache -- | Device limits flags data Limit StackSize :: Limit PrintfFifoSize :: Limit MallocHeapSize :: Limit DevRuntimeSyncDepth :: Limit DevRuntimePendingLaunchCount :: Limit Max :: Limit -- | Query compute 2.0 call stack limits. Requires cuda-3.1. getLimit :: Limit -> IO Int -- | Specify the size of the call stack, for compute 2.0 devices. Requires -- cuda-3.1. setLimit :: Limit -> Int -> IO () -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the current -- context. This is only a preference. Requires cuda-3.2. 
setCacheConfig :: Cache -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Context.Cache instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Cache instance GHC.Show.Show Foreign.CUDA.Driver.Context.Limit instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Limit instance GHC.Show.Show Foreign.CUDA.Driver.Context.ContextFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.ContextFlag instance GHC.Show.Show Foreign.CUDA.Driver.Context.Context instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Context instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.ContextFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Limit instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Cache instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.PeerFlag -- | Meta-module exporting CUDA analysis routines module Foreign.CUDA.Analysis -- | Data types that are equivalent and can be shared freely between the -- CUDA Runtime and Driver APIs. module Foreign.CUDA.Types -- | A reference to data stored on the device. newtype DevicePtr a DevicePtr :: Ptr a -> DevicePtr a [useDevicePtr] :: DevicePtr a -> Ptr a -- | A reference to page-locked host memory. -- -- A HostPtr is just a plain Ptr, but the memory has been -- allocated by CUDA into page locked memory. This means that the data -- can be copied to the GPU via DMA (direct memory access). Note that the -- use of the system function mlock is not sufficient here --- -- the CUDA version ensures that the physical address stays this -- same, not just the virtual address. -- -- To copy data into a HostPtr array, you may use for example -- withHostPtr together with copyArray or -- moveArray. newtype HostPtr a HostPtr :: Ptr a -> HostPtr a [useHostPtr] :: HostPtr a -> Ptr a -- | Events are markers that can be inserted into the CUDA execution stream -- and later queried. 
newtype Event Event :: ((Ptr ())) -> Event [useEvent] :: Event -> ((Ptr ())) -- | Event creation flags data EventFlag Default :: EventFlag BlockingSync :: EventFlag DisableTiming :: EventFlag Interprocess :: EventFlag data WaitFlag -- | A processing stream. All operations in a stream are synchronous and -- executed in sequence, but operations in different non-default streams -- may happen out-of-order or concurrently with one another. -- -- Use Events to synchronise operations between streams. newtype Stream Stream :: ((Ptr ())) -> Stream [useStream] :: Stream -> ((Ptr ())) -- | Possible option flags for stream initialisation. Dummy instance until -- the API exports actual option values. data StreamFlag -- | The main execution stream. No operations overlap with operations in -- the default stream. defaultStream :: Stream instance GHC.Show.Show Foreign.CUDA.Types.Stream instance GHC.Classes.Eq Foreign.CUDA.Types.Stream instance GHC.Show.Show Foreign.CUDA.Types.EventFlag instance GHC.Classes.Eq Foreign.CUDA.Types.EventFlag instance GHC.Show.Show Foreign.CUDA.Types.Event instance GHC.Classes.Eq Foreign.CUDA.Types.Event instance GHC.Classes.Ord (Foreign.CUDA.Types.HostPtr a) instance GHC.Classes.Eq (Foreign.CUDA.Types.HostPtr a) instance GHC.Classes.Ord (Foreign.CUDA.Types.DevicePtr a) instance GHC.Classes.Eq (Foreign.CUDA.Types.DevicePtr a) instance GHC.Show.Show (Foreign.CUDA.Types.DevicePtr a) instance Foreign.Storable.Storable (Foreign.CUDA.Types.DevicePtr a) instance GHC.Show.Show (Foreign.CUDA.Types.HostPtr a) instance Foreign.Storable.Storable (Foreign.CUDA.Types.HostPtr a) instance GHC.Enum.Enum Foreign.CUDA.Types.EventFlag instance GHC.Enum.Enum Foreign.CUDA.Types.WaitFlag instance GHC.Enum.Enum Foreign.CUDA.Types.StreamFlag -- | Event management for C-for-CUDA runtime environment module Foreign.CUDA.Runtime.Event -- | Events are markers that can be inserted into the CUDA execution stream -- and later queried. 
data Event -- | Event creation flags data EventFlag Default :: EventFlag BlockingSync :: EventFlag DisableTiming :: EventFlag Interprocess :: EventFlag data WaitFlag -- | Create a new event create :: [EventFlag] -> IO Event -- | Destroy an event destroy :: Event -> IO () -- | Determine the elapsed time (in milliseconds) between two events elapsedTime :: Event -> Event -> IO Float -- | Determines if an event has actually been recorded query :: Event -> IO Bool -- | Record an event once all operations in the current context (or -- optionally specified stream) have completed. This operation is -- asynchronous. record :: Event -> Maybe Stream -> IO () -- | Makes all future work submitted to the (optional) stream wait until -- the given event reports completion before beginning execution. -- Synchronisation is performed on the device, including when the event -- and stream are from different device contexts. Requires cuda-3.2. wait :: Event -> Maybe Stream -> [WaitFlag] -> IO () -- | Wait until the event has been recorded block :: Event -> IO () -- | Stream management routines module Foreign.CUDA.Runtime.Stream -- | A processing stream. All operations in a stream are synchronous and -- executed in sequence, but operations in different non-default streams -- may happen out-of-order or concurrently with one another. -- -- Use Events to synchronise operations between streams. newtype Stream Stream :: ((Ptr ())) -> Stream [useStream] :: Stream -> ((Ptr ())) -- | Create a new asynchronous stream create :: IO Stream -- | Destroy and clean up an asynchronous stream destroy :: Stream -> IO () -- | Determine if all operations in a stream have completed finished :: Stream -> IO Bool -- | Block until all operations in a Stream have been completed block :: Stream -> IO () -- | The main execution stream. No operations overlap with operations in -- the default stream. 
defaultStream :: Stream -- | Kernel execution control for C-for-CUDA runtime interface module Foreign.CUDA.Runtime.Exec -- | A global device function. -- -- Note that the use of a string naming a function was deprecated in CUDA -- 4.1 and removed in CUDA 5.0. type Fun = FunPtr () data FunAttributes FunAttributes :: !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> FunAttributes [constSizeBytes] :: FunAttributes -> !Int64 [localSizeBytes] :: FunAttributes -> !Int64 [sharedSizeBytes] :: FunAttributes -> !Int64 -- | maximum block size that can be successfully launched (based on -- register usage) [maxKernelThreadsPerBlock] :: FunAttributes -> !Int -- | number of registers required for each thread [numRegs] :: FunAttributes -> !Int -- | Kernel function parameters. Doubles will be converted to an internal -- float representation on devices that do not support doubles natively. data FunParam IArg :: !Int -> FunParam FArg :: !Float -> FunParam DArg :: !Double -> FunParam VArg :: !a -> FunParam -- | Cache configuration preference data CacheConfig None :: CacheConfig Shared :: CacheConfig L1 :: CacheConfig Equal :: CacheConfig -- | Obtain the attributes of the named global device -- function. This itemises the requirements to successfully launch the -- given kernel. attributes :: Fun -> IO FunAttributes -- | Specify the grid and block dimensions for a device call. Used in -- conjunction with setParams, this pushes data onto the execution -- stack that will be popped when a function is launched. setConfig :: (Int, Int) -> (Int, Int, Int) -> Int64 -> Maybe Stream -> IO () -- | Set the argument parameters that will be passed to the next kernel -- invocation. This is used in conjunction with setConfig to -- control kernel execution. setParams :: [FunParam] -> IO () -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the given -- device function. 
This is only a preference; the driver is free to -- choose a different configuration as required to execute the function. -- -- Switching between configuration modes may insert a device-side -- synchronisation point for streamed kernel launches setCacheConfig :: Fun -> CacheConfig -> IO () -- | Invoke the global kernel function on the device. This -- must be preceded by a call to setConfig and (if appropriate) -- setParams. launch :: Fun -> IO () -- | Invoke a kernel on a (gx * gy) grid of blocks, where each -- block contains (tx * ty * tz) threads and has access to a -- given number of bytes of shared memory. The launch may also be -- associated with a specific Stream. launchKernel :: Fun -> (Int, Int) -> (Int, Int, Int) -> Int64 -> Maybe Stream -> [FunParam] -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Exec.CacheConfig instance GHC.Classes.Eq Foreign.CUDA.Runtime.Exec.CacheConfig instance GHC.Show.Show Foreign.CUDA.Runtime.Exec.FunAttributes instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Exec.FunAttributes instance GHC.Enum.Enum Foreign.CUDA.Runtime.Exec.CacheConfig -- | Event management for low-level driver interface module Foreign.CUDA.Driver.Event -- | Events are markers that can be inserted into the CUDA execution stream -- and later queried. newtype Event Event :: ((Ptr ())) -> Event [useEvent] :: Event -> ((Ptr ())) -- | Event creation flags data EventFlag Default :: EventFlag BlockingSync :: EventFlag DisableTiming :: EventFlag Interprocess :: EventFlag data WaitFlag -- | Create a new event create :: [EventFlag] -> IO Event -- | Destroy an event destroy :: Event -> IO () -- | Determine the elapsed time (in milliseconds) between two events elapsedTime :: Event -> Event -> IO Float -- | Determines if a event has actually been recorded query :: Event -> IO Bool -- | Record an event once all operations in the current context (or -- optionally specified stream) have completed. This operation is -- asynchronous. 
record :: Event -> Maybe Stream -> IO () -- | Makes all future work submitted to the (optional) stream wait until -- the given event reports completion before beginning execution. -- Synchronisation is performed on the device, including when the event -- and stream are from different device contexts. Requires cuda-3.2. wait :: Event -> Maybe Stream -> [WaitFlag] -> IO () -- | Wait until the event has been recorded block :: Event -> IO () -- | Stream management for low-level driver interface module Foreign.CUDA.Driver.Stream -- | A processing stream. All operations in a stream are synchronous and -- executed in sequence, but operations in different non-default streams -- may happen out-of-order or concurrently with one another. -- -- Use Events to synchronise operations between streams. newtype Stream Stream :: ((Ptr ())) -> Stream [useStream] :: Stream -> ((Ptr ())) -- | Possible option flags for stream initialisation. Dummy instance until -- the API exports actual option values. data StreamFlag -- | Create a new stream create :: [StreamFlag] -> IO Stream -- | Destroy a stream destroy :: Stream -> IO () -- | Check if all operations in the stream have completed finished :: Stream -> IO Bool -- | Wait until the device has completed all operations in the Stream block :: Stream -> IO () -- | The main execution stream. No operations overlap with operations in -- the default stream. 
defaultStream :: Stream -- | Kernel execution control for low-level driver interface module Foreign.CUDA.Driver.Exec -- | A global device function newtype Fun Fun :: ((Ptr ())) -> Fun data FunParam IArg :: !Int32 -> FunParam FArg :: !Float -> FunParam VArg :: !a -> FunParam -- | Function attributes data FunAttribute MaxKernelThreadsPerBlock :: FunAttribute SharedSizeBytes :: FunAttribute ConstSizeBytes :: FunAttribute LocalSizeBytes :: FunAttribute NumRegs :: FunAttribute PtxVersion :: FunAttribute BinaryVersion :: FunAttribute CacheModeCa :: FunAttribute CU_FUNC_ATTRIBUTE_MAX :: FunAttribute -- | Returns the value of the selected attribute requirement for the given -- kernel requires :: Fun -> FunAttribute -> IO Int -- | Specify the (x,y,z) dimensions of the thread blocks that are -- created when the given kernel function is launched. -- | Deprecated: use launchKernel instead setBlockShape :: Fun -> (Int, Int, Int) -> IO () -- | Set the number of bytes of dynamic shared memory to be available to -- each thread block when the function is launched -- | Deprecated: use launchKernel instead setSharedSize :: Fun -> Integer -> IO () -- | Set the parameters that will be specified next time the kernel is invoked -- | Deprecated: use launchKernel instead setParams :: Fun -> [FunParam] -> IO () -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the given -- device function. This is only a preference; the driver is free to -- choose a different configuration as required to execute the function. -- -- Switching between configuration modes may insert a device-side -- synchronisation point for streamed kernel launches. setCacheConfigFun :: Fun -> Cache -> IO () -- | Invoke the kernel on a size (w,h) grid of blocks. Each block -- contains the number of threads specified by a previous call to -- setBlockShape. The launch may also be associated with a -- specific Stream. 
-- | Deprecated: use launchKernel instead launch :: Fun -> (Int, Int) -> Maybe Stream -> IO () -- | Invoke a kernel on a (gx * gy * gz) grid of blocks, where -- each block contains (tx * ty * tz) threads and has access to -- a given number of bytes of shared memory. The launch may also be -- associated with a specific Stream. -- -- In launchKernel, the number of kernel parameters and their -- offsets and sizes do not need to be specified, as this information is -- retrieved directly from the kernel's image. This requires the kernel -- to have been compiled with toolchain version 3.2 or later. -- -- The alternative launchKernel' will pass the arguments in -- directly, requiring the application to know the size and -- alignment/padding of each kernel parameter. launchKernel :: Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> Maybe Stream -> [FunParam] -> IO () -- | Invoke a kernel on a (gx * gy * gz) grid of blocks, where -- each block contains (tx * ty * tz) threads and has access to -- a given number of bytes of shared memory. The launch may also be -- associated with a specific Stream. -- -- In launchKernel, the number of kernel parameters and their -- offsets and sizes do not need to be specified, as this information is -- retrieved directly from the kernel's image. This requires the kernel -- to have been compiled with toolchain version 3.2 or later. -- -- The alternative launchKernel' will pass the arguments in -- directly, requiring the application to know the size and -- alignment/padding of each kernel parameter. launchKernel' :: Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> Maybe Stream -> [FunParam] -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Exec.FunAttribute instance GHC.Classes.Eq Foreign.CUDA.Driver.Exec.FunAttribute instance GHC.Enum.Enum Foreign.CUDA.Driver.Exec.FunAttribute instance Foreign.Storable.Storable Foreign.CUDA.Driver.Exec.FunParam -- | Data pointers on the host and device. 
These can be shared freely -- between the CUDA runtime and Driver APIs. module Foreign.CUDA.Ptr -- | A reference to data stored on the device. newtype DevicePtr a DevicePtr :: Ptr a -> DevicePtr a [useDevicePtr] :: DevicePtr a -> Ptr a -- | Look at the contents of device memory. This takes an IO action that -- will be applied to that pointer, the result of which is returned. It -- would be silly to return the pointer from the action. withDevicePtr :: DevicePtr a -> (Ptr a -> IO b) -> IO b -- | Return a unique handle associated with the given device pointer devPtrToWordPtr :: DevicePtr a -> WordPtr -- | Return a device pointer from the given handle wordPtrToDevPtr :: WordPtr -> DevicePtr a -- | The constant nullDevPtr contains the distinguished memory -- location that is not associated with a valid memory location nullDevPtr :: DevicePtr a -- | Cast a device pointer from one type to another castDevPtr :: DevicePtr a -> DevicePtr b -- | Advance the pointer address by the given offset in bytes. plusDevPtr :: DevicePtr a -> Int -> DevicePtr a -- | Given an alignment constraint, align the device pointer to the next -- highest address satisfying the constraint alignDevPtr :: DevicePtr a -> Int -> DevicePtr a -- | Compute the difference between the second and first argument. This -- fulfils the relation -- --
--   p2 == p1 `plusDevPtr` (p2 `minusDevPtr` p1)
--   
minusDevPtr :: DevicePtr a -> DevicePtr a -> Int -- | Advance a pointer into a device array by the given number of elements advanceDevPtr :: Storable a => DevicePtr a -> Int -> DevicePtr a -- | A reference to page-locked host memory. -- -- A HostPtr is just a plain Ptr, but the memory has been -- allocated by CUDA into page locked memory. This means that the data -- can be copied to the GPU via DMA (direct memory access). Note that the -- use of the system function mlock is not sufficient here --- -- the CUDA version ensures that the physical address stays this -- same, not just the virtual address. -- -- To copy data into a HostPtr array, you may use for example -- withHostPtr together with copyArray or -- moveArray. newtype HostPtr a HostPtr :: Ptr a -> HostPtr a [useHostPtr] :: HostPtr a -> Ptr a -- | Apply an IO action to the memory reference living inside the host -- pointer object. All uses of the pointer should be inside the -- withHostPtr bracket. withHostPtr :: HostPtr a -> (Ptr a -> IO b) -> IO b -- | The constant nullHostPtr contains the distinguished memory -- location that is not associated with a valid memory location nullHostPtr :: HostPtr a -- | Cast a host pointer from one type to another castHostPtr :: HostPtr a -> HostPtr b -- | Advance the pointer address by the given offset in bytes plusHostPtr :: HostPtr a -> Int -> HostPtr a -- | Given an alignment constraint, align the host pointer to the next -- highest address satisfying the constraint alignHostPtr :: HostPtr a -> Int -> HostPtr a -- | Compute the difference between the second and first argument minusHostPtr :: HostPtr a -> HostPtr a -> Int -- | Advance a pointer into a host array by a given number of elements advanceHostPtr :: Storable a => HostPtr a -> Int -> HostPtr a -- | Memory management for CUDA devices module Foreign.CUDA.Runtime.Marshal -- | Options for host allocation data AllocFlag Portable :: AllocFlag DeviceMapped :: AllocFlag WriteCombined :: AllocFlag -- | Allocate a section 
of linear memory on the host which is page-locked -- and directly accessible from the device. The storage is sufficient to -- hold the given number of elements of a storable type. The runtime -- system automatically accelerates calls to functions such as -- peekArrayAsync and pokeArrayAsync that refer to -- page-locked memory. -- -- Note that since the amount of pageable memory is thusly reduced, -- overall system performance may suffer. This is best used sparingly to -- allocate staging areas for data exchange mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a) -- | Free page-locked host memory previously allocated with -- mallocHostArray freeHost :: HostPtr a -> IO () -- | Allocate a section of linear memory on the device, and return a -- reference to it. The memory is sufficient to hold the given number of -- elements of storable type. It is suitably aligned, and not cleared. mallocArray :: Storable a => Int -> IO (DevicePtr a) -- | Execute a computation, passing a pointer to a temporarily allocated -- block of memory sufficient to hold the given number of elements of -- storable type. The memory is freed when the computation terminates -- (normally or via an exception), so the pointer must not be used after -- this. -- -- Note that kernel launches can be asynchronous, so you may need to add -- a synchronisation point at the end of the computation. allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b -- | Free previously allocated memory on the device free :: DevicePtr a -> IO () -- | Options for unified memory allocations data AttachFlag Global :: AttachFlag Host :: AttachFlag Single :: AttachFlag -- | Allocates memory that will be automatically managed by the Unified -- Memory system mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a) -- | Copy a number of elements from the device to host memory. This is a -- synchronous operation. 
peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO () -- | Copy memory from the device asynchronously, possibly associated with a -- particular stream. The destination memory must be page locked. peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO () -- | Copy a 2D memory area from the device to the host. This is a -- synchronous operation. peekArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Ptr a -> Int -> IO () -- | Copy a 2D memory area from the device to the host asynchronously, -- possibly associated with a particular stream. The destination array -- must be page locked. peekArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> HostPtr a -> Int -> Maybe Stream -> IO () -- | Copy a number of elements from the device into a new Haskell list. -- Note that this requires two memory copies: firstly from the device -- into a heap allocated array, and from there marshalled into a list peekListArray :: Storable a => Int -> DevicePtr a -> IO [a] -- | Copy a number of elements onto the device. This is a synchronous -- operation. pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO () -- | Copy memory onto the device asynchronously, possibly associated with a -- particular stream. The source memory must be page-locked. pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D memory area onto the device. This is a synchronous -- operation. pokeArray2D :: Storable a => Int -> Int -> Ptr a -> Int -> DevicePtr a -> Int -> IO () -- | Copy a 2D memory area onto the device asynchronously, possibly -- associated with a particular stream. The source array must be page -- locked. pokeArray2DAsync :: Storable a => Int -> Int -> HostPtr a -> Int -> DevicePtr a -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a device array. The array must -- be sufficiently large to hold the entire list. 
This requires two -- marshalling operations pokeListArray :: Storable a => [a] -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second (destination). The copied areas may not overlap. This -- operation is asynchronous with respect to host, but will not overlap -- other device operations. copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second (destination). The copied areas may not overlap. This -- operation is asynchronous with respect to the host, and may be -- associated with a particular stream. copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D memory area from the first device array (source) to the -- second (destination). The copied areas may not overlap. This operation -- is asynchronous with respect to the host, but will not overlap other -- device operations. copyArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> DevicePtr a -> Int -> IO () -- | Copy a 2D memory area from the first device array (source) to the -- second device array (destination). The copied areas may not overlap. -- This operation is asynchronous with respect to the host, and may be -- associated with a particular stream. copyArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> DevicePtr a -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a newly allocated device array. -- This is newListArrayLen composed with fst. newListArray :: Storable a => [a] -> IO (DevicePtr a) -- | Write a list of storable elements into a newly allocated device array, -- returning the device pointer together with the number of elements that -- were written. Note that this requires two copy operations: firstly -- from a Haskell list into a heap-allocated array, and from there into -- device memory. 
The array should be freed when no longer -- required. newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int) -- | Temporarily store a list of elements into a newly allocated device -- array. An IO action is applied to the array, the result of which is -- returned. Similar to newListArray, this requires two -- marshalling operations of the data. -- -- As with allocaArray, the memory is freed once the action -- completes, so you should not return the pointer from the action, and -- be sure that any asynchronous operations (such as kernel execution) -- have completed. withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b -- | A variant of withListArray which also supplies the number of -- elements in the array to the applied function withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b -- | Initialise device memory to a given 8-bit value memset :: DevicePtr a -> Int64 -> Int8 -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.CopyDirection instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.CopyDirection instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.CopyDirection -- | Texture references module Foreign.CUDA.Runtime.Texture data Texture Texture :: !Bool -> !FilterMode -> !(AddressMode, AddressMode, AddressMode) -> !FormatDesc -> Texture -- | access texture using normalised coordinates [0.0,1.0) [normalised] :: Texture -> !Bool [filtering] :: Texture -> !FilterMode [addressing] :: Texture -> !(AddressMode, AddressMode, AddressMode) [format] :: Texture -> !FormatDesc -- | Texture channel format kind data FormatKind Signed :: FormatKind 
Unsigned :: FormatKind Float :: FormatKind None :: FormatKind data AddressMode Wrap :: AddressMode Clamp :: AddressMode Mirror :: AddressMode Border :: AddressMode data FilterMode Point :: FilterMode Linear :: FilterMode -- | A description of how memory read through the texture cache should be -- interpreted, including the kind of data and the number of bits of each -- component (x,y,z and w, respectively). data FormatDesc FormatDesc :: !(Int, Int, Int, Int) -> !FormatKind -> FormatDesc [depth] :: FormatDesc -> !(Int, Int, Int, Int) [kind] :: FormatDesc -> !FormatKind -- | Bind the memory area associated with the device pointer to a texture -- reference given by the named symbol. Any previously bound references -- are unbound. bind :: String -> Texture -> DevicePtr a -> Int64 -> IO () -- | Bind the two-dimensional memory area to the texture reference -- associated with the given symbol. The size of the area is constrained -- by (width,height) in texel units, and the row pitch in bytes. Any -- previously bound references are unbound. 
bind2D :: String -> Texture -> DevicePtr a -> (Int, Int) -> Int64 -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.Texture instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.Texture instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FormatDesc instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FormatDesc instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FilterMode instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FilterMode instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FormatKind instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FormatKind instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.FormatKind instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.FilterMode instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Texture.FormatDesc instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Texture.Texture -- | Memory management for low-level driver interface module Foreign.CUDA.Driver.Marshal -- | Options for host allocation data AllocFlag Portable :: AllocFlag DeviceMapped :: AllocFlag WriteCombined :: AllocFlag -- | Allocate a section of linear memory on the host which is page-locked -- and directly accessible from the device. The storage is sufficient to -- hold the given number of elements of a storable type. -- -- Note that since the amount of pageable memory is thusly reduced, -- overall system performance may suffer. This is best used sparingly to -- allocate staging areas for data exchange. mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a) -- | Free a section of page-locked host memory freeHost :: HostPtr a -> IO () -- | Page-locks the specified array (on the host) and maps it for the -- device(s) as specified by the given allocation flags. 
Subsequently, -- the memory is accessed directly by the device so can be read and -- written with much higher bandwidth than pageable memory that has not -- been registered. The memory range is added to the same tracking -- mechanism as mallocHostArray to automatically accelerate calls -- to functions such as pokeArray. -- -- Note that page-locking excessive amounts of memory may degrade system -- performance, since it reduces the amount of pageable memory available. -- This is best used sparingly to allocate staging areas for data -- exchange. -- -- This function is not yet implemented on Mac OS X. Requires cuda-4.0. registerArray :: Storable a => [AllocFlag] -> Int -> Ptr a -> IO (HostPtr a) -- | Unmaps the memory from the given pointer, and makes it pageable again. -- -- This function is not yet implemented on Mac OS X. Requires cuda-4.0. unregisterArray :: HostPtr a -> IO (Ptr a) -- | Allocate a section of linear memory on the device, and return a -- reference to it. The memory is sufficient to hold the given number of -- elements of storable type. It is suitably aligned for any type, and is -- not cleared. mallocArray :: Storable a => Int -> IO (DevicePtr a) -- | Execute a computation on the device, passing a pointer to a -- temporarily allocated block of memory sufficient to hold the given -- number of elements of storable type. The memory is freed when the -- computation terminates (normally or via an exception), so the pointer -- must not be used after this. -- -- Note that kernel launches can be asynchronous, so you may want to add -- a synchronisation point using sync as part of the -- computation. 
allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b -- | Release a section of device memory free :: DevicePtr a -> IO () -- | Options for unified memory allocations data AttachFlag CuMemAttachGlobal :: AttachFlag CuMemAttachHost :: AttachFlag CuMemAttachSingle :: AttachFlag -- | Allocates memory that will be automatically managed by the Unified -- Memory system mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a) -- | Copy a number of elements from the device to host memory. This is a -- synchronous operation peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO () -- | Copy memory from the device asynchronously, possibly associated with a -- particular stream. The destination host memory must be page-locked. peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the device to the host. peekArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Ptr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the device to the host asynchronously, possibly -- associated with a particular execution stream. The destination host -- memory must be page-locked. peekArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> HostPtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Copy a number of elements from the device into a new Haskell list. -- Note that this requires two memory copies: firstly from the device -- into a heap allocated array, and from there marshalled into a list. peekListArray :: Storable a => Int -> DevicePtr a -> IO [a] -- | Copy a number of elements onto the device. This is a synchronous -- operation pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO () -- | Copy memory onto the device asynchronously, possibly associated with a -- particular stream. The source host memory must be page-locked. 
pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the host to the device. pokeArray2D :: Storable a => Int -> Int -> Ptr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the host to the device asynchronously, possibly -- associated with a particular execution stream. The source host memory -- must be page-locked. pokeArray2DAsync :: Storable a => Int -> Int -> HostPtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a device array. The device -- array must be sufficiently large to hold the entire list. This -- requires two marshalling operations. pokeListArray :: Storable a => [a] -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second device array (destination). The copied areas may not overlap. -- This operation is asynchronous with respect to the host, but will -- never overlap with kernel execution. copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second device array (destination). The copied areas may not -- overlap. The operation is asynchronous with respect to the host, and -- can be asynchronous to other device operations by associating it with -- a particular stream. copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the first device array (source) to the second -- device array (destination). The copied areas must not overlap. This -- operation is asynchronous with respect to the host, but will never -- overlap with kernel execution. 
copyArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the first device array (source) to the second -- device array (destination). The copied areas may not overlap. The -- operation is asynchronous with respect to the host, and can be -- asynchronous to other device operations by associating it with a -- particular execution stream. copyArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Copies an array from device memory in one context to device memory in -- another context. Note that this function is asynchronous with respect -- to the host, but serialised with respect to all pending and future -- asynchronous work in the source and destination contexts. To avoid -- this synchronisation, use copyArrayPeerAsync instead. copyArrayPeer :: Storable a => Int -> DevicePtr a -> Context -> DevicePtr a -> Context -> IO () -- | Copies from device memory in one context to device memory in another -- context. Note that this function is asynchronous with respect to the -- host and all work in other streams and devices. copyArrayPeerAsync :: Storable a => Int -> DevicePtr a -> Context -> DevicePtr a -> Context -> Maybe Stream -> IO () -- | Write a list of storable elements into a newly allocated device array. -- This is newListArrayLen composed with fst. newListArray :: Storable a => [a] -> IO (DevicePtr a) -- | Write a list of storable elements into a newly allocated device array, -- returning the device pointer together with the number of elements that -- were written. Note that this requires two memory copies: firstly from -- a Haskell list to a heap allocated array, and from there onto the -- graphics device. The memory should be freed when no longer -- required. 
newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int) -- | Temporarily store a list of elements into a newly allocated device -- array. An IO action is applied to the array, the result of which is -- returned. Similar to newListArray, this requires copying the -- data twice. -- -- As with allocaArray, the memory is freed once the action -- completes, so you should not return the pointer from the action, and -- be wary of asynchronous kernel execution. withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b -- | A variant of withListArray which also supplies the number of -- elements in the array to the applied function withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b -- | Set a number of data elements to the specified value, which may be -- either 8-, 16-, or 32-bits wide. memset :: Storable a => DevicePtr a -> Int -> a -> IO () -- | Set the number of data elements to the specified value, which may be -- either 8-, 16-, or 32-bits wide. The operation is asynchronous and may -- optionally be associated with a stream. Requires cuda-3.2. memsetAsync :: Storable a => DevicePtr a -> Int -> a -> Maybe Stream -> IO () -- | Return the device pointer associated with a mapped, pinned host -- buffer, which was allocated with the DeviceMapped option by -- mallocHostArray. -- -- Currently, no options are supported and this must be empty. 
getDevicePtr :: [AllocFlag] -> HostPtr a -> IO (DevicePtr a) -- | Return the base address and allocation size of the given device -- pointer getBasePtr :: DevicePtr a -> IO (DevicePtr a, Int64) -- | Return the amount of free and total memory respectively available to -- the current context (bytes) getMemInfo :: IO (Int64, Int64) instance GHC.Show.Show Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Show.Show Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Marshal.AttachFlag -- | Texture management for low-level driver interface module Foreign.CUDA.Driver.Texture -- | A texture reference newtype Texture Texture :: ((Ptr ())) -> Texture [useTexture] :: Texture -> ((Ptr ())) -- | Texture data formats data Format Word8 :: Format Word16 :: Format Word32 :: Format Int8 :: Format Int16 :: Format Int32 :: Format Half :: Format Float :: Format -- | Texture reference addressing modes data AddressMode Wrap :: AddressMode Clamp :: AddressMode Mirror :: AddressMode Border :: AddressMode -- | Texture reference filtering mode data FilterMode Point :: FilterMode Linear :: FilterMode -- | Texture read mode options data ReadMode ReadAsInteger :: ReadMode NormalizedCoordinates :: ReadMode SRGB :: ReadMode -- | Create a new texture reference. Once created, the application must -- call setPtr to associate the reference with allocated memory. -- Other texture reference functions are used to specify the format and -- interpretation to be used when the memory is read through this -- reference. -- | Deprecated: as of CUDA version 3.2 create :: IO Texture -- | Destroy a texture reference -- | Deprecated: as of CUDA version 3.2 destroy :: Texture -> IO () -- | Bind a linear array address of the given size (bytes) as a texture -- reference. 
Any previously bound references are unbound. bind :: Texture -> DevicePtr a -> Int64 -> IO () -- | Bind a linear address range to the given texture reference as a -- two-dimensional arena. Any previously bound reference is unbound. Note -- that calls to setFormat can not follow a call to bind2D -- for the same texture reference. bind2D :: Texture -> Format -> Int -> DevicePtr a -> (Int, Int) -> Int64 -> IO () -- | Get the addressing mode used by a texture reference, corresponding to -- the given dimension (currently the only supported dimension values are -- 0 or 1). getAddressMode :: Texture -> Int -> IO AddressMode -- | Get the filtering mode used by a texture reference getFilterMode :: Texture -> IO FilterMode -- | Get the data format and number of channel components of the bound -- texture getFormat :: Texture -> IO (Format, Int) -- | Specify the addressing mode for the given dimension of a texture -- reference setAddressMode :: Texture -> Int -> AddressMode -> IO () -- | Specify the filtering mode to be used when reading memory through a -- texture reference setFilterMode :: Texture -> FilterMode -> IO () -- | Specify the format of the data and number of packed components per -- element to be read by the texture reference setFormat :: Texture -> Format -> Int -> IO () -- | Specify additional characteristics for reading and indexing the -- texture reference setReadMode :: Texture -> ReadMode -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Texture.Format instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.Format instance GHC.Show.Show Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.AddressMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.AddressMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.Texture instance 
GHC.Classes.Eq Foreign.CUDA.Driver.Texture.Texture instance Foreign.Storable.Storable Foreign.CUDA.Driver.Texture.Texture instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.AddressMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.Format -- | Module management for low-level driver interface module Foreign.CUDA.Driver.Module -- | A reference to a Module object, containing collections of device -- functions data Module -- | Just-in-time compilation options data JITOption -- | maximum number of registers per thread MaxRegisters :: !Int -> JITOption -- | number of threads per block to target for ThreadsPerBlock :: !Int -> JITOption -- | level of optimisation to apply (1-4, default 4) OptimisationLevel :: !Int -> JITOption -- | compilation target, otherwise determined from context Target :: !Compute -> JITOption -- | fallback strategy if matching cubin not found FallbackStrategy :: !JITFallback -> JITOption -- | generate debug info (-g) (requires cuda >= 5.5) GenerateDebugInfo :: JITOption -- | generate line number information (-lineinfo) (requires cuda >= 5.5) GenerateLineInfo :: JITOption -- | verbose log messages (requires cuda >= 5.5) Verbose :: JITOption data JITTarget Compute10 :: JITTarget Compute11 :: JITTarget Compute12 :: JITTarget Compute13 :: JITTarget Compute20 :: JITTarget Compute21 :: JITTarget Compute30 :: JITTarget Compute32 :: JITTarget Compute35 :: JITTarget Compute37 :: JITTarget Compute50 :: JITTarget Compute52 :: JITTarget -- | Results of online compilation data JITResult JITResult :: !Float -> !ByteString -> !Module -> JITResult -- | milliseconds spent compiling PTX [jitTime] :: JITResult -> !Float -- | information about PTX assembly [jitInfoLog] :: JITResult -> !ByteString -- | compilation error log or compiled module [jitModule] :: JITResult -> !Module data JITFallback PTX :: JITFallback Binary :: JITFallback -- | 
Returns a function handle getFun :: Module -> String -> IO Fun -- | Return a global pointer, and size of the global (in bytes) getPtr :: Module -> String -> IO (DevicePtr a, Int) -- | Return a handle to a texture reference getTex :: Module -> String -> IO Texture -- | Load the contents of the specified file (either a ptx or cubin file) -- to create a new module, and load that module into the current context. loadFile :: FilePath -> IO Module -- | Load the contents of the given image into a new module, and load that -- module into the current context. The image is (typically) the contents -- of a cubin or PTX file. -- -- Note that the ByteString will be copied into a temporary -- staging area so that it can be passed to C. loadData :: ByteString -> IO Module -- | As loadData, but read the image data from the given pointer. -- The image is a NULL-terminated sequence of bytes. loadDataFromPtr :: Ptr Word8 -> IO Module -- | Load the contents of the given image into a module with online -- compiler options, and load the module into the current context. The -- image is (typically) the contents of a cubin or PTX file. The actual -- attributes of the compiled kernel can be probed using requires. -- -- Note that the ByteString will be copied into a temporary -- staging area so that it can be passed to C. loadDataEx :: ByteString -> [JITOption] -> IO JITResult -- | As loadDataEx, but read the image data from the given pointer. -- The image is a NULL-terminated sequence of bytes. 
loadDataFromPtrEx :: Ptr Word8 -> [JITOption] -> IO JITResult -- | Unload a module from the current context unload :: Module -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Module.JITOption instance GHC.Show.Show Foreign.CUDA.Driver.Module.JITFallback instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.JITFallback instance GHC.Show.Show Foreign.CUDA.Driver.Module.JITTarget instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.JITTarget instance GHC.Show.Show Foreign.CUDA.Driver.Module.JITOptionInternal instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.JITOptionInternal instance GHC.Show.Show Foreign.CUDA.Driver.Module.JITResult instance GHC.Show.Show Foreign.CUDA.Driver.Module.Module instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Module instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.JITOptionInternal instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.JITTarget instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.JITFallback -- | Top level bindings to CUDA driver API module Foreign.CUDA.Driver -- | Top level bindings to the C-for-CUDA runtime API module Foreign.CUDA.Runtime -- | Top level bindings. By default, expose the C-for-CUDA runtime API -- bindings, as they are slightly more user friendly. module Foreign.CUDA