-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | FFI binding to the CUDA interface for programming NVIDIA GPUs
--   
--   The CUDA library provides a direct, general purpose C-like SPMD
--   programming model for NVIDIA graphics cards (G8x series onwards). This
--   is a collection of bindings to allow you to call and control, although
--   not write, such functions from Haskell-land. You will need to install
--   the CUDA driver and developer toolkit.
--   
--   <a>http://developer.nvidia.com/cuda-downloads</a>
--   
--   The configure script will look for your CUDA installation in the
--   standard places, and if the nvcc compiler is found in your PATH,
--   relative to that.
--   
--   This release is for version 6.5 of the CUDA toolkit.
--   
--   <ul>
--   <li><i><i>NOTE:</i></i></li>
--   </ul>
--   
--   Due to a bug in nvcc, this package is not compatible with c2hs-0.18.*
--   or c2hs-0.19.*. See tmcdonell/cuda#18.
@package cuda
@version 0.6.6.0


-- | Error handling
module Foreign.CUDA.Driver.Error
data Status
Success :: Status
InvalidValue :: Status
OutOfMemory :: Status
NotInitialized :: Status
Deinitialized :: Status
ProfilerDisabled :: Status
ProfilerNotInitialized :: Status
ProfilerAlreadyStarted :: Status
ProfilerAlreadyStopped :: Status
NoDevice :: Status
InvalidDevice :: Status
InvalidImage :: Status
InvalidContext :: Status
ContextAlreadyCurrent :: Status
MapFailed :: Status
UnmapFailed :: Status
ArrayIsMapped :: Status
AlreadyMapped :: Status
NoBinaryForGPU :: Status
AlreadyAcquired :: Status
NotMapped :: Status
NotMappedAsArray :: Status
NotMappedAsPointer :: Status
EccUncorrectable :: Status
UnsupportedLimit :: Status
ContextAlreadyInUse :: Status
PeerAccessUnsupported :: Status
InvalidPTX :: Status
InvalidGraphicsContext :: Status
InvalidSource :: Status
FileNotFound :: Status
SharedObjectSymbolNotFound :: Status
SharedObjectInitFailed :: Status
OperatingSystem :: Status
InvalidHandle :: Status
NotFound :: Status
NotReady :: Status
IllegalAddress :: Status
LaunchOutOfResources :: Status
LaunchTimeout :: Status
LaunchIncompatibleTexturing :: Status
PeerAccessAlreadyEnabled :: Status
PeerAccessNotEnabled :: Status
PrimaryContextActive :: Status
ContextIsDestroyed :: Status
Assert :: Status
TooManyPeers :: Status
HostMemoryAlreadyRegistered :: Status
HostMemoryNotRegistered :: Status
HardwareStackError :: Status
IllegalInstruction :: Status
MisalignedAddress :: Status
InvalidAddressSpace :: Status
InvalidPC :: Status
LaunchFailed :: Status
NotPermitted :: Status
NotSupported :: Status
Unknown :: Status

-- | Return a descriptive error string associated with a particular error
--   code
describe :: Status -> String
cuGetErrorString :: (Status) -> IO (Status, String)
data CUDAException
ExitCode :: Status -> CUDAException
UserError :: String -> CUDAException

-- | Raise a CUDAException in the IO Monad
cudaError :: String -> IO a

-- | A specially formatted error message
requireSDK :: Double -> String -> IO a

-- | Return the results of a function on successful execution, otherwise
--   throw an exception with an error string associated with the return
--   code
resultIfOk :: (Status, a) -> IO a

-- | Throw an exception with an error string associated with an
--   unsuccessful return code, otherwise return unit.
nothingIfOk :: Status -> IO ()
cuGetErrorString'_ :: CInt -> ((Ptr (Ptr CChar)) -> (IO CInt))
instance Typeable CUDAException
instance Eq Status
instance Show Status
instance Show CUDAException
instance Exception CUDAException
instance Enum Status


-- | Utility functions
module Foreign.CUDA.Driver.Utils

-- | Return the version number of the installed CUDA driver
driverVersion :: IO Int


-- | Error handling functions
module Foreign.CUDA.Runtime.Error

-- | Return codes from API functions
data Status
Success :: Status
MissingConfiguration :: Status
MemoryAllocation :: Status
InitializationError :: Status
LaunchFailure :: Status
PriorLaunchFailure :: Status
LaunchTimeout :: Status
LaunchOutOfResources :: Status
InvalidDeviceFunction :: Status
InvalidConfiguration :: Status
InvalidDevice :: Status
InvalidValue :: Status
InvalidPitchValue :: Status
InvalidSymbol :: Status
MapBufferObjectFailed :: Status
UnmapBufferObjectFailed :: Status
InvalidHostPointer :: Status
InvalidDevicePointer :: Status
InvalidTexture :: Status
InvalidTextureBinding :: Status
InvalidChannelDescriptor :: Status
InvalidMemcpyDirection :: Status
AddressOfConstant :: Status
TextureFetchFailed :: Status
TextureNotBound :: Status
SynchronizationError :: Status
InvalidFilterSetting :: Status
InvalidNormSetting :: Status
MixedDeviceExecution :: Status
CudartUnloading :: Status
Unknown :: Status
NotYetImplemented :: Status
MemoryValueTooLarge :: Status
InvalidResourceHandle :: Status
NotReady :: Status
InsufficientDriver :: Status
SetOnActiveProcess :: Status
InvalidSurface :: Status
NoDevice :: Status
ECCUncorrectable :: Status
SharedObjectSymbolNotFound :: Status
SharedObjectInitFailed :: Status
UnsupportedLimit :: Status
DuplicateVariableName :: Status
DuplicateTextureName :: Status
DuplicateSurfaceName :: Status
DevicesUnavailable :: Status
InvalidKernelImage :: Status
NoKernelImageForDevice :: Status
IncompatibleDriverContext :: Status
PeerAccessAlreadyEnabled :: Status
PeerAccessNotEnabled :: Status
DeviceAlreadyInUse :: Status
ProfilerDisabled :: Status
ProfilerNotInitialized :: Status
ProfilerAlreadyStarted :: Status
ProfilerAlreadyStopped :: Status
Assert :: Status
TooManyPeers :: Status
HostMemoryAlreadyRegistered :: Status
HostMemoryNotRegistered :: Status
OperatingSystem :: Status
PeerAccessUnsupported :: Status
LaunchMaxDepthExceeded :: Status
LaunchFileScopedTex :: Status
LaunchFileScopedSurf :: Status
SyncDepthExceeded :: Status
LaunchPendingCountExceeded :: Status
NotPermitted :: Status
NotSupported :: Status
HardwareStackError :: Status
IllegalInstruction :: Status
MisalignedAddress :: Status
InvalidAddressSpace :: Status
InvalidPc :: Status
IllegalAddress :: Status
InvalidPtx :: Status
InvalidGraphicsContext :: Status
StartupFailure :: Status
ApiFailureBase :: Status
data CUDAException
ExitCode :: Status -> CUDAException
UserError :: String -> CUDAException

-- | Raise a <a>CUDAException</a> in the IO Monad
cudaError :: String -> IO a

-- | Return the descriptive string associated with a particular error code
describe :: (Status) -> (String)

-- | A specially formatted error message
requireSDK :: Double -> String -> IO a

-- | Return the results of a function on successful execution, otherwise
--   return the error string associated with the return code
resultIfOk :: (Status, a) -> IO a

-- | Return the error string associated with an unsuccessful return code,
--   otherwise Nothing
nothingIfOk :: Status -> IO ()
instance Typeable CUDAException
instance Eq Status
instance Show Status
instance Show CUDAException
instance Exception CUDAException
instance Enum Status


-- | Utility functions
module Foreign.CUDA.Runtime.Utils

-- | Return the version number of the installed CUDA driver
runtimeVersion :: IO Int

-- | Return the version number of the installed CUDA runtime
driverVersion :: IO Int


-- | Common device functions
module Foreign.CUDA.Analysis.Device
data Compute
Compute :: !Int -> !Int -> Compute

-- | The compute mode the device is currently in
data ComputeMode
Default :: ComputeMode
Exclusive :: ComputeMode
Prohibited :: ComputeMode
ExclusiveProcess :: ComputeMode

-- | The properties of a compute device
data DeviceProperties
DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !Bool -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> DeviceProperties

-- | Identifier
deviceName :: DeviceProperties -> !String

-- | Supported compute capability
computeCapability :: DeviceProperties -> !Compute

-- | Available global memory on the device in bytes
totalGlobalMem :: DeviceProperties -> !Int64

-- | Available constant memory on the device in bytes
totalConstMem :: DeviceProperties -> !Int64

-- | Available shared memory per block in bytes
sharedMemPerBlock :: DeviceProperties -> !Int64

-- | 32-bit registers per block
regsPerBlock :: DeviceProperties -> !Int

-- | Warp size in threads (SIMD width)
warpSize :: DeviceProperties -> !Int

-- | Max number of threads per block
maxThreadsPerBlock :: DeviceProperties -> !Int

-- | Max number of threads per multiprocessor
maxThreadsPerMultiProcessor :: DeviceProperties -> !Int

-- | Max size of each dimension of a block
maxBlockSize :: DeviceProperties -> !(Int, Int, Int)

-- | Max size of each dimension of a grid
maxGridSize :: DeviceProperties -> !(Int, Int, Int)

-- | Maximum texture dimensions
maxTextureDim1D :: DeviceProperties -> !Int
maxTextureDim2D :: DeviceProperties -> !(Int, Int)
maxTextureDim3D :: DeviceProperties -> !(Int, Int, Int)

-- | Clock frequency in kilohertz
clockRate :: DeviceProperties -> !Int

-- | Number of multiprocessors on the device
multiProcessorCount :: DeviceProperties -> !Int

-- | Max pitch in bytes allowed by memory copies
memPitch :: DeviceProperties -> !Int64

-- | Global memory bus width in bits
memBusWidth :: DeviceProperties -> !Int

-- | Peak memory clock frequency in kilohertz
memClockRate :: DeviceProperties -> !Int

-- | Alignment requirement for textures
textureAlignment :: DeviceProperties -> !Int64
computeMode :: DeviceProperties -> !ComputeMode

-- | Device can concurrently copy memory and execute a kernel
deviceOverlap :: DeviceProperties -> !Bool

-- | Device can possibly execute multiple kernels concurrently
concurrentKernels :: DeviceProperties -> !Bool

-- | Device supports and has enabled error correction
eccEnabled :: DeviceProperties -> !Bool

-- | Number of asynchronous engines
asyncEngineCount :: DeviceProperties -> !Int

-- | Size of the L2 cache in bytes
cacheMemL2 :: DeviceProperties -> !Int

-- | Whether this is a Tesla device using the TCC driver
tccDriverEnabled :: DeviceProperties -> !Bool

-- | PCI device information for the device
pciInfo :: DeviceProperties -> !PCI

-- | Whether there is a runtime limit on kernels
kernelExecTimeoutEnabled :: DeviceProperties -> !Bool

-- | As opposed to discrete
integrated :: DeviceProperties -> !Bool

-- | Device can use pinned memory
canMapHostMemory :: DeviceProperties -> !Bool

-- | Device shares a unified address space with the host
unifiedAddressing :: DeviceProperties -> !Bool
data DeviceResources
DeviceResources :: !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Allocation -> DeviceResources

-- | Warp size
threadsPerWarp :: DeviceResources -> !Int

-- | Maximum number of in-flight threads on a multiprocessor
threadsPerMP :: DeviceResources -> !Int

-- | Maximum number of thread blocks resident on a multiprocessor
threadBlocksPerMP :: DeviceResources -> !Int

-- | Maximum number of in-flight warps per multiprocessor
warpsPerMP :: DeviceResources -> !Int

-- | Number of SIMD arithmetic units per multiprocessor
coresPerMP :: DeviceResources -> !Int

-- | Total amount of shared memory per multiprocessor (bytes)
sharedMemPerMP :: DeviceResources -> !Int

-- | Shared memory allocation unit size (bytes)
sharedMemAllocUnit :: DeviceResources -> !Int

-- | Total number of registers in a multiprocessor
regFileSize :: DeviceResources -> !Int

-- | Register allocation unit size
regAllocUnit :: DeviceResources -> !Int

-- | Register allocation granularity for warps
regAllocWarp :: DeviceResources -> !Int

-- | Maximum number of registers per thread
regPerThread :: DeviceResources -> !Int

-- | How multiprocessor resources are divided
allocation :: DeviceResources -> !Allocation
data Allocation
Warp :: Allocation
Block :: Allocation
data PCI
PCI :: !Int -> !Int -> !Int -> PCI

-- | PCI bus ID of the device
busID :: PCI -> !Int

-- | PCI device ID
deviceID :: PCI -> !Int

-- | PCI domain ID
domainID :: PCI -> !Int

-- | Extract some additional hardware resource limitations for a given
--   device.
deviceResources :: DeviceProperties -> DeviceResources
instance Eq ComputeMode
instance Show ComputeMode
instance Eq Compute
instance Show PCI
instance Show DeviceProperties
instance Ord Compute
instance Show Compute
instance Enum ComputeMode


-- | Occupancy calculations for CUDA kernels
--   
--   
--   <a>http://developer.download.nvidia.com/compute/cuda/3_0/sdk/docs/CUDA_Occupancy_calculator.xls</a>
--   
--   <i>Determining Registers Per Thread and Shared Memory Per Block</i>
--   
--   To determine the number of registers used per thread in your kernel,
--   simply compile the kernel code using the option
--   
--   <pre>
--   --ptxas-options=-v
--   </pre>
--   
--   to nvcc. This will output information about register, local memory,
--   shared memory, and constant memory usage for each kernel in the
--   <tt>.cu</tt> file. Alternatively, you can compile with the
--   <tt>-cubin</tt> option to nvcc. This will generate a <tt>.cubin</tt>
--   file, which you can open in a text editor. Look for the <tt>code</tt>
--   section with your kernel's name. Within the curly braces (<tt>{ ...
--   }</tt>) for that code block, you will see a line with <tt>reg =
--   X</tt>, where <tt>x</tt> is the number of registers used by your
--   kernel. You can also see the amount of shared memory used as <tt>smem
--   = Y</tt>. However, if your kernel declares any external shared memory
--   that is allocated dynamically, you will need to add the number in the
--   <tt>.cubin</tt> file to the amount you dynamically allocate at run
--   time to get the correct shared memory usage.
--   
--   <i>Notes About Occupancy</i>
--   
--   Higher occupancy does not necessarily mean higher performance. If a
--   kernel is not bandwidth bound, then increasing occupancy will not
--   necessarily increase performance. If a kernel invocation is already
--   running at least one thread block per multiprocessor in the GPU, and
--   it is bottlenecked by computation and not by global memory accesses,
--   then increasing occupancy may have no effect. In fact, making changes
--   just to increase occupancy can have other effects, such as additional
--   instructions, spills to local memory (which is off chip), divergent
--   branches, etc. As with any optimization, you should experiment to see
--   how changes affect the *wall clock time* of the kernel execution. For
--   bandwidth bound applications, on the other hand, increasing occupancy
--   can help better hide the latency of memory accesses, and therefore
--   improve performance.
module Foreign.CUDA.Analysis.Occupancy
data Occupancy
Occupancy :: !Int -> !Int -> !Int -> !Double -> Occupancy

-- | Active threads per multiprocessor
activeThreads :: Occupancy -> !Int

-- | Active thread blocks per multiprocessor
activeThreadBlocks :: Occupancy -> !Int

-- | Active warps per multiprocessor
activeWarps :: Occupancy -> !Int

-- | Occupancy of each multiprocessor (percent)
occupancy100 :: Occupancy -> !Double

-- | Calculate occupancy data for a given GPU and kernel resource usage
occupancy :: DeviceProperties -> Int -> Int -> Int -> Occupancy

-- | Optimise multiprocessor occupancy as a function of thread block size
--   and resource usage. This returns the smallest satisfying block size in
--   increments of a single warp.
optimalBlockSize :: DeviceProperties -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)

-- | As <a>optimalBlockSize</a>, but with a generator that produces the
--   specific thread block sizes that should be tested. The generated list
--   can produce values in any order, but the last satisfying block size
--   will be returned. Hence, values should be monotonically decreasing to
--   return the smallest block size yielding maximum occupancy, and
--   vice-versa.
optimalBlockSizeBy :: DeviceProperties -> (DeviceProperties -> [Int]) -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)

-- | Determine the maximum number of CTAs that can be run simultaneously
--   for a given kernel / device combination.
maxResidentBlocks :: DeviceProperties -> Int -> Int -> Int -> Int

-- | Increments in powers-of-two, over the range of supported thread block
--   sizes for the given device.
incPow2 :: DeviceProperties -> [Int]

-- | Increments in the warp size of the device, over the range of supported
--   thread block sizes.
incWarp :: DeviceProperties -> [Int]

-- | Decrements in powers-of-two, over the range of supported thread block
--   sizes for the given device.
decPow2 :: DeviceProperties -> [Int]

-- | Decrements in the warp size of the device, over the range of supported
--   thread block sizes.
decWarp :: DeviceProperties -> [Int]
instance Eq Occupancy
instance Ord Occupancy
instance Show Occupancy


-- | Device management routines
module Foreign.CUDA.Runtime.Device

-- | A device identifier
type Device = Int

-- | Device execution flags
data DeviceFlag
ScheduleAuto :: DeviceFlag
ScheduleSpin :: DeviceFlag
ScheduleYield :: DeviceFlag
BlockingSync :: DeviceFlag
MapHost :: DeviceFlag
LMemResizeToMax :: DeviceFlag

-- | The properties of a compute device
data DeviceProperties
DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !Bool -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> DeviceProperties

-- | Identifier
deviceName :: DeviceProperties -> !String

-- | Supported compute capability
computeCapability :: DeviceProperties -> !Compute

-- | Available global memory on the device in bytes
totalGlobalMem :: DeviceProperties -> !Int64

-- | Available constant memory on the device in bytes
totalConstMem :: DeviceProperties -> !Int64

-- | Available shared memory per block in bytes
sharedMemPerBlock :: DeviceProperties -> !Int64

-- | 32-bit registers per block
regsPerBlock :: DeviceProperties -> !Int

-- | Warp size in threads (SIMD width)
warpSize :: DeviceProperties -> !Int

-- | Max number of threads per block
maxThreadsPerBlock :: DeviceProperties -> !Int

-- | Max number of threads per multiprocessor
maxThreadsPerMultiProcessor :: DeviceProperties -> !Int

-- | Max size of each dimension of a block
maxBlockSize :: DeviceProperties -> !(Int, Int, Int)

-- | Max size of each dimension of a grid
maxGridSize :: DeviceProperties -> !(Int, Int, Int)

-- | Maximum texture dimensions
maxTextureDim1D :: DeviceProperties -> !Int
maxTextureDim2D :: DeviceProperties -> !(Int, Int)
maxTextureDim3D :: DeviceProperties -> !(Int, Int, Int)

-- | Clock frequency in kilohertz
clockRate :: DeviceProperties -> !Int

-- | Number of multiprocessors on the device
multiProcessorCount :: DeviceProperties -> !Int

-- | Max pitch in bytes allowed by memory copies
memPitch :: DeviceProperties -> !Int64

-- | Global memory bus width in bits
memBusWidth :: DeviceProperties -> !Int

-- | Peak memory clock frequency in kilohertz
memClockRate :: DeviceProperties -> !Int

-- | Alignment requirement for textures
textureAlignment :: DeviceProperties -> !Int64
computeMode :: DeviceProperties -> !ComputeMode

-- | Device can concurrently copy memory and execute a kernel
deviceOverlap :: DeviceProperties -> !Bool

-- | Device can possibly execute multiple kernels concurrently
concurrentKernels :: DeviceProperties -> !Bool

-- | Device supports and has enabled error correction
eccEnabled :: DeviceProperties -> !Bool

-- | Number of asynchronous engines
asyncEngineCount :: DeviceProperties -> !Int

-- | Size of the L2 cache in bytes
cacheMemL2 :: DeviceProperties -> !Int

-- | Whether this is a Tesla device using the TCC driver
tccDriverEnabled :: DeviceProperties -> !Bool

-- | PCI device information for the device
pciInfo :: DeviceProperties -> !PCI

-- | Whether there is a runtime limit on kernels
kernelExecTimeoutEnabled :: DeviceProperties -> !Bool

-- | As opposed to discrete
integrated :: DeviceProperties -> !Bool

-- | Device can use pinned memory
canMapHostMemory :: DeviceProperties -> !Bool

-- | Device shares a unified address space with the host
unifiedAddressing :: DeviceProperties -> !Bool
data Compute
Compute :: !Int -> !Int -> Compute

-- | The compute mode the device is currently in
data ComputeMode
Default :: ComputeMode
Exclusive :: ComputeMode
Prohibited :: ComputeMode
ExclusiveProcess :: ComputeMode

-- | Select the compute device which best matches the given criteria
choose :: DeviceProperties -> IO Device

-- | Returns which device is currently being used
get :: IO Device

-- | Returns the number of devices available for execution, with compute
--   capability &gt;= 1.0
count :: IO Int

-- | Return information about the selected compute device
props :: Device -> IO DeviceProperties

-- | Set device to be used for GPU execution
set :: Device -> IO ()

-- | Set flags to be used for device executions
setFlags :: [DeviceFlag] -> IO ()

-- | Set list of devices for CUDA execution in priority order
setOrder :: [Device] -> IO ()

-- | Explicitly destroys and cleans up all runtime resources associated
--   with the current device in the current process. Any subsequent API
--   call will reinitialise the device.
--   
--   Note that this function will reset the device immediately. It is the
--   caller’s responsibility to ensure that the device is not being
--   accessed by any other host threads from the process when this function
--   is called.
reset :: IO ()

-- | Block until the device has completed all preceding requested tasks.
--   Returns an error if one of the tasks fails.
sync :: IO ()

-- | Possible option values for direct peer memory access
data PeerFlag

-- | Queries if the first device can directly access the memory of the
--   second. If direct access is possible, it can then be enabled with
--   <a>add</a>. Requires cuda-4.0.
accessible :: Device -> Device -> IO Bool

-- | If the devices of both the current and supplied contexts support
--   unified addressing, then enable allocations in the supplied context to
--   be accessible by the current context. Requires cuda-4.0.
add :: Device -> [PeerFlag] -> IO ()

-- | Disable direct memory access from the current context to the supplied
--   context. Requires cuda-4.0.
remove :: Device -> IO ()

-- | Device limit flags
data Limit
Stacksize :: Limit
Printffifosize :: Limit
Mallocheapsize :: Limit
Devruntimesyncdepth :: Limit
Devruntimependinglaunchcount :: Limit

-- | Query compute 2.0 call stack limits. Requires cuda-3.1.
getLimit :: Limit -> IO Int

-- | Set compute 2.0 call stack limits. Requires cuda-3.1.
setLimit :: Limit -> Int -> IO ()
instance Eq DeviceFlag
instance Show DeviceFlag
instance Eq Limit
instance Show Limit
instance Enum Limit
instance Enum PeerFlag
instance Storable DeviceProperties
instance Enum DeviceFlag


-- | Device management for low-level driver interface
module Foreign.CUDA.Driver.Device
newtype Device
Device :: (CInt) -> Device
useDevice :: Device -> (CInt)

-- | The properties of a compute device
data DeviceProperties
DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !Bool -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> DeviceProperties

-- | Identifier
deviceName :: DeviceProperties -> !String

-- | Supported compute capability
computeCapability :: DeviceProperties -> !Compute

-- | Available global memory on the device in bytes
totalGlobalMem :: DeviceProperties -> !Int64

-- | Available constant memory on the device in bytes
totalConstMem :: DeviceProperties -> !Int64

-- | Available shared memory per block in bytes
sharedMemPerBlock :: DeviceProperties -> !Int64

-- | 32-bit registers per block
regsPerBlock :: DeviceProperties -> !Int

-- | Warp size in threads (SIMD width)
warpSize :: DeviceProperties -> !Int

-- | Max number of threads per block
maxThreadsPerBlock :: DeviceProperties -> !Int

-- | Max number of threads per multiprocessor
maxThreadsPerMultiProcessor :: DeviceProperties -> !Int

-- | Max size of each dimension of a block
maxBlockSize :: DeviceProperties -> !(Int, Int, Int)

-- | Max size of each dimension of a grid
maxGridSize :: DeviceProperties -> !(Int, Int, Int)

-- | Maximum texture dimensions
maxTextureDim1D :: DeviceProperties -> !Int
maxTextureDim2D :: DeviceProperties -> !(Int, Int)
maxTextureDim3D :: DeviceProperties -> !(Int, Int, Int)

-- | Clock frequency in kilohertz
clockRate :: DeviceProperties -> !Int

-- | Number of multiprocessors on the device
multiProcessorCount :: DeviceProperties -> !Int

-- | Max pitch in bytes allowed by memory copies
memPitch :: DeviceProperties -> !Int64

-- | Global memory bus width in bits
memBusWidth :: DeviceProperties -> !Int

-- | Peak memory clock frequency in kilohertz
memClockRate :: DeviceProperties -> !Int

-- | Alignment requirement for textures
textureAlignment :: DeviceProperties -> !Int64
computeMode :: DeviceProperties -> !ComputeMode

-- | Device can concurrently copy memory and execute a kernel
deviceOverlap :: DeviceProperties -> !Bool

-- | Device can possibly execute multiple kernels concurrently
concurrentKernels :: DeviceProperties -> !Bool

-- | Device supports and has enabled error correction
eccEnabled :: DeviceProperties -> !Bool

-- | Number of asynchronous engines
asyncEngineCount :: DeviceProperties -> !Int

-- | Size of the L2 cache in bytes
cacheMemL2 :: DeviceProperties -> !Int

-- | Whether this is a Tesla device using the TCC driver
tccDriverEnabled :: DeviceProperties -> !Bool

-- | PCI device information for the device
pciInfo :: DeviceProperties -> !PCI

-- | Whether there is a runtime limit on kernels
kernelExecTimeoutEnabled :: DeviceProperties -> !Bool

-- | As opposed to discrete
integrated :: DeviceProperties -> !Bool

-- | Device can use pinned memory
canMapHostMemory :: DeviceProperties -> !Bool

-- | Device shares a unified address space with the host
unifiedAddressing :: DeviceProperties -> !Bool

-- | Device attributes
data DeviceAttribute
MaxThreadsPerBlock :: DeviceAttribute
MaxBlockDimX :: DeviceAttribute
MaxBlockDimY :: DeviceAttribute
MaxBlockDimZ :: DeviceAttribute
MaxGridDimX :: DeviceAttribute
MaxGridDimY :: DeviceAttribute
MaxGridDimZ :: DeviceAttribute
MaxSharedMemoryPerBlock :: DeviceAttribute
SharedMemoryPerBlock :: DeviceAttribute
TotalConstantMemory :: DeviceAttribute
WarpSize :: DeviceAttribute
MaxPitch :: DeviceAttribute
MaxRegistersPerBlock :: DeviceAttribute
RegistersPerBlock :: DeviceAttribute
ClockRate :: DeviceAttribute
TextureAlignment :: DeviceAttribute
GpuOverlap :: DeviceAttribute
MultiprocessorCount :: DeviceAttribute
KernelExecTimeout :: DeviceAttribute
Integrated :: DeviceAttribute
CanMapHostMemory :: DeviceAttribute
ComputeMode :: DeviceAttribute
MaximumTexture1dWidth :: DeviceAttribute
MaximumTexture2dWidth :: DeviceAttribute
MaximumTexture2dHeight :: DeviceAttribute
MaximumTexture3dWidth :: DeviceAttribute
MaximumTexture3dHeight :: DeviceAttribute
MaximumTexture3dDepth :: DeviceAttribute
MaximumTexture2dLayeredWidth :: DeviceAttribute
MaximumTexture2dArrayWidth :: DeviceAttribute
MaximumTexture2dLayeredHeight :: DeviceAttribute
MaximumTexture2dArrayHeight :: DeviceAttribute
MaximumTexture2dLayeredLayers :: DeviceAttribute
MaximumTexture2dArrayNumslices :: DeviceAttribute
SurfaceAlignment :: DeviceAttribute
ConcurrentKernels :: DeviceAttribute
EccEnabled :: DeviceAttribute
PciBusId :: DeviceAttribute
PciDeviceId :: DeviceAttribute
TccDriver :: DeviceAttribute
MemoryClockRate :: DeviceAttribute
GlobalMemoryBusWidth :: DeviceAttribute
L2CacheSize :: DeviceAttribute
MaxThreadsPerMultiprocessor :: DeviceAttribute
AsyncEngineCount :: DeviceAttribute
UnifiedAddressing :: DeviceAttribute
MaximumTexture1dLayeredWidth :: DeviceAttribute
MaximumTexture1dLayeredLayers :: DeviceAttribute
CanTex2dGather :: DeviceAttribute
MaximumTexture2dGatherWidth :: DeviceAttribute
MaximumTexture2dGatherHeight :: DeviceAttribute
MaximumTexture3dWidthAlternate :: DeviceAttribute
MaximumTexture3dHeightAlternate :: DeviceAttribute
MaximumTexture3dDepthAlternate :: DeviceAttribute
PciDomainId :: DeviceAttribute
TexturePitchAlignment :: DeviceAttribute
MaximumTexturecubemapWidth :: DeviceAttribute
MaximumTexturecubemapLayeredWidth :: DeviceAttribute
MaximumTexturecubemapLayeredLayers :: DeviceAttribute
MaximumSurface1dWidth :: DeviceAttribute
MaximumSurface2dWidth :: DeviceAttribute
MaximumSurface2dHeight :: DeviceAttribute
MaximumSurface3dWidth :: DeviceAttribute
MaximumSurface3dHeight :: DeviceAttribute
MaximumSurface3dDepth :: DeviceAttribute
MaximumSurface1dLayeredWidth :: DeviceAttribute
MaximumSurface1dLayeredLayers :: DeviceAttribute
MaximumSurface2dLayeredWidth :: DeviceAttribute
MaximumSurface2dLayeredHeight :: DeviceAttribute
MaximumSurface2dLayeredLayers :: DeviceAttribute
MaximumSurfacecubemapWidth :: DeviceAttribute
MaximumSurfacecubemapLayeredWidth :: DeviceAttribute
MaximumSurfacecubemapLayeredLayers :: DeviceAttribute
MaximumTexture1dLinearWidth :: DeviceAttribute
MaximumTexture2dLinearWidth :: DeviceAttribute
MaximumTexture2dLinearHeight :: DeviceAttribute
MaximumTexture2dLinearPitch :: DeviceAttribute
MaximumTexture2dMipmappedWidth :: DeviceAttribute
MaximumTexture2dMipmappedHeight :: DeviceAttribute
ComputeCapabilityMajor :: DeviceAttribute
ComputeCapabilityMinor :: DeviceAttribute
MaximumTexture1dMipmappedWidth :: DeviceAttribute
StreamPrioritiesSupported :: DeviceAttribute
GlobalL1CacheSupported :: DeviceAttribute
LocalL1CacheSupported :: DeviceAttribute
MaxSharedMemoryPerMultiprocessor :: DeviceAttribute
MaxRegistersPerMultiprocessor :: DeviceAttribute
ManagedMemory :: DeviceAttribute
MultiGpuBoard :: DeviceAttribute
MultiGpuBoardGroupId :: DeviceAttribute
CU_DEVICE_ATTRIBUTE_MAX :: DeviceAttribute
data Compute
Compute :: !Int -> !Int -> Compute

-- | The compute mode the device is currently in
data ComputeMode
Default :: ComputeMode
Exclusive :: ComputeMode
Prohibited :: ComputeMode
ExclusiveProcess :: ComputeMode

-- | Possible option flags for CUDA initialisation. Dummy instance until
--   the API exports actual option values.
data InitFlag

-- | Initialise the CUDA driver API. Must be called before any other driver
--   function.
initialise :: [InitFlag] -> IO ()
capability :: Device -> IO Compute

-- | Return a device handle
device :: Int -> IO Device

-- | Return the selected attribute for the given device
attribute :: Device -> DeviceAttribute -> IO Int

-- | Return the number of device with compute capability &gt; 1.0
count :: IO Int

-- | Name of the device
name :: Device -> IO String

-- | Return the properties of the selected device
props :: Device -> IO DeviceProperties

-- | Total memory available on the device (bytes)
totalMem :: Device -> IO Int64
instance Eq Device
instance Show Device
instance Eq DeviceAttribute
instance Show DeviceAttribute
instance Show CUDevProp
instance Enum InitFlag
instance Storable CUDevProp
instance Enum DeviceAttribute


-- | Context management for low-level driver interface
module Foreign.CUDA.Driver.Context

-- | A device context
newtype Context
Context :: ((Ptr ())) -> Context
useContext :: Context -> ((Ptr ()))

-- | Context creation flags
data ContextFlag
SchedAuto :: ContextFlag
SchedSpin :: ContextFlag
SchedYield :: ContextFlag
SchedBlockingSync :: ContextFlag

-- | <i>Deprecated: use SchedBlockingSync instead</i>
BlockingSync :: ContextFlag
SchedMask :: ContextFlag
MapHost :: ContextFlag
LmemResizeToMax :: ContextFlag
FlagsMask :: ContextFlag

-- | Create a new CUDA context and associate it with the calling thread
create :: Device -> [ContextFlag] -> IO Context

-- | Increments the usage count of the context. API: no context flags are
--   currently supported, so this parameter must be empty.

-- | <i>Deprecated: deprecated as of CUDA-4.0</i>
attach :: Context -> [ContextFlag] -> IO ()

-- | Detach the context, and destroy if no longer used

-- | <i>Deprecated: deprecated as of CUDA-4.0</i>
detach :: Context -> IO ()

-- | Destroy the specified context. This fails if the context is more than
--   a single attachment (including that from initial creation).
destroy :: Context -> IO ()

-- | Return the device of the currently active context
device :: IO Device

-- | Pop the current CUDA context from the CPU thread. The context must
--   have a single usage count (matching calls to <a>attach</a> and
--   <a>detach</a>). If successful, the new context is returned, and the
--   old may be attached to a different CPU.
pop :: IO Context

-- | Push the given context onto the CPU's thread stack of current
--   contexts. The context must be floating (via <a>pop</a>), i.e. not
--   attached to any thread.
push :: Context -> IO ()

-- | Block until the device has completed all preceding requests
sync :: IO ()

-- | Return the context bound to the calling CPU thread. Requires cuda-4.0.
get :: IO Context

-- | Bind the specified context to the calling thread. Requires cuda-4.0.
set :: Context -> IO ()

-- | Possible option values for direct peer memory access
data PeerFlag

-- | Queries if the first device can directly access the memory of the
--   second. If direct access is possible, it can then be enabled with
--   <a>add</a>. Requires cuda-4.0.
accessible :: Device -> Device -> IO Bool

-- | If the devices of both the current and supplied contexts support
--   unified addressing, then enable allocations in the supplied context to
--   be accessible by the current context. Requires cuda-4.0.
add :: Context -> [PeerFlag] -> IO ()

-- | Disable direct memory access from the current context to the supplied
--   context. Requires cuda-4.0.
remove :: Context -> IO ()

-- | Device cache configuration preference
data Cache
PreferNone :: Cache
PreferShared :: Cache
PreferL1 :: Cache
PreferEqual :: Cache

-- | Device limits flags
data Limit
StackSize :: Limit
PrintfFifoSize :: Limit
MallocHeapSize :: Limit
DevRuntimeSyncDepth :: Limit
DevRuntimePendingLaunchCount :: Limit
Max :: Limit

-- | Query compute 2.0 call stack limits. Requires cuda-3.1.
getLimit :: Limit -> IO Int

-- | Specify the size of the call stack, for compute 2.0 devices. Requires
--   cuda-3.1.
setLimit :: Limit -> Int -> IO ()

-- | On devices where the L1 cache and shared memory use the same hardware
--   resources, this sets the preferred cache configuration for the current
--   context. This is only a preference. Requires cuda-3.2.
setCacheConfig :: Cache -> IO ()
instance Eq Context
instance Show Context
instance Eq ContextFlag
instance Show ContextFlag
instance Eq Limit
instance Show Limit
instance Eq Cache
instance Show Cache
instance Enum PeerFlag
instance Enum Cache
instance Enum Limit
instance Enum ContextFlag


-- | Meta-module exporting CUDA analysis routines
module Foreign.CUDA.Analysis


-- | Data types that are equivalent and can be shared freely between the
--   CUDA Runtime and Driver APIs.
module Foreign.CUDA.Types

-- | A reference to data stored on the device.
newtype DevicePtr a
DevicePtr :: Ptr a -> DevicePtr a
useDevicePtr :: DevicePtr a -> Ptr a

-- | A reference to page-locked host memory.
--   
--   A <a>HostPtr</a> is just a plain <a>Ptr</a>, but the memory has been
--   allocated by CUDA into page locked memory. This means that the data
--   can be copied to the GPU via DMA (direct memory access). Note that the
--   use of the system function <tt>mlock</tt> is not sufficient here ---
--   the CUDA version ensures that the <i>physical</i> address stays this
--   same, not just the virtual address.
--   
--   To copy data into a <a>HostPtr</a> array, you may use for example
--   <tt>withHostPtr</tt> together with <a>copyArray</a> or
--   <a>moveArray</a>.
newtype HostPtr a
HostPtr :: Ptr a -> HostPtr a
useHostPtr :: HostPtr a -> Ptr a

-- | Events are markers that can be inserted into the CUDA execution stream
--   and later queried.
newtype Event
Event :: ((Ptr ())) -> Event
useEvent :: Event -> ((Ptr ()))

-- | Event creation flags
data EventFlag
Default :: EventFlag
BlockingSync :: EventFlag
DisableTiming :: EventFlag
Interprocess :: EventFlag
data WaitFlag

-- | A processing stream. All operations in a stream are synchronous and
--   executed in sequence, but operations in different non-default streams
--   may happen out-of-order or concurrently with one another.
--   
--   Use <a>Event</a>s to synchronise operations between streams.
newtype Stream
Stream :: ((Ptr ())) -> Stream
useStream :: Stream -> ((Ptr ()))

-- | Possible option flags for stream initialisation. Dummy instance until
--   the API exports actual option values.
data StreamFlag

-- | The main execution stream. No operations overlap with operations in
--   the default stream.
defaultStream :: Stream
instance Eq (DevicePtr a)
instance Ord (DevicePtr a)
instance Eq (HostPtr a)
instance Ord (HostPtr a)
instance Eq Event
instance Show Event
instance Eq EventFlag
instance Show EventFlag
instance Eq Stream
instance Show Stream
instance Enum StreamFlag
instance Enum WaitFlag
instance Enum EventFlag
instance Storable (HostPtr a)
instance Show (HostPtr a)
instance Storable (DevicePtr a)
instance Show (DevicePtr a)


-- | Event management for C-for-CUDA runtime environment
module Foreign.CUDA.Runtime.Event

-- | Events are markers that can be inserted into the CUDA execution stream
--   and later queried.
data Event

-- | Event creation flags
data EventFlag
Default :: EventFlag
BlockingSync :: EventFlag
DisableTiming :: EventFlag
Interprocess :: EventFlag
data WaitFlag

-- | Create a new event
create :: [EventFlag] -> IO Event

-- | Destroy an event
destroy :: Event -> IO ()

-- | Determine the elapsed time (in milliseconds) between two events
elapsedTime :: Event -> Event -> IO Float

-- | Determines if a event has actually been recorded
query :: Event -> IO Bool

-- | Record an event once all operations in the current context (or
--   optionally specified stream) have completed. This operation is
--   asynchronous.
record :: Event -> Maybe Stream -> IO ()

-- | Makes all future work submitted to the (optional) stream wait until
--   the given event reports completion before beginning execution.
--   Synchronisation is performed on the device, including when the event
--   and stream are from different device contexts. Requires cuda-3.2.
wait :: Event -> Maybe Stream -> [WaitFlag] -> IO ()

-- | Wait until the event has been recorded
block :: Event -> IO ()


-- | Stream management routines
module Foreign.CUDA.Runtime.Stream

-- | A processing stream. All operations in a stream are synchronous and
--   executed in sequence, but operations in different non-default streams
--   may happen out-of-order or concurrently with one another.
--   
--   Use <a>Event</a>s to synchronise operations between streams.
newtype Stream
Stream :: ((Ptr ())) -> Stream
useStream :: Stream -> ((Ptr ()))

-- | Create a new asynchronous stream
create :: IO Stream

-- | Destroy and clean up an asynchronous stream
destroy :: Stream -> IO ()

-- | Determine if all operations in a stream have completed
finished :: Stream -> IO Bool

-- | Block until all operations in a Stream have been completed
block :: Stream -> IO ()

-- | The main execution stream. No operations overlap with operations in
--   the default stream.
defaultStream :: Stream


-- | Kernel execution control for C-for-CUDA runtime interface
module Foreign.CUDA.Runtime.Exec

-- | A <tt><b>global</b></tt> device function.
--   
--   Note that the use of a string naming a function was deprecated in CUDA
--   4.1 and removed in CUDA 5.0.
type Fun = FunPtr ()
data FunAttributes
FunAttributes :: !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> FunAttributes
constSizeBytes :: FunAttributes -> !Int64
localSizeBytes :: FunAttributes -> !Int64
sharedSizeBytes :: FunAttributes -> !Int64

-- | maximum block size that can be successively launched (based on
--   register usage)
maxKernelThreadsPerBlock :: FunAttributes -> !Int

-- | number of registers required for each thread
numRegs :: FunAttributes -> !Int

-- | Kernel function parameters. Doubles will be converted to an internal
--   float representation on devices that do not support doubles natively.
data FunParam
IArg :: !Int -> FunParam
FArg :: !Float -> FunParam
DArg :: !Double -> FunParam
VArg :: !a -> FunParam

-- | Cache configuration preference
data CacheConfig
None :: CacheConfig
Shared :: CacheConfig
L1 :: CacheConfig
Equal :: CacheConfig

-- | Obtain the attributes of the named <tt><b>global</b></tt> device
--   function. This itemises the requirements to successfully launch the
--   given kernel.
attributes :: Fun -> IO FunAttributes

-- | Specify the grid and block dimensions for a device call. Used in
--   conjunction with <a>setParams</a>, this pushes data onto the execution
--   stack that will be popped when a function is <a>launch</a>ed.
setConfig :: (Int, Int) -> (Int, Int, Int) -> Int64 -> Maybe Stream -> IO ()

-- | Set the argument parameters that will be passed to the next kernel
--   invocation. This is used in conjunction with <a>setConfig</a> to
--   control kernel execution.
setParams :: [FunParam] -> IO ()

-- | On devices where the L1 cache and shared memory use the same hardware
--   resources, this sets the preferred cache configuration for the given
--   device function. This is only a preference; the driver is free to
--   choose a different configuration as required to execute the function.
--   
--   Switching between configuration modes may insert a device-side
--   synchronisation point for streamed kernel launches
setCacheConfig :: Fun -> CacheConfig -> IO ()

-- | Invoke the <tt><b>global</b></tt> kernel function on the device. This
--   must be preceded by a call to <a>setConfig</a> and (if appropriate)
--   <a>setParams</a>.
launch :: Fun -> IO ()

-- | Invoke a kernel on a <tt>(gx * gy)</tt> grid of blocks, where each
--   block contains <tt>(tx * ty * tz)</tt> threads and has access to a
--   given number of bytes of shared memory. The launch may also be
--   associated with a specific <a>Stream</a>.
launchKernel :: Fun -> (Int, Int) -> (Int, Int, Int) -> Int64 -> Maybe Stream -> [FunParam] -> IO ()
instance Show FunAttributes
instance Eq CacheConfig
instance Show CacheConfig
instance Enum CacheConfig
instance Storable FunAttributes


-- | Event management for low-level driver interface
module Foreign.CUDA.Driver.Event

-- | Events are markers that can be inserted into the CUDA execution stream
--   and later queried.
newtype Event
Event :: ((Ptr ())) -> Event
useEvent :: Event -> ((Ptr ()))

-- | Event creation flags
data EventFlag
Default :: EventFlag
BlockingSync :: EventFlag
DisableTiming :: EventFlag
Interprocess :: EventFlag
data WaitFlag

-- | Create a new event
create :: [EventFlag] -> IO Event

-- | Destroy an event
destroy :: Event -> IO ()

-- | Determine the elapsed time (in milliseconds) between two events
elapsedTime :: Event -> Event -> IO Float

-- | Determines if a event has actually been recorded
query :: Event -> IO Bool

-- | Record an event once all operations in the current context (or
--   optionally specified stream) have completed. This operation is
--   asynchronous.
record :: Event -> Maybe Stream -> IO ()

-- | Makes all future work submitted to the (optional) stream wait until
--   the given event reports completion before beginning execution.
--   Synchronisation is performed on the device, including when the event
--   and stream are from different device contexts. Requires cuda-3.2.
wait :: Event -> Maybe Stream -> [WaitFlag] -> IO ()

-- | Wait until the event has been recorded
block :: Event -> IO ()


-- | Stream management for low-level driver interface
module Foreign.CUDA.Driver.Stream

-- | A processing stream. All operations in a stream are synchronous and
--   executed in sequence, but operations in different non-default streams
--   may happen out-of-order or concurrently with one another.
--   
--   Use <a>Event</a>s to synchronise operations between streams.
newtype Stream
Stream :: ((Ptr ())) -> Stream
useStream :: Stream -> ((Ptr ()))

-- | Possible option flags for stream initialisation. Dummy instance until
--   the API exports actual option values.
data StreamFlag

-- | Create a new stream
create :: [StreamFlag] -> IO Stream

-- | Destroy a stream
destroy :: Stream -> IO ()

-- | Check if all operations in the stream have completed
finished :: Stream -> IO Bool

-- | Wait until the device has completed all operations in the Stream
block :: Stream -> IO ()

-- | The main execution stream. No operations overlap with operations in
--   the default stream.
defaultStream :: Stream


-- | Kernel execution control for low-level driver interface
module Foreign.CUDA.Driver.Exec

-- | A <tt><b>global</b></tt> device function
newtype Fun
Fun :: ((Ptr ())) -> Fun
data FunParam
IArg :: !Int32 -> FunParam
FArg :: !Float -> FunParam
VArg :: !a -> FunParam

-- | Function attributes
data FunAttribute
MaxKernelThreadsPerBlock :: FunAttribute
SharedSizeBytes :: FunAttribute
ConstSizeBytes :: FunAttribute
LocalSizeBytes :: FunAttribute
NumRegs :: FunAttribute
PtxVersion :: FunAttribute
BinaryVersion :: FunAttribute
CacheModeCa :: FunAttribute
CU_FUNC_ATTRIBUTE_MAX :: FunAttribute

-- | Returns the value of the selected attribute requirement for the given
--   kernel
requires :: Fun -> FunAttribute -> IO Int

-- | Specify the <tt>(x,y,z)</tt> dimensions of the thread blocks that are
--   created when the given kernel function is launched.

-- | <i>Deprecated: use launchKernel instead</i>
setBlockShape :: Fun -> (Int, Int, Int) -> IO ()

-- | Set the number of bytes of dynamic shared memory to be available to
--   each thread block when the function is launched

-- | <i>Deprecated: use launchKernel instead</i>
setSharedSize :: Fun -> Integer -> IO ()

-- | Set the parameters that will specified next time the kernel is invoked

-- | <i>Deprecated: use launchKernel instead</i>
setParams :: Fun -> [FunParam] -> IO ()

-- | On devices where the L1 cache and shared memory use the same hardware
--   resources, this sets the preferred cache configuration for the given
--   device function. This is only a preference; the driver is free to
--   choose a different configuration as required to execute the function.
--   
--   Switching between configuration modes may insert a device-side
--   synchronisation point for streamed kernel launches.
setCacheConfigFun :: Fun -> Cache -> IO ()

-- | Invoke the kernel on a size <tt>(w,h)</tt> grid of blocks. Each block
--   contains the number of threads specified by a previous call to
--   <a>setBlockShape</a>. The launch may also be associated with a
--   specific <a>Stream</a>.

-- | <i>Deprecated: use launchKernel instead</i>
launch :: Fun -> (Int, Int) -> Maybe Stream -> IO ()

-- | Invoke a kernel on a <tt>(gx * gy * gz)</tt> grid of blocks, where
--   each block contains <tt>(tx * ty * tz)</tt> threads and has access to
--   a given number of bytes of shared memory. The launch may also be
--   associated with a specific <a>Stream</a>.
--   
--   In <a>launchKernel</a>, the number of kernel parameters and their
--   offsets and sizes do not need to be specified, as this information is
--   retrieved directly from the kernel's image. This requires the kernel
--   to have been compiled with toolchain version 3.2 or later.
--   
--   The alternative <a>launchKernel'</a> will pass the arguments in
--   directly, requiring the application to know the size and
--   alignment/padding of each kernel parameter.
launchKernel :: Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> Maybe Stream -> [FunParam] -> IO ()

-- | Invoke a kernel on a <tt>(gx * gy * gz)</tt> grid of blocks, where
--   each block contains <tt>(tx * ty * tz)</tt> threads and has access to
--   a given number of bytes of shared memory. The launch may also be
--   associated with a specific <a>Stream</a>.
--   
--   In <a>launchKernel</a>, the number of kernel parameters and their
--   offsets and sizes do not need to be specified, as this information is
--   retrieved directly from the kernel's image. This requires the kernel
--   to have been compiled with toolchain version 3.2 or later.
--   
--   The alternative <a>launchKernel'</a> will pass the arguments in
--   directly, requiring the application to know the size and
--   alignment/padding of each kernel parameter.
launchKernel' :: Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> Maybe Stream -> [FunParam] -> IO ()
instance Eq FunAttribute
instance Show FunAttribute
instance Storable FunParam
instance Enum FunAttribute


-- | Data pointers on the host and device. These can be shared freely
--   between the CUDA runtime and Driver APIs.
module Foreign.CUDA.Ptr

-- | A reference to data stored on the device.
newtype DevicePtr a
DevicePtr :: Ptr a -> DevicePtr a
useDevicePtr :: DevicePtr a -> Ptr a

-- | Look at the contents of device memory. This takes an IO action that
--   will be applied to that pointer, the result of which is returned. It
--   would be silly to return the pointer from the action.
withDevicePtr :: DevicePtr a -> (Ptr a -> IO b) -> IO b

-- | Return a unique handle associated with the given device pointer
devPtrToWordPtr :: DevicePtr a -> WordPtr

-- | Return a device pointer from the given handle
wordPtrToDevPtr :: WordPtr -> DevicePtr a

-- | The constant <a>nullDevPtr</a> contains the distinguished memory
--   location that is not associated with a valid memory location
nullDevPtr :: DevicePtr a

-- | Cast a device pointer from one type to another
castDevPtr :: DevicePtr a -> DevicePtr b

-- | Advance the pointer address by the given offset in bytes.
plusDevPtr :: DevicePtr a -> Int -> DevicePtr a

-- | Given an alignment constraint, align the device pointer to the next
--   highest address satisfying the constraint
alignDevPtr :: DevicePtr a -> Int -> DevicePtr a

-- | Compute the difference between the second and first argument. This
--   fulfils the relation
--   
--   <pre>
--   p2 == p1 `plusDevPtr` (p2 `minusDevPtr` p1)
--   </pre>
minusDevPtr :: DevicePtr a -> DevicePtr a -> Int

-- | Advance a pointer into a device array by the given number of elements
advanceDevPtr :: Storable a => DevicePtr a -> Int -> DevicePtr a

-- | A reference to page-locked host memory.
--   
--   A <a>HostPtr</a> is just a plain <a>Ptr</a>, but the memory has been
--   allocated by CUDA into page locked memory. This means that the data
--   can be copied to the GPU via DMA (direct memory access). Note that the
--   use of the system function <tt>mlock</tt> is not sufficient here ---
--   the CUDA version ensures that the <i>physical</i> address stays this
--   same, not just the virtual address.
--   
--   To copy data into a <a>HostPtr</a> array, you may use for example
--   <tt>withHostPtr</tt> together with <a>copyArray</a> or
--   <a>moveArray</a>.
newtype HostPtr a
HostPtr :: Ptr a -> HostPtr a
useHostPtr :: HostPtr a -> Ptr a

-- | Apply an IO action to the memory reference living inside the host
--   pointer object. All uses of the pointer should be inside the
--   <a>withHostPtr</a> bracket.
withHostPtr :: HostPtr a -> (Ptr a -> IO b) -> IO b

-- | The constant <a>nullHostPtr</a> contains the distinguished memory
--   location that is not associated with a valid memory location
nullHostPtr :: HostPtr a

-- | Cast a host pointer from one type to another
castHostPtr :: HostPtr a -> HostPtr b

-- | Advance the pointer address by the given offset in bytes
plusHostPtr :: HostPtr a -> Int -> HostPtr a

-- | Given an alignment constraint, align the host pointer to the next
--   highest address satisfying the constraint
alignHostPtr :: HostPtr a -> Int -> HostPtr a

-- | Compute the difference between the second and first argument
minusHostPtr :: HostPtr a -> HostPtr a -> Int

-- | Advance a pointer into a host array by a given number of elements
advanceHostPtr :: Storable a => HostPtr a -> Int -> HostPtr a


-- | Memory management for CUDA devices
module Foreign.CUDA.Runtime.Marshal

-- | Options for host allocation
data AllocFlag
Portable :: AllocFlag
DeviceMapped :: AllocFlag
WriteCombined :: AllocFlag

-- | Allocate a section of linear memory on the host which is page-locked
--   and directly accessible from the device. The storage is sufficient to
--   hold the given number of elements of a storable type. The runtime
--   system automatically accelerates calls to functions such as
--   <a>peekArrayAsync</a> and <a>pokeArrayAsync</a> that refer to
--   page-locked memory.
--   
--   Note that since the amount of pageable memory is thusly reduced,
--   overall system performance may suffer. This is best used sparingly to
--   allocate staging areas for data exchange
mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a)

-- | Free page-locked host memory previously allocated with
--   <tt>mallecHost</tt>
freeHost :: HostPtr a -> IO ()

-- | Allocate a section of linear memory on the device, and return a
--   reference to it. The memory is sufficient to hold the given number of
--   elements of storable type. It is suitable aligned, and not cleared.
mallocArray :: Storable a => Int -> IO (DevicePtr a)

-- | Execute a computation, passing a pointer to a temporarily allocated
--   block of memory sufficient to hold the given number of elements of
--   storable type. The memory is freed when the computation terminates
--   (normally or via an exception), so the pointer must not be used after
--   this.
--   
--   Note that kernel launches can be asynchronous, so you may need to add
--   a synchronisation point at the end of the computation.
allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b

-- | Free previously allocated memory on the device
free :: DevicePtr a -> IO ()

-- | Options for unified memory allocations
data AttachFlag
Global :: AttachFlag
Host :: AttachFlag
Single :: AttachFlag

-- | Allocates memory that will be automatically managed by the Unified
--   Memory system
mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a)

-- | Copy a number of elements from the device to host memory. This is a
--   synchronous operation.
peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO ()

-- | Copy memory from the device asynchronously, possibly associated with a
--   particular stream. The destination memory must be page locked.
peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO ()

-- | Copy a 2D memory area from the device to the host. This is a
--   synchronous operation.
peekArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Ptr a -> Int -> IO ()

-- | Copy a 2D memory area from the device to the host asynchronously,
--   possibly associated with a particular stream. The destination array
--   must be page locked.
peekArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> HostPtr a -> Int -> Maybe Stream -> IO ()

-- | Copy a number of elements from the device into a new Haskell list.
--   Note that this requires two memory copies: firstly from the device
--   into a heap allocated array, and from there marshalled into a list
peekListArray :: Storable a => Int -> DevicePtr a -> IO [a]

-- | Copy a number of elements onto the device. This is a synchronous
--   operation.
pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO ()

-- | Copy memory onto the device asynchronously, possibly associated with a
--   particular stream. The source memory must be page-locked.
pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO ()

-- | Copy a 2D memory area onto the device. This is a synchronous
--   operation.
pokeArray2D :: Storable a => Int -> Int -> Ptr a -> Int -> DevicePtr a -> Int -> IO ()

-- | Copy a 2D memory area onto the device asynchronously, possibly
--   associated with a particular stream. The source array must be page
--   locked.
pokeArray2DAsync :: Storable a => Int -> Int -> HostPtr a -> Int -> DevicePtr a -> Int -> Maybe Stream -> IO ()

-- | Write a list of storable elements into a device array. The array must
--   be sufficiently large to hold the entire list. This requires two
--   marshalling operations
pokeListArray :: Storable a => [a] -> DevicePtr a -> IO ()

-- | Copy the given number of elements from the first device array (source)
--   to the second (destination). The copied areas may not overlap. This
--   operation is asynchronous with respect to host, but will not overlap
--   other device operations.
copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO ()

-- | Copy the given number of elements from the first device array (source)
--   to the second (destination). The copied areas may not overlap. This
--   operation is asynchronous with respect to the host, and may be
--   associated with a particular stream.
copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO ()

-- | Copy a 2D memory area from the first device array (source) to the
--   second (destination). The copied areas may not overlap. This operation
--   is asynchronous with respect to the host, but will not overlap other
--   device operations.
copyArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> DevicePtr a -> Int -> IO ()

-- | Copy a 2D memory area from the first device array (source) to the
--   second device array (destination). The copied areas may not overlay.
--   This operation is asynchronous with respect to the host, and may be
--   associated with a particular stream.
copyArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> DevicePtr a -> Int -> Maybe Stream -> IO ()

-- | Write a list of storable elements into a newly allocated device array.
--   This is <a>newListArrayLen</a> composed with <a>fst</a>.
newListArray :: Storable a => [a] -> IO (DevicePtr a)

-- | Write a list of storable elements into a newly allocated device array,
--   returning the device pointer together with the number of elements that
--   were written. Note that this requires two copy operations: firstly
--   from a Haskell list into a heap-allocated array, and from there into
--   device memory. The array should be <a>free</a>d when no longer
--   required.
newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int)

-- | Temporarily store a list of elements into a newly allocated device
--   array. An IO action is applied to the array, the result of which is
--   returned. Similar to <a>newListArray</a>, this requires two
--   marshalling operations of the data.
--   
--   As with <a>allocaArray</a>, the memory is freed once the action
--   completes, so you should not return the pointer from the action, and
--   be sure that any asynchronous operations (such as kernel execution)
--   have completed.
withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b

-- | A variant of <a>withListArray</a> which also supplies the number of
--   elements in the array to the applied function
withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b

-- | Initialise device memory to a given 8-bit value
memset :: DevicePtr a -> Int64 -> Int8 -> IO ()
instance Eq AllocFlag
instance Show AllocFlag
instance Eq AttachFlag
instance Show AttachFlag
instance Eq CopyDirection
instance Show CopyDirection
instance Enum CopyDirection
instance Enum AttachFlag
instance Enum AllocFlag


-- | Texture references
module Foreign.CUDA.Runtime.Texture
data Texture
Texture :: !Bool -> !FilterMode -> !(AddressMode, AddressMode, AddressMode) -> !FormatDesc -> Texture

-- | access texture using normalised coordinates [0.0,1.0)
normalised :: Texture -> !Bool
filtering :: Texture -> !FilterMode
addressing :: Texture -> !(AddressMode, AddressMode, AddressMode)
format :: Texture -> !FormatDesc

-- | Texture channel format kind
data FormatKind
Signed :: FormatKind
Unsigned :: FormatKind
Float :: FormatKind
None :: FormatKind
data AddressMode
Wrap :: AddressMode
Clamp :: AddressMode
Mirror :: AddressMode
Border :: AddressMode
data FilterMode
Point :: FilterMode
Linear :: FilterMode

-- | A description of how memory read through the texture cache should be
--   interpreted, including the kind of data and the number of bits of each
--   component (x,y,z and w, respectively).
data FormatDesc
FormatDesc :: !(Int, Int, Int, Int) -> !FormatKind -> FormatDesc
depth :: FormatDesc -> !(Int, Int, Int, Int)
kind :: FormatDesc -> !FormatKind

-- | Bind the memory area associated with the device pointer to a texture
--   reference given by the named symbol. Any previously bound references
--   are unbound.
bind :: String -> Texture -> DevicePtr a -> Int64 -> IO ()

-- | Bind the two-dimensional memory area to the texture reference
--   associated with the given symbol. The size of the area is constrained
--   by (width,height) in texel units, and the row pitch in bytes. Any
--   previously bound references are unbound.
bind2D :: String -> Texture -> DevicePtr a -> (Int, Int) -> Int64 -> IO ()
instance Eq FormatKind
instance Show FormatKind
instance Eq AddressMode
instance Show AddressMode
instance Eq FilterMode
instance Show FilterMode
instance Eq FormatDesc
instance Show FormatDesc
instance Eq Texture
instance Show Texture
instance Storable Texture
instance Storable FormatDesc
instance Enum FilterMode
instance Enum AddressMode
instance Enum FormatKind


-- | Memory management for low-level driver interface
module Foreign.CUDA.Driver.Marshal

-- | Options for host allocation
data AllocFlag
Portable :: AllocFlag
DeviceMapped :: AllocFlag
WriteCombined :: AllocFlag

-- | Allocate a section of linear memory on the host which is page-locked
--   and directly accessible from the device. The storage is sufficient to
--   hold the given number of elements of a storable type.
--   
--   Note that since the amount of pageable memory is thusly reduced,
--   overall system performance may suffer. This is best used sparingly to
--   allocate staging areas for data exchange.
mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a)

-- | Free a section of page-locked host memory
freeHost :: HostPtr a -> IO ()

-- | Page-locks the specified array (on the host) and maps it for the
--   device(s) as specified by the given allocation flags. Subsequently,
--   the memory is accessed directly by the device so can be read and
--   written with much higher bandwidth than pageable memory that has not
--   been registered. The memory range is added to the same tracking
--   mechanism as <a>mallocHostArray</a> to automatically accelerate calls
--   to functions such as <a>pokeArray</a>.
--   
--   Note that page-locking excessive amounts of memory may degrade system
--   performance, since it reduces the amount of pageable memory available.
--   This is best used sparingly to allocate staging areas for data
--   exchange.
--   
--   This function is not yet implemented on Mac OS X. Requires cuda-4.0.
registerArray :: Storable a => [AllocFlag] -> Int -> Ptr a -> IO (HostPtr a)

-- | Unmaps the memory from the given pointer, and makes it pageable again.
--   
--   This function is not yet implemented on Mac OS X. Requires cuda-4.0.
unregisterArray :: HostPtr a -> IO (Ptr a)

-- | Allocate a section of linear memory on the device, and return a
--   reference to it. The memory is sufficient to hold the given number of
--   elements of storable type. It is suitably aligned for any type, and is
--   not cleared.
mallocArray :: Storable a => Int -> IO (DevicePtr a)

-- | Execute a computation on the device, passing a pointer to a
--   temporarily allocated block of memory sufficient to hold the given
--   number of elements of storable type. The memory is freed when the
--   computation terminates (normally or via an exception), so the pointer
--   must not be used after this.
--   
--   Note that kernel launches can be asynchronous, so you may want to add
--   a synchronisation point using <tt>sync</tt> as part of the
--   computation.
allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b

-- | Release a section of device memory
free :: DevicePtr a -> IO ()

-- | Options for unified memory allocations
data AttachFlag
CuMemAttachGlobal :: AttachFlag
CuMemAttachHost :: AttachFlag
CuMemAttachSingle :: AttachFlag

-- | Allocates memory that will be automatically managed by the Unified
--   Memory system
mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a)

-- | Copy a number of elements from the device to host memory. This is a
--   synchronous operation
peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO ()

-- | Copy memory from the device asynchronously, possibly associated with a
--   particular stream. The destination host memory must be page-locked.
peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO ()

-- | Copy a 2D array from the device to the host.
peekArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Ptr a -> Int -> Int -> Int -> IO ()

-- | Copy a 2D array from the device to the host asynchronously, possibly
--   associated with a particular execution stream. The destination host
--   memory must be page-locked.
peekArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> HostPtr a -> Int -> Int -> Int -> Maybe Stream -> IO ()

-- | Copy a number of elements from the device into a new Haskell list.
--   Note that this requires two memory copies: firstly from the device
--   into a heap allocated array, and from there marshalled into a list.
peekListArray :: Storable a => Int -> DevicePtr a -> IO [a]

-- | Copy a number of elements onto the device. This is a synchronous
--   operation
pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO ()

-- | Copy memory onto the device asynchronously, possibly associated with a
--   particular stream. The source host memory must be page-locked.
pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO ()

-- | Copy a 2D array from the host to the device.
pokeArray2D :: Storable a => Int -> Int -> Ptr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> IO ()

-- | Copy a 2D array from the host to the device asynchronously, possibly
--   associated with a particular execution stream. The source host memory
--   must be page-locked.
pokeArray2DAsync :: Storable a => Int -> Int -> HostPtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Maybe Stream -> IO ()

-- | Write a list of storable elements into a device array. The device
--   array must be sufficiently large to hold the entire list. This
--   requires two marshalling operations.
pokeListArray :: Storable a => [a] -> DevicePtr a -> IO ()

-- | Copy the given number of elements from the first device array (source)
--   to the second device (destination). The copied areas may not overlap.
--   This operation is asynchronous with respect to the host, but will
--   never overlap with kernel execution.
copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO ()

-- | Copy the given number of elements from the first device array (source)
--   to the second device array (destination). The copied areas may not
--   overlap. The operation is asynchronous with respect to the host, and
--   can be asynchronous to other device operations by associating it with
--   a particular stream.
copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO ()

-- | Copy a 2D array from the first device array (source) to the second
--   device array (destination). The copied areas must not overlap. This
--   operation is asynchronous with respect to the host, but will never
--   overlap with kernel execution.
copyArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> IO ()

-- | Copy a 2D array from the first device array (source) to the second
--   device array (destination). The copied areas may not overlap. The
--   operation is asynchronous with respect to the host, and can be
--   asynchronous to other device operations by associating it with a
--   particular execution stream.
copyArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Maybe Stream -> IO ()

-- | Copies an array from device memory in one context to device memory in
--   another context. Note that this function is asynchronous with respect
--   to the host, but serialised with respect to all pending and future
--   asynchronous work in the source and destination contexts. To avoid
--   this synchronisation, use <a>copyArrayPeerAsync</a> instead.
copyArrayPeer :: Storable a => Int -> DevicePtr a -> Context -> DevicePtr a -> Context -> IO ()

-- | Copies from device memory in one context to device memory in another
--   context. Note that this function is asynchronous with respect to the
--   host and all work in other streams and devices.
copyArrayPeerAsync :: Storable a => Int -> DevicePtr a -> Context -> DevicePtr a -> Context -> Maybe Stream -> IO ()

-- | Write a list of storable elements into a newly allocated device array.
--   This is <a>newListArrayLen</a> composed with <a>fst</a>.
newListArray :: Storable a => [a] -> IO (DevicePtr a)

-- | Write a list of storable elements into a newly allocated device array,
--   returning the device pointer together with the number of elements that
--   were written. Note that this requires two memory copies: firstly from
--   a Haskell list to a heap allocated array, and from there onto the
--   graphics device. The memory should be <a>free</a>d when no longer
--   required.
newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int)

-- | Temporarily store a list of elements into a newly allocated device
--   array. An IO action is applied to to the array, the result of which is
--   returned. Similar to <a>newListArray</a>, this requires copying the
--   data twice.
--   
--   As with <a>allocaArray</a>, the memory is freed once the action
--   completes, so you should not return the pointer from the action, and
--   be wary of asynchronous kernel execution.
withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b

-- | A variant of <a>withListArray</a> which also supplies the number of
--   elements in the array to the applied function
withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b

-- | Set a number of data elements to the specified value, which may be
--   either 8-, 16-, or 32-bits wide.
memset :: Storable a => DevicePtr a -> Int -> a -> IO ()

-- | Set the number of data elements to the specified value, which may be
--   either 8-, 16-, or 32-bits wide. The operation is asynchronous and may
--   optionally be associated with a stream. Requires cuda-3.2.
memsetAsync :: Storable a => DevicePtr a -> Int -> a -> Maybe Stream -> IO ()

-- | Return the device pointer associated with a mapped, pinned host
--   buffer, which was allocated with the <a>DeviceMapped</a> option by
--   <a>mallocHostArray</a>.
--   
--   Currently, no options are supported and this must be empty.
getDevicePtr :: [AllocFlag] -> HostPtr a -> IO (DevicePtr a)

-- | Return the base address and allocation size of the given device
--   pointer
getBasePtr :: DevicePtr a -> IO (DevicePtr a, Int64)

-- | Return the amount of free and total memory respectively available to
--   the current context (bytes)
getMemInfo :: IO (Int64, Int64)
instance Eq AllocFlag
instance Show AllocFlag
instance Eq AttachFlag
instance Show AttachFlag
instance Enum AttachFlag
instance Enum AllocFlag


-- | Texture management for low-level driver interface
module Foreign.CUDA.Driver.Texture

-- | A texture reference
newtype Texture
Texture :: ((Ptr ())) -> Texture
useTexture :: Texture -> ((Ptr ()))

-- | Texture data formats
data Format
Word8 :: Format
Word16 :: Format
Word32 :: Format
Int8 :: Format
Int16 :: Format
Int32 :: Format
Half :: Format
Float :: Format

-- | Texture reference addressing modes
data AddressMode
Wrap :: AddressMode
Clamp :: AddressMode
Mirror :: AddressMode
Border :: AddressMode

-- | Texture reference filtering mode
data FilterMode
Point :: FilterMode
Linear :: FilterMode

-- | Texture read mode options
data ReadMode
ReadAsInteger :: ReadMode
NormalizedCoordinates :: ReadMode
SRGB :: ReadMode

-- | Create a new texture reference. Once created, the application must
--   call <tt>setPtr</tt> to associate the reference with allocated memory.
--   Other texture reference functions are used to specify the format and
--   interpretation to be used when the memory is read through this
--   reference.

-- | <i>Deprecated: as of CUDA version 3.2</i>
create :: IO Texture

-- | Destroy a texture reference

-- | <i>Deprecated: as of CUDA version 3.2</i>
destroy :: Texture -> IO ()

-- | Bind a linear array address of the given size (bytes) as a texture
--   reference. Any previously bound references are unbound.
bind :: Texture -> DevicePtr a -> Int64 -> IO ()

-- | Bind a linear address range to the given texture reference as a
--   two-dimensional arena. Any previously bound reference is unbound. Note
--   that calls to <a>setFormat</a> can not follow a call to <a>bind2D</a>
--   for the same texture reference.
bind2D :: Texture -> Format -> Int -> DevicePtr a -> (Int, Int) -> Int64 -> IO ()

-- | Get the addressing mode used by a texture reference, corresponding to
--   the given dimension (currently the only supported dimension values are
--   0 or 1).
getAddressMode :: Texture -> Int -> IO AddressMode

-- | Get the filtering mode used by a texture reference
getFilterMode :: Texture -> IO FilterMode

-- | Get the data format and number of channel components of the bound
--   texture
getFormat :: Texture -> IO (Format, Int)

-- | Specify the addressing mode for the given dimension of a texture
--   reference
setAddressMode :: Texture -> Int -> AddressMode -> IO ()

-- | Specify the filtering mode to be used when reading memory through a
--   texture reference
setFilterMode :: Texture -> FilterMode -> IO ()

-- | Specify the format of the data and number of packed components per
--   element to be read by the texture reference
setFormat :: Texture -> Format -> Int -> IO ()

-- | Specify additional characteristics for reading and indexing the
--   texture reference
setReadMode :: Texture -> ReadMode -> IO ()
instance Eq Texture
instance Show Texture
instance Eq AddressMode
instance Show AddressMode
instance Eq FilterMode
instance Show FilterMode
instance Eq ReadMode
instance Show ReadMode
instance Eq Format
instance Show Format
instance Enum Format
instance Enum ReadMode
instance Enum FilterMode
instance Enum AddressMode
instance Storable Texture


-- | Module management for low-level driver interface
module Foreign.CUDA.Driver.Module

-- | A reference to a Module object, containing collections of device
--   functions
data Module

-- | Just-in-time compilation options
data JITOption

-- | maximum number of registers per thread
MaxRegisters :: !Int -> JITOption

-- | number of threads per block to target for
ThreadsPerBlock :: !Int -> JITOption

-- | level of optimisation to apply (1-4, default 4)
OptimisationLevel :: !Int -> JITOption

-- | compilation target, otherwise determined from context
Target :: !Compute -> JITOption

-- | fallback strategy if matching cubin not found
FallbackStrategy :: !JITFallback -> JITOption

-- | generate debug info (-g) (requires cuda &gt;= 5.5)
GenerateDebugInfo :: JITOption

-- | generate line number information (-lineinfo) (requires cuda &gt;= 5.5)
GenerateLineInfo :: JITOption

-- | verbose log messages (requires cuda &gt;= 5.5)
Verbose :: JITOption
data JITTarget
Compute10 :: JITTarget
Compute11 :: JITTarget
Compute12 :: JITTarget
Compute13 :: JITTarget
Compute20 :: JITTarget
Compute21 :: JITTarget
Compute30 :: JITTarget
Compute32 :: JITTarget
Compute35 :: JITTarget
Compute37 :: JITTarget
Compute50 :: JITTarget

-- | Results of online compilation
data JITResult
JITResult :: !Float -> !ByteString -> !Module -> JITResult

-- | milliseconds spent compiling PTX
jitTime :: JITResult -> !Float

-- | information about PTX assembly
jitInfoLog :: JITResult -> !ByteString

-- | compilation error log or compiled module
jitModule :: JITResult -> !Module
data JITFallback
PTX :: JITFallback
Binary :: JITFallback

-- | Returns a function handle
getFun :: Module -> String -> IO Fun

-- | Return a global pointer, and size of the global (in bytes)
getPtr :: Module -> String -> IO (DevicePtr a, Int)

-- | Return a handle to a texture reference
getTex :: Module -> String -> IO Texture

-- | Load the contents of the specified file (either a ptx or cubin file)
--   to create a new module, and load that module into the current context.
loadFile :: FilePath -> IO Module

-- | Load the contents of the given image into a new module, and load that
--   module into the current context. The image is (typically) the contents
--   of a cubin or PTX file.
--   
--   Note that the <a>ByteString</a> will be copied into a temporary
--   staging area so that it can be passed to C.
loadData :: ByteString -> IO Module

-- | As <a>loadData</a>, but read the image data from the given pointer.
--   The image is a NULL-terminated sequence of bytes.
loadDataFromPtr :: Ptr Word8 -> IO Module

-- | Load the contents of the given image into a module with online
--   compiler options, and load the module into the current context. The
--   image is (typically) the contents of a cubin or PTX file. The actual
--   attributes of the compiled kernel can be probed using <a>requires</a>.
--   
--   Note that the <a>ByteString</a> will be copied into a temporary
--   staging area so that it can be passed to C.
loadDataEx :: ByteString -> [JITOption] -> IO JITResult

-- | As <a>loadDataEx</a>, but read the image data from the given pointer.
--   The image is a NULL-terminated sequence of bytes.
loadDataFromPtrEx :: Ptr Word8 -> [JITOption] -> IO JITResult

-- | Unload a module from the current context
unload :: Module -> IO ()
instance Eq Module
instance Show Module
instance Show JITResult
instance Eq JITOptionInternal
instance Show JITOptionInternal
instance Eq JITTarget
instance Show JITTarget
instance Eq JITFallback
instance Show JITFallback
instance Show JITOption
instance Enum JITFallback
instance Enum JITTarget
instance Enum JITOptionInternal


-- | Top level bindings to CUDA driver API
module Foreign.CUDA.Driver


-- | Top level bindings to the C-for-CUDA runtime API
module Foreign.CUDA.Runtime


-- | Top level bindings. By default, expose the C-for-CUDA runtime API
--   bindings, as they are slightly more user friendly.
module Foreign.CUDA