-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | FFI binding to the CUDA interface for programming NVIDIA GPUs -- -- The CUDA library provides a direct, general purpose C-like SPMD -- programming model for NVIDIA graphics cards (G8x series onwards). This -- is a collection of bindings to allow you to call and control, although -- not write, such functions from Haskell-land. You will need to install -- the CUDA driver and developer toolkit. -- -- http://developer.nvidia.com/cuda-downloads -- -- The setup script will look for your CUDA installation by checking, in -- the following order: -- --
-- p2 == p1 `plusDevPtr` (p2 `minusDevPtr` p1) -- minusDevPtr :: DevicePtr a -> DevicePtr a -> Int -- | Advance a pointer into a device array by the given number of elements advanceDevPtr :: Storable a => DevicePtr a -> Int -> DevicePtr a -- | A reference to page-locked host memory. -- -- A HostPtr is just a plain Ptr, but the memory has been -- allocated by CUDA into page-locked memory. This means that the data -- can be copied to the GPU via DMA (direct memory access). Note that the -- use of the system function mlock is not sufficient here --- -- the CUDA version ensures that the physical address stays the -- same, not just the virtual address. -- -- To copy data into a HostPtr array, you may use for example -- withHostPtr together with copyArray or moveArray. newtype HostPtr a HostPtr :: Ptr a -> HostPtr a [useHostPtr] :: HostPtr a -> Ptr a -- | Apply an IO action to the memory reference living inside the host -- pointer object. All uses of the pointer should be inside the -- withHostPtr bracket. withHostPtr :: HostPtr a -> (Ptr a -> IO b) -> IO b -- | The constant nullHostPtr contains the distinguished memory -- location that is not associated with a valid memory location nullHostPtr :: HostPtr a -- | Cast a host pointer from one type to another castHostPtr :: HostPtr a -> HostPtr b -- | Advance the pointer address by the given offset in bytes plusHostPtr :: HostPtr a -> Int -> HostPtr a -- | Given an alignment constraint, align the host pointer to the next -- highest address satisfying the constraint alignHostPtr :: HostPtr a -> Int -> HostPtr a -- | Compute the difference between the second and first argument minusHostPtr :: HostPtr a -> HostPtr a -> Int -- | Advance a pointer into a host array by a given number of elements advanceHostPtr :: Storable a => HostPtr a -> Int -> HostPtr a instance GHC.Classes.Ord (Foreign.CUDA.Ptr.DevicePtr a) instance GHC.Classes.Eq (Foreign.CUDA.Ptr.DevicePtr a) instance GHC.Classes.Ord (Foreign.CUDA.Ptr.HostPtr a) instance GHC.Classes.Eq (Foreign.CUDA.Ptr.HostPtr a) instance GHC.Show.Show (Foreign.CUDA.Ptr.HostPtr a) instance Foreign.Storable.Storable (Foreign.CUDA.Ptr.HostPtr a) instance GHC.Show.Show (Foreign.CUDA.Ptr.DevicePtr a) instance Foreign.Storable.Storable (Foreign.CUDA.Ptr.DevicePtr a) -- | Error handling functions module Foreign.CUDA.Runtime.Error -- | Return codes from API functions data Status Success :: Status InvalidValue :: Status MemoryAllocation :: Status InitializationError :: Status CudartUnloading :: Status ProfilerDisabled :: Status ProfilerNotInitialized :: Status ProfilerAlreadyStarted :: Status ProfilerAlreadyStopped :: Status InvalidConfiguration :: Status InvalidPitchValue :: Status InvalidSymbol :: Status InvalidHostPointer :: Status InvalidDevicePointer :: Status InvalidTexture :: Status InvalidTextureBinding :: Status InvalidChannelDescriptor :: Status InvalidMemcpyDirection :: Status AddressOfConstant :: Status TextureFetchFailed :: Status TextureNotBound :: Status SynchronizationError :: Status InvalidFilterSetting :: Status InvalidNormSetting :: Status MixedDeviceExecution :: Status NotYetImplemented :: Status MemoryValueTooLarge :: Status InsufficientDriver :: Status InvalidSurface :: Status DuplicateVariableName :: Status DuplicateTextureName :: Status DuplicateSurfaceName :: Status DevicesUnavailable :: Status IncompatibleDriverContext :: Status MissingConfiguration :: Status PriorLaunchFailure :: Status LaunchMaxDepthExceeded :: Status LaunchFileScopedTex :: Status LaunchFileScopedSurf :: Status
SyncDepthExceeded :: Status LaunchPendingCountExceeded :: Status InvalidDeviceFunction :: Status NoDevice :: Status InvalidDevice :: Status StartupFailure :: Status InvalidKernelImage :: Status DeviceUninitialized :: Status MapBufferObjectFailed :: Status UnmapBufferObjectFailed :: Status ArrayIsMapped :: Status AlreadyMapped :: Status NoKernelImageForDevice :: Status AlreadyAcquired :: Status NotMapped :: Status NotMappedAsArray :: Status NotMappedAsPointer :: Status ECCUncorrectable :: Status UnsupportedLimit :: Status DeviceAlreadyInUse :: Status PeerAccessUnsupported :: Status InvalidPtx :: Status InvalidGraphicsContext :: Status NvlinkUncorrectable :: Status JitCompilerNotFound :: Status InvalidSource :: Status FileNotFound :: Status SharedObjectSymbolNotFound :: Status SharedObjectInitFailed :: Status OperatingSystem :: Status InvalidResourceHandle :: Status IllegalState :: Status SymbolNotFound :: Status NotReady :: Status IllegalAddress :: Status LaunchOutOfResources :: Status LaunchTimeout :: Status LaunchIncompatibleTexturing :: Status PeerAccessAlreadyEnabled :: Status PeerAccessNotEnabled :: Status SetOnActiveProcess :: Status ContextIsDestroyed :: Status Assert :: Status TooManyPeers :: Status HostMemoryAlreadyRegistered :: Status HostMemoryNotRegistered :: Status HardwareStackError :: Status IllegalInstruction :: Status MisalignedAddress :: Status InvalidAddressSpace :: Status InvalidPc :: Status LaunchFailure :: Status CooperativeLaunchTooLarge :: Status NotPermitted :: Status NotSupported :: Status SystemNotReady :: Status SystemDriverMismatch :: Status CompatNotSupportedOnDevice :: Status StreamCaptureUnsupported :: Status StreamCaptureInvalidated :: Status StreamCaptureMerge :: Status StreamCaptureUnmatched :: Status StreamCaptureUnjoined :: Status StreamCaptureIsolation :: Status StreamCaptureImplicit :: Status CapturedEvent :: Status StreamCaptureWrongThread :: Status Timeout :: Status GraphExecUpdateFailure :: Status Unknown :: Status ApiFailureBase :: Status data CUDAException ExitCode :: Status -> CUDAException UserError :: String -> CUDAException -- | Raise a CUDAException in the IO Monad cudaError :: String -> IO a describe :: Describe a => a -> String -- | A specially formatted error message requireSDK :: Name -> Double -> IO a -- | Return the results of a function on successful execution, otherwise -- throw an exception with the error string associated with the return -- code resultIfOk :: (Status, a) -> IO a -- | Throw an exception with the error string associated with an -- unsuccessful return code, otherwise return unit. nothingIfOk :: Status -> IO () checkStatus :: CInt -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Error.Status instance GHC.Classes.Eq Foreign.CUDA.Runtime.Error.Status instance GHC.Exception.Type.Exception Foreign.CUDA.Runtime.Error.CUDAException instance GHC.Show.Show Foreign.CUDA.Runtime.Error.CUDAException instance GHC.Enum.Enum Foreign.CUDA.Runtime.Error.Status instance Text.Show.Describe.Describe Foreign.CUDA.Runtime.Error.Status -- | Utility functions module Foreign.CUDA.Runtime.Utils -- | Return the version number of the installed CUDA runtime runtimeVersion :: IO Int -- | Return the version number of the installed CUDA driver driverVersion :: IO Int -- | Return the version number of the CUDA library (API) that this package -- was compiled against.
libraryVersion :: Int -- | Texture references module Foreign.CUDA.Runtime.Texture data Texture Texture :: !Bool -> !FilterMode -> !(AddressMode, AddressMode, AddressMode) -> !FormatDesc -> Texture -- | access texture using normalised coordinates [0.0,1.0) [normalised] :: Texture -> !Bool [filtering] :: Texture -> !FilterMode [addressing] :: Texture -> !(AddressMode, AddressMode, AddressMode) [format] :: Texture -> !FormatDesc -- | Texture channel format kind data FormatKind Signed :: FormatKind Unsigned :: FormatKind Float :: FormatKind None :: FormatKind data AddressMode Wrap :: AddressMode Clamp :: AddressMode Mirror :: AddressMode Border :: AddressMode data FilterMode Point :: FilterMode Linear :: FilterMode -- | A description of how memory read through the texture cache should be -- interpreted, including the kind of data and the number of bits of each -- component (x,y,z and w, respectively). data FormatDesc FormatDesc :: !(Int, Int, Int, Int) -> !FormatKind -> FormatDesc [depth] :: FormatDesc -> !(Int, Int, Int, Int) [kind] :: FormatDesc -> !FormatKind -- | Bind the memory area associated with the device pointer to a texture -- reference given by the named symbol. Any previously bound references -- are unbound. bind :: String -> Texture -> DevicePtr a -> Int64 -> IO () -- | Bind the two-dimensional memory area to the texture reference -- associated with the given symbol. The size of the area is constrained -- by (width,height) in texel units, and the row pitch in bytes. Any -- previously bound references are unbound. bind2D :: String -> Texture -> DevicePtr a -> (Int, Int) -> Int64 -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FormatKind instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FormatKind instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FilterMode instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FilterMode instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.FormatDesc instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.FormatDesc instance GHC.Show.Show Foreign.CUDA.Runtime.Texture.Texture instance GHC.Classes.Eq Foreign.CUDA.Runtime.Texture.Texture instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Texture.Texture instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Texture.FormatDesc instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.FilterMode instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.AddressMode instance GHC.Enum.Enum Foreign.CUDA.Runtime.Texture.FormatKind -- | Error handling module Foreign.CUDA.Driver.Error data Status Success :: Status InvalidValue :: Status OutOfMemory :: Status NotInitialized :: Status Deinitialized :: Status ProfilerDisabled :: Status ProfilerNotInitialized :: Status ProfilerAlreadyStarted :: Status ProfilerAlreadyStopped :: Status NoDevice :: Status InvalidDevice :: Status InvalidImage :: Status InvalidContext :: Status ContextAlreadyCurrent :: Status MapFailed :: Status UnmapFailed :: Status ArrayIsMapped :: Status AlreadyMapped :: Status NoBinaryForGPU :: Status AlreadyAcquired :: Status NotMapped :: Status NotMappedAsArray :: Status NotMappedAsPointer :: Status EccUncorrectable :: Status UnsupportedLimit :: Status ContextAlreadyInUse :: Status PeerAccessUnsupported :: Status InvalidPTX :: Status InvalidGraphicsContext :: Status NvlinkUncorrectable :: Status JitCompilerNotFound :: Status InvalidSource :: Status FileNotFound :: Status SharedObjectSymbolNotFound :: Status 
SharedObjectInitFailed :: Status OperatingSystem :: Status InvalidHandle :: Status IllegalState :: Status NotFound :: Status NotReady :: Status IllegalAddress :: Status LaunchOutOfResources :: Status LaunchTimeout :: Status LaunchIncompatibleTexturing :: Status PeerAccessAlreadyEnabled :: Status PeerAccessNotEnabled :: Status PrimaryContextActive :: Status ContextIsDestroyed :: Status Assert :: Status TooManyPeers :: Status HostMemoryAlreadyRegistered :: Status HostMemoryNotRegistered :: Status HardwareStackError :: Status IllegalInstruction :: Status MisalignedAddress :: Status InvalidAddressSpace :: Status InvalidPC :: Status LaunchFailed :: Status CooperativeLaunchTooLarge :: Status NotPermitted :: Status NotSupported :: Status SystemNotReady :: Status SystemDriverMismatch :: Status CompatNotSupportedOnDevice :: Status StreamCaptureUnsupported :: Status StreamCaptureInvalidated :: Status StreamCaptureMerge :: Status StreamCaptureUnmatched :: Status StreamCaptureUnjoined :: Status StreamCaptureIsolation :: Status StreamCaptureImplicit :: Status CapturedEvent :: Status StreamCaptureWrongThread :: Status Timeout :: Status GraphExecUpdateFailure :: Status Unknown :: Status data CUDAException ExitCode :: Status -> CUDAException UserError :: String -> CUDAException describe :: Describe a => a -> String -- | Raise a CUDAException. Exceptions can be thrown from pure code, but -- can only be caught in the IO monad. cudaError :: String -> a -- | Raise a CUDAException in the IO Monad cudaErrorIO :: String -> IO a -- | A specially formatted error message requireSDK :: Name -> Double -> a -- | Return the results of a function on successful execution, otherwise -- throw an exception with an error string associated with the return -- code resultIfOk :: (Status, a) -> IO a -- | Throw an exception with an error string associated with an -- unsuccessful return code, otherwise return unit. nothingIfOk :: Status -> IO () checkStatus :: CInt -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Error.Status instance GHC.Classes.Eq Foreign.CUDA.Driver.Error.Status instance Text.Show.Describe.Describe Foreign.CUDA.Driver.Error.Status instance GHC.Exception.Type.Exception Foreign.CUDA.Driver.Error.CUDAException instance GHC.Show.Show Foreign.CUDA.Driver.Error.CUDAException instance GHC.Enum.Enum Foreign.CUDA.Driver.Error.Status -- | Utility functions module Foreign.CUDA.Driver.Utils -- | Return the version number of the installed CUDA driver. driverVersion :: IO Int -- | Return the version number of the CUDA library (API) that this package -- was compiled against. libraryVersion :: Int -- | Profiler control for low-level driver interface module Foreign.CUDA.Driver.Profiler -- | Profiler output mode data OutputMode KeyValuePair :: OutputMode CSV :: OutputMode -- | Initialise the CUDA profiler. -- -- The configuration file is used to specify profiling options and -- profiling counters. Refer to the "Compute Command Line Profiler User -- Guide" for supported profiler options and counters. -- -- Note that the CUDA profiler can not be initialised with this function -- if another profiling tool is already active. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PROFILER.html#group__CUDA__PROFILER initialise :: FilePath -> FilePath -> OutputMode -> IO () -- | Begin profiling collection by the active profiling tool for the -- current context. If profiling is already enabled, then this has no -- effect. 
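--
-- For example, a minimal sketch of restricting collection to a single
-- region of interest; the action argument is a placeholder for whatever
-- driver calls are to be measured:
--
-- > profiledRegion :: IO a -> IO a
-- > profiledRegion action = do
-- >   start                 -- begin collection for the current context
-- >   r <- action           -- only work submitted here is profiled
-- >   stop                  -- flush pending events to the output file
-- >   return r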
-- -- start and stop can be used to programmatically control -- profiling granularity, by allowing profiling to be done only on -- selected pieces of code. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PROFILER.html#group__CUDA__PROFILER_1g8a5314de2292c2efac83ac7fcfa9190e start :: IO () -- | Stop profiling collection by the active profiling tool for the current -- context, and force all pending profiler events to be written to the -- output file. If profiling is already inactive, this has no effect. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PROFILER.html#group__CUDA__PROFILER_1g4d8edef6174fd90165e6ac838f320a5f stop :: IO () instance GHC.Show.Show Foreign.CUDA.Driver.Profiler.OutputMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Profiler.OutputMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Profiler.OutputMode -- | Common device functions module Foreign.CUDA.Analysis.Device -- | GPU compute capability, major and minor revision number respectively. data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode Default :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Int -> !Bool -> !Int -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Maximum number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Maximum number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Maximum size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Maximum pitch in bytes allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] ::
DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines [asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete [integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool -- | Device supports stream priorities [streamPriorities] :: DeviceProperties -> !Bool -- | Device supports caching globals in L1 cache [globalL1Cache] :: DeviceProperties -> !Bool -- | Device supports caching locals in L1 cache [localL1Cache] :: DeviceProperties -> !Bool -- | Device supports allocating managed memory on this system [managedMemory] :: DeviceProperties -> !Bool -- | Device is on a multi-GPU board [multiGPUBoard] :: DeviceProperties -> !Bool -- | Unique identifier for a group of devices associated with the same -- board [multiGPUBoardGroupID] :: DeviceProperties -> !Int -- | Device supports compute pre-emption [preemption] :: DeviceProperties -> !Bool -- | Ratio of single precision performance (in floating-point operations -- per second) to double precision performance [singleToDoublePerfRatio] :: DeviceProperties -> !Int -- | Device supports launching cooperative kernels [cooperativeLaunch] :: DeviceProperties -> !Bool -- | Device can participate in cooperative multi-device kernels [cooperativeLaunchMultiDevice] :: DeviceProperties -> !Bool data DeviceResources DeviceResources :: !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Int -> !Allocation -> !Int -> !Int -> !Int -> !Int -> !Int -> DeviceResources -- | Warp size [threadsPerWarp] :: DeviceResources -> !Int -- | Number of SIMD arithmetic units per multiprocessor [coresPerMP] :: DeviceResources -> !Int -- | Maximum number of in-flight warps per multiprocessor [warpsPerMP] :: DeviceResources -> !Int -- | Maximum number of in-flight threads on a multiprocessor [threadsPerMP] :: DeviceResources -> !Int -- | Maximum number of thread blocks resident on a multiprocessor [threadBlocksPerMP] :: DeviceResources -> !Int -- | Total amount of shared memory per multiprocessor (bytes) [sharedMemPerMP] :: DeviceResources -> !Int -- | Maximum amount of shared memory per thread block (bytes) [maxSharedMemPerBlock] :: DeviceResources -> !Int -- | Total number of registers in a multiprocessor [regFileSizePerMP] :: DeviceResources -> !Int -- | Maximum number of registers per block [maxRegPerBlock] :: DeviceResources -> !Int -- | Register allocation unit size [regAllocUnit] :: DeviceResources -> !Int -- | How multiprocessor resources are divided (register allocation -- granularity) [regAllocationStyle] :: DeviceResources -> !Allocation -- | Maximum number of registers per thread [maxRegPerThread] :: DeviceResources -> !Int -- | Shared memory allocation unit size (bytes) [sharedMemAllocUnit] :: DeviceResources -> !Int -- | Warp allocation granularity [warpAllocUnit] :: DeviceResources -> !Int 
-- | Warp register allocation granularity [warpRegAllocUnit] :: DeviceResources -> !Int -- | Maximum number of resident grids per device (concurrent kernels) [maxGridsPerDevice] :: DeviceResources -> !Int data Allocation Warp :: Allocation Block :: Allocation data PCI PCI :: !Int -> !Int -> !Int -> PCI -- | PCI bus ID of the device [busID] :: PCI -> !Int -- | PCI device ID [deviceID] :: PCI -> !Int -- | PCI domain ID [domainID] :: PCI -> !Int -- | Extract some additional hardware resource limitations for a given -- device. deviceResources :: DeviceProperties -> DeviceResources describe :: Describe a => a -> String instance GHC.Show.Show Foreign.CUDA.Analysis.Device.ComputeMode instance GHC.Classes.Eq Foreign.CUDA.Analysis.Device.ComputeMode instance GHC.Classes.Eq Foreign.CUDA.Analysis.Device.Compute instance GHC.Show.Show Foreign.CUDA.Analysis.Device.PCI instance GHC.Show.Show Foreign.CUDA.Analysis.Device.DeviceProperties instance GHC.Show.Show Foreign.CUDA.Analysis.Device.Allocation instance GHC.Show.Show Foreign.CUDA.Analysis.Device.DeviceResources instance GHC.Show.Show Foreign.CUDA.Analysis.Device.Compute instance GHC.Classes.Ord Foreign.CUDA.Analysis.Device.Compute instance GHC.Enum.Enum Foreign.CUDA.Analysis.Device.ComputeMode instance Text.Show.Describe.Describe Foreign.CUDA.Analysis.Device.ComputeMode -- | Device management routines module Foreign.CUDA.Runtime.Device -- | A device identifier type Device = Int -- | Device execution flags data DeviceFlag ScheduleAuto :: DeviceFlag ScheduleSpin :: DeviceFlag ScheduleYield :: DeviceFlag BlockingSync :: DeviceFlag MapHost :: DeviceFlag LMemResizeToMax :: DeviceFlag -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Int -> !Bool -> !Int -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Maximum number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Maximum number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Maximum size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Maximum pitch in bytes 
allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] :: DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines [asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete [integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool -- | Device supports stream priorities [streamPriorities] :: DeviceProperties -> !Bool -- | Device supports caching globals in L1 cache [globalL1Cache] :: DeviceProperties -> !Bool -- | Device supports caching locals in L1 cache [localL1Cache] :: DeviceProperties -> !Bool -- | Device supports allocating managed memory on this system [managedMemory] :: DeviceProperties -> !Bool -- | Device is on a multi-GPU board [multiGPUBoard] :: DeviceProperties -> !Bool -- | Unique identifier for a group of devices associated with the same -- board [multiGPUBoardGroupID] :: DeviceProperties -> !Int -- | Device supports compute pre-emption [preemption] :: DeviceProperties -> !Bool -- | Ratio of single precision performance (in floating-point operations -- per second) to double precision performance [singleToDoublePerfRatio] :: DeviceProperties -> !Int -- | Device supports launching cooperative kernels [cooperativeLaunch] :: DeviceProperties -> !Bool -- | Device can participate in cooperative multi-device kernels [cooperativeLaunchMultiDevice] :: DeviceProperties -> !Bool -- | GPU compute capability, major and minor revision number respectively. data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode Default :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | Select the compute device which best matches the given criteria choose :: DeviceProperties -> IO Device -- | Returns which device is currently being used get :: IO Device -- | Returns the number of devices available for execution, with compute -- capability >= 1.0 count :: IO Int -- | Return information about the selected compute device props :: Device -> IO DeviceProperties -- | Set device to be used for GPU execution set :: Device -> IO () -- | Set flags to be used for device executions setFlags :: [DeviceFlag] -> IO () -- | Set list of devices for CUDA execution in priority order setOrder :: [Device] -> IO () -- | Explicitly destroys and cleans up all runtime resources associated -- with the current device in the current process. Any subsequent API -- call will reinitialise the device. 
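--
-- A minimal sketch of an orderly shutdown, using sync from this module
-- to drain outstanding work before the reset (whether to wait first
-- depends on the application):
--
-- > shutdown :: IO ()
-- > shutdown = sync >> reset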
-- -- Note that this function will reset the device immediately. It is the -- caller’s responsibility to ensure that the device is not being -- accessed by any other host threads from the process when this function -- is called. reset :: IO () -- | Block until the device has completed all preceding requested tasks. -- Returns an error if one of the tasks fails. sync :: IO () -- | Possible option values for direct peer memory access data PeerFlag -- | Queries if the first device can directly access the memory of the -- second. If direct access is possible, it can then be enabled with -- add. Requires cuda-4.0. accessible :: Device -> Device -> IO Bool -- | If the devices of both the current and supplied contexts support -- unified addressing, then enable allocations in the supplied context to -- be accessible by the current context. Requires cuda-4.0. add :: Device -> [PeerFlag] -> IO () -- | Disable direct memory access from the current context to the supplied -- context. Requires cuda-4.0. remove :: Device -> IO () -- | Device limit flags data Limit Stacksize :: Limit Printffifosize :: Limit Mallocheapsize :: Limit Devruntimesyncdepth :: Limit Devruntimependinglaunchcount :: Limit Maxl2fetchgranularity :: Limit -- | Query compute 2.0 call stack limits. Requires cuda-3.1. getLimit :: Limit -> IO Int -- | Set compute 2.0 call stack limits. Requires cuda-3.1. setLimit :: Limit -> Int -> IO () instance GHC.Enum.Bounded Foreign.CUDA.Runtime.Device.DeviceFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Device.DeviceFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Device.DeviceFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Device.Limit instance GHC.Classes.Eq Foreign.CUDA.Runtime.Device.Limit instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.Limit instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.PeerFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Device.DeviceFlag instance Foreign.Storable.Storable Foreign.CUDA.Analysis.Device.DeviceProperties -- | Module loading for low-level driver interface module Foreign.CUDA.Driver.Module.Base -- | A reference to a Module object, containing collections of device -- functions newtype Module Module :: Ptr () -> Module [useModule] :: Module -> Ptr () -- | Just-in-time compilation and linking options data JITOption -- | maximum number of registers per thread MaxRegisters :: !Int -> JITOption -- | number of threads per block to target for ThreadsPerBlock :: !Int -> JITOption -- | level of optimisation to apply (1-4, default 4) OptimisationLevel :: !Int -> JITOption -- | compilation target, otherwise determined from context Target :: !Compute -> JITOption -- | fallback strategy if matching cubin not found FallbackStrategy :: !JITFallback -> JITOption -- | generate debug info (-g) (requires cuda >= 5.5) GenerateDebugInfo :: JITOption -- | generate line number information (-lineinfo) (requires cuda >= 5.5) GenerateLineInfo :: JITOption -- | verbose log messages (requires cuda >= 5.5) Verbose :: JITOption -- | Online compilation target architecture data JITTarget Compute20 :: JITTarget Compute21 :: JITTarget Compute30 :: JITTarget Compute32 :: JITTarget Compute35 :: JITTarget Compute37 :: JITTarget Compute50 :: JITTarget Compute52 :: JITTarget Compute53 :: JITTarget Compute60 :: JITTarget Compute61 :: JITTarget Compute62 :: JITTarget Compute70 :: JITTarget Compute72 :: JITTarget Compute75 :: JITTarget -- | Results of online compilation data JITResult JITResult :: !Float -> !ByteString -> !Module -> JITResult -- | milliseconds spent compiling PTX [jitTime] :: 
JITResult -> !Float -- | information about PTX assembly [jitInfoLog] :: JITResult -> !ByteString -- | the compiled module [jitModule] :: JITResult -> !Module -- | Online compilation fallback strategy data JITFallback PreferPTX :: JITFallback PreferBinary :: JITFallback -- | Device code formats that can be used for online linking data JITInputType Cubin :: JITInputType PTX :: JITInputType Fatbinary :: JITInputType Object :: JITInputType Library :: JITInputType CuJitNumInputTypes :: JITInputType -- | Load the contents of the specified file (either a ptx or cubin file) -- to create a new module, and load that module into the current context. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g366093bd269dafd0af21f1c7d18115d3 loadFile :: FilePath -> IO Module -- | Load the contents of the given image into a new module, and load that -- module into the current context. The image is (typically) the contents -- of a cubin or PTX file. -- -- Note that the ByteString will be copied into a temporary -- staging area so that it can be passed to C. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g04ce266ce03720f479eab76136b90c0b loadData :: ByteString -> IO Module -- | As loadData, but read the image data from the given pointer. -- The image is a NULL-terminated sequence of bytes. loadDataFromPtr :: Ptr Word8 -> IO Module -- | Load the contents of the given image into a module with online -- compiler options, and load the module into the current context. The -- image is (typically) the contents of a cubin or PTX file. The actual -- attributes of the compiled kernel can be probed using -- requires. -- -- Note that the ByteString will be copied into a temporary -- staging area so that it can be passed to C. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g9e8047e9dbf725f0cd7cafd18bfd4d12 loadDataEx :: ByteString -> [JITOption] -> IO JITResult -- | As loadDataEx, but read the image data from the given pointer. -- The image is a NULL-terminated sequence of bytes. loadDataFromPtrEx :: Ptr Word8 -> [JITOption] -> IO JITResult -- | Unload a module from the current context. 
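--
-- For example, a sketch that scopes a module over an action using
-- bracket from Control.Exception, so the module is unloaded even if the
-- action throws (the file name is hypothetical):
--
-- > withModule :: FilePath -> (Module -> IO a) -> IO a
-- > withModule path = bracket (loadFile path) unload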
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b unload :: Module -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.Module instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.Module instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITResult instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITTarget instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.JITTarget instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITFallback instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.JITFallback instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITOption instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITInputType instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.JITInputType instance GHC.Show.Show Foreign.CUDA.Driver.Module.Base.JITOptionInternal instance GHC.Classes.Eq Foreign.CUDA.Driver.Module.Base.JITOptionInternal instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.Base.JITOptionInternal instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.Base.JITInputType instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.Base.JITFallback instance GHC.Enum.Enum Foreign.CUDA.Driver.Module.Base.JITTarget -- | Module linking for low-level driver interface -- -- Since CUDA-5.5 module Foreign.CUDA.Driver.Module.Link -- | A pending JIT linker state data LinkState -- | Just-in-time compilation and linking options data JITOption -- | maximum number of registers per thread MaxRegisters :: !Int -> JITOption -- | number of threads per block to target for ThreadsPerBlock :: !Int -> JITOption -- | level of optimisation to apply (1-4, default 4) OptimisationLevel :: !Int -> JITOption -- | compilation target, otherwise determined from context Target :: !Compute -> JITOption -- | fallback strategy if matching cubin not found FallbackStrategy :: !JITFallback -> JITOption -- | generate debug info (-g) (requires cuda >= 5.5) GenerateDebugInfo :: JITOption -- | generate line number information (-lineinfo) (requires cuda >= 5.5) GenerateLineInfo :: JITOption -- | verbose log messages (requires cuda >= 5.5) Verbose :: JITOption -- | Device code formats that can be used for online linking data JITInputType Cubin :: JITInputType PTX :: JITInputType Fatbinary :: JITInputType Object :: JITInputType Library :: JITInputType CuJitNumInputTypes :: JITInputType -- | Create a pending JIT linker invocation. The returned LinkState -- should be destroyed once no longer needed. The device code -- machine size will match the calling application. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g86ca4052a2fab369cb943523908aa80d create :: [JITOption] -> IO LinkState -- | Destroy the state of a JIT linker invocation. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g01b7ae2a34047b05716969af245ce2d9 destroy :: LinkState -> IO () -- | Complete a pending linker invocation and load the current module. The -- link state will be destroyed. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g818fcd84a4150a997c0bba76fef4e716 complete :: LinkState -> IO Module -- | Add an input file to a pending linker invocation. -- -- Requires CUDA-5.5. 
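--
-- For example, a sketch of a complete link sequence; the input file
-- names are hypothetical:
--
-- > linkKernels :: IO Module
-- > linkKernels = do
-- >   ls <- create []
-- >   addFile ls "kernels.ptx" PTX []
-- >   addFile ls "support.cubin" Cubin []
-- >   complete ls             -- also destroys the link state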
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g1224c0fd48d4a683f3ce19997f200a8c addFile :: LinkState -> FilePath -> JITInputType -> [JITOption] -> IO () -- | Add an input to a pending linker invocation. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g3ebcd2ccb772ba9c120937a2d2831b77 addData :: LinkState -> ByteString -> JITInputType -> [JITOption] -> IO () -- | As addData, but read the specified number of bytes of image -- data from the given pointer. addDataFromPtr :: LinkState -> Int -> Ptr Word8 -> JITInputType -> [JITOption] -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Module.Link.LinkState -- | Device management for low-level driver interface module Foreign.CUDA.Driver.Device -- | A CUDA device newtype Device Device :: CInt -> Device [useDevice] :: Device -> CInt -- | The properties of a compute device data DeviceProperties DeviceProperties :: !String -> !Compute -> !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> !Int -> !Int -> !(Int, Int, Int) -> !(Int, Int, Int) -> !Int -> !(Int, Int) -> !(Int, Int, Int) -> !Int -> !Int -> !Int64 -> !Int -> !Int -> !Int64 -> !ComputeMode -> !Bool -> !Bool -> !Bool -> !Int -> !Int -> !PCI -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Bool -> !Int -> !Bool -> !Int -> !Bool -> !Bool -> DeviceProperties -- | Identifier [deviceName] :: DeviceProperties -> !String -- | Supported compute capability [computeCapability] :: DeviceProperties -> !Compute -- | Available global memory on the device in bytes [totalGlobalMem] :: DeviceProperties -> !Int64 -- | Available constant memory on the device in bytes [totalConstMem] :: DeviceProperties -> !Int64 -- | Available shared memory per block in bytes [sharedMemPerBlock] :: DeviceProperties -> !Int64 -- | 32-bit registers per block [regsPerBlock] :: DeviceProperties -> !Int -- | Warp size in threads (SIMD width) [warpSize] :: DeviceProperties -> !Int -- | Maximum number of threads per block [maxThreadsPerBlock] :: DeviceProperties -> !Int -- | Maximum number of threads per multiprocessor [maxThreadsPerMultiProcessor] :: DeviceProperties -> !Int -- | Maximum size of each dimension of a block [maxBlockSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum size of each dimension of a grid [maxGridSize] :: DeviceProperties -> !(Int, Int, Int) -- | Maximum texture dimensions [maxTextureDim1D] :: DeviceProperties -> !Int [maxTextureDim2D] :: DeviceProperties -> !(Int, Int) [maxTextureDim3D] :: DeviceProperties -> !(Int, Int, Int) -- | Clock frequency in kilohertz [clockRate] :: DeviceProperties -> !Int -- | Number of multiprocessors on the device [multiProcessorCount] :: DeviceProperties -> !Int -- | Maximum pitch in bytes allowed by memory copies [memPitch] :: DeviceProperties -> !Int64 -- | Global memory bus width in bits [memBusWidth] :: DeviceProperties -> !Int -- | Peak memory clock frequency in kilohertz [memClockRate] :: DeviceProperties -> !Int -- | Alignment requirement for textures [textureAlignment] :: DeviceProperties -> !Int64 [computeMode] :: DeviceProperties -> !ComputeMode -- | Device can concurrently copy memory and execute a kernel [deviceOverlap] :: DeviceProperties -> !Bool -- | Device can possibly execute multiple kernels concurrently [concurrentKernels] :: DeviceProperties -> !Bool -- | Device supports and has enabled error correction [eccEnabled] :: DeviceProperties -> !Bool -- | Number of asynchronous engines 
[asyncEngineCount] :: DeviceProperties -> !Int -- | Size of the L2 cache in bytes [cacheMemL2] :: DeviceProperties -> !Int -- | PCI device information for the device [pciInfo] :: DeviceProperties -> !PCI -- | Whether this is a Tesla device using the TCC driver [tccDriverEnabled] :: DeviceProperties -> !Bool -- | Whether there is a runtime limit on kernels [kernelExecTimeoutEnabled] :: DeviceProperties -> !Bool -- | As opposed to discrete [integrated] :: DeviceProperties -> !Bool -- | Device can use pinned memory [canMapHostMemory] :: DeviceProperties -> !Bool -- | Device shares a unified address space with the host [unifiedAddressing] :: DeviceProperties -> !Bool -- | Device supports stream priorities [streamPriorities] :: DeviceProperties -> !Bool -- | Device supports caching globals in L1 cache [globalL1Cache] :: DeviceProperties -> !Bool -- | Device supports caching locals in L1 cache [localL1Cache] :: DeviceProperties -> !Bool -- | Device supports allocating managed memory on this system [managedMemory] :: DeviceProperties -> !Bool -- | Device is on a multi-GPU board [multiGPUBoard] :: DeviceProperties -> !Bool -- | Unique identifier for a group of devices associated with the same -- board [multiGPUBoardGroupID] :: DeviceProperties -> !Int -- | Device supports compute pre-emption [preemption] :: DeviceProperties -> !Bool -- | Ratio of single precision performance (in floating-point operations -- per second) to double precision performance [singleToDoublePerfRatio] :: DeviceProperties -> !Int -- | Device supports launching cooperative kernels [cooperativeLaunch] :: DeviceProperties -> !Bool -- | Device can participate in cooperative multi-device kernels [cooperativeLaunchMultiDevice] :: DeviceProperties -> !Bool -- | Device attributes data DeviceAttribute MaxThreadsPerBlock :: DeviceAttribute MaxBlockDimX :: DeviceAttribute MaxBlockDimY :: DeviceAttribute MaxBlockDimZ :: DeviceAttribute MaxGridDimX :: DeviceAttribute MaxGridDimY :: DeviceAttribute MaxGridDimZ :: DeviceAttribute MaxSharedMemoryPerBlock :: DeviceAttribute SharedMemoryPerBlock :: DeviceAttribute TotalConstantMemory :: DeviceAttribute WarpSize :: DeviceAttribute MaxPitch :: DeviceAttribute MaxRegistersPerBlock :: DeviceAttribute RegistersPerBlock :: DeviceAttribute ClockRate :: DeviceAttribute TextureAlignment :: DeviceAttribute GpuOverlap :: DeviceAttribute MultiprocessorCount :: DeviceAttribute KernelExecTimeout :: DeviceAttribute Integrated :: DeviceAttribute CanMapHostMemory :: DeviceAttribute ComputeMode :: DeviceAttribute MaximumTexture1dWidth :: DeviceAttribute MaximumTexture2dWidth :: DeviceAttribute MaximumTexture2dHeight :: DeviceAttribute MaximumTexture3dWidth :: DeviceAttribute MaximumTexture3dHeight :: DeviceAttribute MaximumTexture3dDepth :: DeviceAttribute MaximumTexture2dLayeredWidth :: DeviceAttribute MaximumTexture2dArrayWidth :: DeviceAttribute MaximumTexture2dLayeredHeight :: DeviceAttribute MaximumTexture2dArrayHeight :: DeviceAttribute MaximumTexture2dLayeredLayers :: DeviceAttribute MaximumTexture2dArrayNumslices :: DeviceAttribute SurfaceAlignment :: DeviceAttribute ConcurrentKernels :: DeviceAttribute EccEnabled :: DeviceAttribute PciBusId :: DeviceAttribute PciDeviceId :: DeviceAttribute TccDriver :: DeviceAttribute MemoryClockRate :: DeviceAttribute GlobalMemoryBusWidth :: DeviceAttribute L2CacheSize :: DeviceAttribute MaxThreadsPerMultiprocessor :: DeviceAttribute AsyncEngineCount :: DeviceAttribute UnifiedAddressing :: DeviceAttribute MaximumTexture1dLayeredWidth :: DeviceAttribute 
MaximumTexture1dLayeredLayers :: DeviceAttribute CanTex2dGather :: DeviceAttribute MaximumTexture2dGatherWidth :: DeviceAttribute MaximumTexture2dGatherHeight :: DeviceAttribute MaximumTexture3dWidthAlternate :: DeviceAttribute MaximumTexture3dHeightAlternate :: DeviceAttribute MaximumTexture3dDepthAlternate :: DeviceAttribute PciDomainId :: DeviceAttribute TexturePitchAlignment :: DeviceAttribute MaximumTexturecubemapWidth :: DeviceAttribute MaximumTexturecubemapLayeredWidth :: DeviceAttribute MaximumTexturecubemapLayeredLayers :: DeviceAttribute MaximumSurface1dWidth :: DeviceAttribute MaximumSurface2dWidth :: DeviceAttribute MaximumSurface2dHeight :: DeviceAttribute MaximumSurface3dWidth :: DeviceAttribute MaximumSurface3dHeight :: DeviceAttribute MaximumSurface3dDepth :: DeviceAttribute MaximumSurface1dLayeredWidth :: DeviceAttribute MaximumSurface1dLayeredLayers :: DeviceAttribute MaximumSurface2dLayeredWidth :: DeviceAttribute MaximumSurface2dLayeredHeight :: DeviceAttribute MaximumSurface2dLayeredLayers :: DeviceAttribute MaximumSurfacecubemapWidth :: DeviceAttribute MaximumSurfacecubemapLayeredWidth :: DeviceAttribute MaximumSurfacecubemapLayeredLayers :: DeviceAttribute MaximumTexture1dLinearWidth :: DeviceAttribute MaximumTexture2dLinearWidth :: DeviceAttribute MaximumTexture2dLinearHeight :: DeviceAttribute MaximumTexture2dLinearPitch :: DeviceAttribute MaximumTexture2dMipmappedWidth :: DeviceAttribute MaximumTexture2dMipmappedHeight :: DeviceAttribute ComputeCapabilityMajor :: DeviceAttribute ComputeCapabilityMinor :: DeviceAttribute MaximumTexture1dMipmappedWidth :: DeviceAttribute StreamPrioritiesSupported :: DeviceAttribute GlobalL1CacheSupported :: DeviceAttribute LocalL1CacheSupported :: DeviceAttribute MaxSharedMemoryPerMultiprocessor :: DeviceAttribute MaxRegistersPerMultiprocessor :: DeviceAttribute ManagedMemory :: DeviceAttribute MultiGpuBoard :: DeviceAttribute MultiGpuBoardGroupId :: DeviceAttribute HostNativeAtomicSupported :: DeviceAttribute SingleToDoublePrecisionPerfRatio :: DeviceAttribute PageableMemoryAccess :: DeviceAttribute ConcurrentManagedAccess :: DeviceAttribute ComputePreemptionSupported :: DeviceAttribute CanUseHostPointerForRegisteredMem :: DeviceAttribute CanUseStreamMemOps :: DeviceAttribute CanUse64BitStreamMemOps :: DeviceAttribute CanUseStreamWaitValueNor :: DeviceAttribute CooperativeLaunch :: DeviceAttribute CooperativeMultiDeviceLaunch :: DeviceAttribute MaxSharedMemoryPerBlockOptin :: DeviceAttribute CanFlushRemoteWrites :: DeviceAttribute HostRegisterSupported :: DeviceAttribute PageableMemoryAccessUsesHostPageTables :: DeviceAttribute DirectManagedMemAccessFromHost :: DeviceAttribute VirtualAddressManagementSupported :: DeviceAttribute HandleTypePosixFileDescriptorSupported :: DeviceAttribute HandleTypeWin32HandleSupported :: DeviceAttribute HandleTypeWin32KmtHandleSupported :: DeviceAttribute CU_DEVICE_ATTRIBUTE_MAX :: DeviceAttribute -- | GPU compute capability, major and minor revision number respectively. data Compute Compute :: !Int -> !Int -> Compute -- | The compute mode the device is currently in data ComputeMode Default :: ComputeMode Prohibited :: ComputeMode ExclusiveProcess :: ComputeMode -- | Possible option flags for CUDA initialisation. Dummy instance until -- the API exports actual option values. data InitFlag -- | Initialise the CUDA driver API. This must be called before any other -- driver function. 
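--
-- For example, a typical bootstrap sequence (a sketch; it assumes at
-- least one device is present):
--
-- > main :: IO ()
-- > main = do
-- >   initialise []
-- >   n   <- count
-- >   dev <- device 0
-- >   nm  <- name dev
-- >   putStrLn (nm ++ " is one of " ++ show n ++ " CUDA devices")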
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__INITIALIZE.html#group__CUDA__INITIALIZE_1g0a2f1517e1bd8502c7194c3a8c134bc3 initialise :: [InitFlag] -> IO () -- | Return the compute capability revision supported by the device capability :: Device -> IO Compute -- | Return a handle to the compute device at the given ordinal. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g8bdd1cc7201304b01357b8034f6587cb device :: Int -> IO Device -- | Return the selected attribute for the given device. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g9c3e1414f0ad901d3278a4d6645fc266 attribute :: Device -> DeviceAttribute -> IO Int -- | Return the number of devices with compute capability >= 1.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g52b5ce05cb8c5fb6831b2c0ff2887c74 count :: IO Int -- | The identifying name of the device. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1gef75aa30df95446a845f2a7b9fffbb7f name :: Device -> IO String -- | Return the properties of the selected device props :: Device -> IO DeviceProperties -- | Returns a UUID for the device -- -- Requires CUDA-9.2 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1g987b46b884c101ed5be414ab4d9e60e4 uuid :: Device -> IO UUID -- | The total memory available on the device (bytes). -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__DEVICE.html#group__CUDA__DEVICE_1gc6a0d6551335a3780f9f3c967a0fde5d totalMem :: Device -> IO Int64 instance GHC.Show.Show Foreign.CUDA.Driver.Device.Device instance GHC.Classes.Eq Foreign.CUDA.Driver.Device.Device instance GHC.Show.Show Foreign.CUDA.Driver.Device.DeviceAttribute instance GHC.Classes.Eq Foreign.CUDA.Driver.Device.DeviceAttribute instance GHC.Enum.Enum Foreign.CUDA.Driver.Device.InitFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Device.DeviceAttribute -- | Context management for the low-level driver interface module Foreign.CUDA.Driver.Context.Base -- | A device context newtype Context Context :: Ptr () -> Context [useContext] :: Context -> Ptr () -- | Context creation flags data ContextFlag SchedAuto :: ContextFlag SchedSpin :: ContextFlag SchedYield :: ContextFlag SchedBlockingSync :: ContextFlag -- | Deprecated: use SchedBlockingSync instead BlockingSync :: ContextFlag SchedMask :: ContextFlag MapHost :: ContextFlag LmemResizeToMax :: ContextFlag FlagsMask :: ContextFlag -- | Create a new CUDA context and associate it with the calling thread. -- The context is created with a usage count of one, and the caller of -- create must call destroy when done using the context. If -- a context is already current to the thread, it is supplanted by the -- newly created context and must be restored by a subsequent call to -- pop. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g65dc0012348bc84810e2103a40d8e2cf create :: Device -> [ContextFlag] -> IO Context -- | Destroy the specified context, regardless of how many threads it is -- current to. The context will be popped from the current thread's -- context stack, but if it is current on any other threads it will -- remain current to those threads, and attempts to access it will result -- in an error.
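--
-- For example, a sketch that pairs create and destroy with bracket from
-- Control.Exception, so the context is destroyed even if the enclosed
-- action throws:
--
-- > withContext :: Device -> [ContextFlag] -> (Context -> IO a) -> IO a
-- > withContext dev flags = bracket (create dev flags) destroy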
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g27a365aebb0eb548166309f58a1e8b8e destroy :: Context -> IO () -- | Return the device of the currently active context -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g4e84b109eba36cdaaade167f34ae881e device :: IO Device -- | Pop the current CUDA context from the CPU thread. The context may then -- be attached to a different CPU thread by calling push. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2fac188026a062d92e91a8687d0a7902 pop :: IO Context -- | Push the given context onto the CPU's thread stack of current -- contexts. The specified context becomes the CPU thread's current -- context, so all operations that operate on the current context are -- affected. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gb02d4c850eb16f861fe5a29682cc90ba push :: Context -> IO () -- | Block until the device has completed all preceding requests. If the -- context was created with the SchedBlockingSync flag, the CPU -- thread will block until the GPU has finished its work. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g7a54725f28d34b8c6299f0c6ca579616 sync :: IO () -- | Return the context bound to the calling CPU thread. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g8f13165846b73750693640fb3e8380d0 get :: IO (Maybe Context) -- | Bind the specified context to the calling thread. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gbe562ee6258b4fcc272ca6478ca2a2f7 set :: Context -> IO () -- | Increments the usage count of the context. API: no context flags are -- currently supported, so this parameter must be empty. -- | Deprecated: as of CUDA-4.0 attach :: Context -> [ContextFlag] -> IO () -- | Detach the context, and destroy if no longer used -- | Deprecated: as of CUDA-4.0 detach :: Context -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Context.Base.Context instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Base.Context instance GHC.Enum.Bounded Foreign.CUDA.Driver.Context.Base.ContextFlag instance GHC.Show.Show Foreign.CUDA.Driver.Context.Base.ContextFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Base.ContextFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Base.ContextFlag -- | Stream management for low-level driver interface module Foreign.CUDA.Driver.Stream -- | A processing stream. All operations in a stream are synchronous and -- executed in sequence, but operations in different non-default streams -- may happen out-of-order or concurrently with one another. -- -- Use Events to synchronise operations between streams. newtype Stream Stream :: Ptr () -> Stream [useStream] :: Stream -> Ptr () -- | Priority of an execution stream. Work submitted to a higher priority -- stream may preempt execution of work already executing in a lower -- priority stream. Lower numbers represent higher priorities. 
type StreamPriority = Int type StreamCallback = FunPtr (Ptr () -> CInt -> Ptr () -> IO ()) -- | Execution stream creation flags data StreamFlag Default :: StreamFlag NonBlocking :: StreamFlag data StreamWriteFlag WriteValueDefault :: StreamWriteFlag WriteValueNoMemoryBarrier :: StreamWriteFlag data StreamWaitFlag WaitValueGeq :: StreamWaitFlag WaitValueEq :: StreamWaitFlag WaitValueAnd :: StreamWaitFlag WaitValueNor :: StreamWaitFlag WaitValueFlush :: StreamWaitFlag data StreamCallbackFlag -- | Create a new stream. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4 create :: [StreamFlag] -> IO Stream -- | Create a stream with the given priority. Work submitted to a -- higher-priority stream may preempt work already executing in a lower -- priority stream. -- -- The convention is that lower numbers represent higher priorities. The -- default priority is zero. The range of meaningful numeric priorities -- can be queried using getStreamPriorityRange. If the specified -- priority is outside the supported numerical range, it will -- automatically be clamped to the highest or lowest number in the range. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g95c1a8c7c3dacb13091692dd9c7f7471 createWithPriority :: StreamPriority -> [StreamFlag] -> IO Stream -- | Destroy a stream. If the device is still doing work in the stream when -- destroy is called, the function returns immediately and the -- resources associated with the stream will be released automatically -- once the device has completed all work. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g244c8833de4596bcd31a06cdf21ee758 destroy :: Stream -> IO () -- | Check if all operations in the stream have completed. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g1b0d24bbe97fa68e4bc511fb6adfeb0b finished :: Stream -> IO Bool -- | Wait until the device has completed all operations in the Stream. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g15e49dd91ec15991eb7c0a741beb7dad block :: Stream -> IO () -- | Add a callback to a compute stream. This function will be executed on -- the host after all currently queued items in the stream have -- completed. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483 callback :: Stream -> StreamCallback -> Ptr () -> [StreamCallbackFlag] -> IO () -- | Query the flags of a given stream -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g4d39786855a6bed01215c1907fbbfbb7 getFlags :: Stream -> IO [StreamFlag] -- | Query the priority of a stream. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g5bd5cb26915a2ecf1921807339488484 getPriority :: Stream -> IO StreamPriority -- | Query the context associated with a stream -- -- Requires CUDA-9.2. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g5bd5cb26915a2ecf1921807339488484 getContext :: Stream -> IO Context -- | Write a value to memory, (presumably) after all preceding work in the -- stream has completed.
-- | Write a value to memory, (presumably) after all preceding work in the -- stream has completed. Unless the option -- WriteValueNoMemoryBarrier is supplied, the write is preceded by -- a system-wide memory fence. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g091455366d56dc2f1f69726aafa369b0 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gc8af1e8b96d7561840affd5217dd6830 -- -- Requires CUDA-8.0 for 32-bit values. -- -- Requires CUDA-9.0 for 64-bit values. write :: Storable a => DevicePtr a -> a -> Stream -> [StreamWriteFlag] -> IO () -- | Wait on a memory location. Work ordered after the operation will block -- until the given condition on the memory is satisfied. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g629856339de7bc6606047385addbb398 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6910c1258c5f15aa5d699f0fd60d6933 -- -- Requires CUDA-8.0 for 32-bit values. -- -- Requires CUDA-9.0 for 64-bit values. wait :: Storable a => DevicePtr a -> a -> Stream -> [StreamWaitFlag] -> IO () -- | The default execution stream. This can be configured to have either -- defaultStreamLegacy or defaultStreamPerThread -- synchronisation behaviour. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStream :: Stream -- | The legacy default stream is an implicit stream which synchronises -- with all other streams in the same Context, except for -- non-blocking streams. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStreamLegacy :: Stream -- | The per-thread default stream is an implicit stream local to both the -- thread and the calling Context, and which does not synchronise -- with other streams (just like explicitly created streams). The -- per-thread default stream is not a non-blocking stream and will -- synchronise with the legacy default stream if both are used in the -- same program. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStreamPerThread :: Stream instance GHC.Show.Show Foreign.CUDA.Driver.Stream.Stream instance GHC.Classes.Eq Foreign.CUDA.Driver.Stream.Stream instance GHC.Enum.Bounded Foreign.CUDA.Driver.Stream.StreamFlag instance GHC.Show.Show Foreign.CUDA.Driver.Stream.StreamFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Stream.StreamFlag instance GHC.Enum.Bounded Foreign.CUDA.Driver.Stream.StreamWriteFlag instance GHC.Show.Show Foreign.CUDA.Driver.Stream.StreamWriteFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Stream.StreamWriteFlag instance GHC.Enum.Bounded Foreign.CUDA.Driver.Stream.StreamWaitFlag instance GHC.Show.Show Foreign.CUDA.Driver.Stream.StreamWaitFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Stream.StreamWaitFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Stream.StreamCallbackFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Stream.StreamWaitFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Stream.StreamWriteFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Stream.StreamFlag
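-- A sketch of device-side synchronisation with the stream memory
-- operations above: one stream publishes a 32-bit flag, another blocks
-- until it observes the expected value. The flag is assumed to be a device
-- allocation of a single Word32 (e.g. from
-- Foreign.CUDA.Driver.Marshal.mallocArray, listed later in this document).
--
-- > import Data.Word
-- > import Foreign.CUDA.Ptr (DevicePtr)
-- > import Foreign.CUDA.Driver.Stream
-- >
-- > gate :: DevicePtr Word32 -> Stream -> Stream -> IO ()
-- > gate flag producer consumer = do
-- >   wait  flag 1 consumer [WaitValueEq]   -- consumer stalls here ...
-- >   write flag 1 producer []              -- ... until producer releases it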
-- | Stream management routines module Foreign.CUDA.Runtime.Stream -- | A processing stream. All operations in a stream are synchronous and -- executed in sequence, but operations in different non-default streams -- may happen out-of-order or concurrently with one another. -- -- Use Events to synchronise operations between streams. newtype Stream Stream :: Ptr () -> Stream [useStream] :: Stream -> Ptr () -- | Create a new asynchronous stream create :: IO Stream -- | Destroy and clean up an asynchronous stream destroy :: Stream -> IO () -- | Determine if all operations in a stream have completed finished :: Stream -> IO Bool -- | Block until all operations in a Stream have been completed block :: Stream -> IO () -- | The default execution stream. This can be configured to have either -- defaultStreamLegacy or defaultStreamPerThread -- synchronisation behaviour. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStream :: Stream -- | The legacy default stream is an implicit stream which synchronises -- with all other streams in the same Context, except for -- non-blocking streams. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStreamLegacy :: Stream -- | The per-thread default stream is an implicit stream local to both the -- thread and the calling Context, and which does not synchronise -- with other streams (just like explicitly created streams). The -- per-thread default stream is not a non-blocking stream and will -- synchronise with the legacy default stream if both are used in the -- same program. -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#stream-sync-behavior__default-stream defaultStreamPerThread :: Stream -- | Memory management for CUDA devices module Foreign.CUDA.Runtime.Marshal -- | Options for host allocation data AllocFlag Portable :: AllocFlag DeviceMapped :: AllocFlag WriteCombined :: AllocFlag -- | Allocate a section of linear memory on the host which is page-locked -- and directly accessible from the device. The storage is sufficient to -- hold the given number of elements of a storable type. The runtime -- system automatically accelerates calls to functions such as -- peekArrayAsync and pokeArrayAsync that refer to -- page-locked memory. -- -- Note that since the amount of pageable memory is thusly reduced, -- overall system performance may suffer. This is best used sparingly to -- allocate staging areas for data exchange. mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a) -- | Free page-locked host memory previously allocated with -- mallocHostArray freeHost :: HostPtr a -> IO () -- | Allocate a section of linear memory on the device, and return a -- reference to it. The memory is sufficient to hold the given number of -- elements of storable type. It is suitably aligned, and not cleared. mallocArray :: Storable a => Int -> IO (DevicePtr a) -- | Execute a computation, passing a pointer to a temporarily allocated -- block of memory sufficient to hold the given number of elements of -- storable type. The memory is freed when the computation terminates -- (normally or via an exception), so the pointer must not be used after -- this. -- -- Note that kernel launches can be asynchronous, so you may need to add -- a synchronisation point at the end of the computation.
allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b -- | Free previously allocated memory on the device free :: DevicePtr a -> IO () -- | Options for unified memory allocations data AttachFlag Global :: AttachFlag Host :: AttachFlag Single :: AttachFlag -- | Allocates memory that will be automatically managed by the Unified -- Memory system mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a) -- | Copy a number of elements from the device to host memory. This is a -- synchronous operation. peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO () -- | Copy memory from the device asynchronously, possibly associated with a -- particular stream. The destination memory must be page locked. peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO () -- | Copy a 2D memory area from the device to the host. This is a -- synchronous operation. peekArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Ptr a -> Int -> IO () -- | Copy a 2D memory area from the device to the host asynchronously, -- possibly associated with a particular stream. The destination array -- must be page locked. peekArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> HostPtr a -> Int -> Maybe Stream -> IO () -- | Copy a number of elements from the device into a new Haskell list. -- Note that this requires two memory copies: firstly from the device -- into a heap allocated array, and from there marshalled into a list. peekListArray :: Storable a => Int -> DevicePtr a -> IO [a] -- | Copy a number of elements onto the device. This is a synchronous -- operation. pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO () -- | Copy memory onto the device asynchronously, possibly associated with a -- particular stream. The source memory must be page-locked. pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D memory area onto the device. This is a synchronous -- operation. pokeArray2D :: Storable a => Int -> Int -> Ptr a -> Int -> DevicePtr a -> Int -> IO () -- | Copy a 2D memory area onto the device asynchronously, possibly -- associated with a particular stream. The source array must be page -- locked. pokeArray2DAsync :: Storable a => Int -> Int -> HostPtr a -> Int -> DevicePtr a -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a device array. The array must -- be sufficiently large to hold the entire list. This requires two -- marshalling operations. pokeListArray :: Storable a => [a] -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second (destination). The copied areas may not overlap. This -- operation is asynchronous with respect to the host, but will not overlap -- other device operations. copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second (destination). The copied areas may not overlap. This -- operation is asynchronous with respect to the host, and may be -- associated with a particular stream. copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO ()
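-- A minimal usage sketch: round-trip a list through device memory with
-- the synchronous marshalling operations above.
--
-- > import Foreign.CUDA.Runtime.Marshal
-- >
-- > roundTrip :: [Float] -> IO [Float]
-- > roundTrip xs = do
-- >   let n = length xs
-- >   dp <- mallocArray n          -- device allocation for n elements
-- >   pokeListArray xs dp          -- host -> device
-- >   ys <- peekListArray n dp     -- device -> host
-- >   free dp
-- >   return ys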
-- | Copy a 2D memory area from the first device array (source) to the -- second (destination). The copied areas may not overlap. This operation -- is asynchronous with respect to the host, but will not overlap other -- device operations. copyArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> DevicePtr a -> Int -> IO () -- | Copy a 2D memory area from the first device array (source) to the -- second device array (destination). The copied areas may not overlap. -- This operation is asynchronous with respect to the host, and may be -- associated with a particular stream. copyArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> DevicePtr a -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a newly allocated device array. -- This is newListArrayLen composed with fst. newListArray :: Storable a => [a] -> IO (DevicePtr a) -- | Write a list of storable elements into a newly allocated device array, -- returning the device pointer together with the number of elements that -- were written. Note that this requires two copy operations: firstly -- from a Haskell list into a heap-allocated array, and from there into -- device memory. The array should be freed when no longer -- required. newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int) -- | Temporarily store a list of elements into a newly allocated device -- array. An IO action is applied to the array, the result of which is -- returned. Similar to newListArray, this requires two -- marshalling operations of the data. -- -- As with allocaArray, the memory is freed once the action -- completes, so you should not return the pointer from the action, and -- be sure that any asynchronous operations (such as kernel execution) -- have completed. withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b -- | A variant of withListArray which also supplies the number of -- elements in the array to the applied function. withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b -- | Initialise device memory to a given 8-bit value memset :: DevicePtr a -> Int64 -> Int8 -> IO () instance GHC.Enum.Bounded Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.AllocFlag instance GHC.Enum.Bounded Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Show.Show Foreign.CUDA.Runtime.Marshal.CopyDirection instance GHC.Classes.Eq Foreign.CUDA.Runtime.Marshal.CopyDirection instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.CopyDirection instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.AttachFlag instance GHC.Enum.Enum Foreign.CUDA.Runtime.Marshal.AllocFlag
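-- A usage sketch: a zero-initialised scratch buffer that lives only for
-- the duration of an action. memset works byte-wise, so the element size
-- is taken from Storable's sizeOf rather than hard-coded.
--
-- > import Data.Int
-- > import Foreign.Storable (sizeOf)
-- > import Foreign.CUDA.Ptr (DevicePtr)
-- > import Foreign.CUDA.Runtime.Marshal
-- >
-- > zeroScratch :: Int -> (DevicePtr Int32 -> IO a) -> IO a
-- > zeroScratch n go =
-- >   allocaArray n $ \dp -> do
-- >     let bytes = fromIntegral (n * sizeOf (undefined :: Int32))
-- >     memset dp bytes 0    -- byte-wise fill, so 0 is safe for any type
-- >     go dp                -- remember to synchronise before returning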
-- | Kernel execution control for C-for-CUDA runtime interface module Foreign.CUDA.Runtime.Exec -- | A global device function. -- -- Note that the use of a string naming a function was deprecated in CUDA -- 4.1 and removed in CUDA 5.0. type Fun = FunPtr () data FunAttributes FunAttributes :: !Int64 -> !Int64 -> !Int64 -> !Int -> !Int -> FunAttributes [constSizeBytes] :: FunAttributes -> !Int64 [localSizeBytes] :: FunAttributes -> !Int64 [sharedSizeBytes] :: FunAttributes -> !Int64 -- | maximum block size that can be successfully launched (based on -- register usage) [maxKernelThreadsPerBlock] :: FunAttributes -> !Int -- | number of registers required for each thread [numRegs] :: FunAttributes -> !Int -- | Kernel function parameters. Doubles will be converted to an internal -- float representation on devices that do not support doubles natively. data FunParam [IArg] :: !Int -> FunParam [FArg] :: !Float -> FunParam [DArg] :: !Double -> FunParam [VArg] :: Storable a => !a -> FunParam -- | Cache configuration preference data CacheConfig None :: CacheConfig Shared :: CacheConfig L1 :: CacheConfig Equal :: CacheConfig -- | Obtain the attributes of the named global device -- function. This itemises the requirements to successfully launch the -- given kernel. attributes :: Fun -> IO FunAttributes -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the given -- device function. This is only a preference; the driver is free to -- choose a different configuration as required to execute the function. -- -- Switching between configuration modes may insert a device-side -- synchronisation point for streamed kernel launches. setCacheConfig :: Fun -> CacheConfig -> IO () -- | Invoke a kernel on a (gx * gy) grid of blocks, where each -- block contains (tx * ty * tz) threads and has access to a -- given number of bytes of shared memory. The launch may also be -- associated with a specific Stream. launchKernel :: Fun -> (Int, Int) -> (Int, Int, Int) -> Int64 -> Maybe Stream -> [FunParam] -> IO () instance GHC.Show.Show Foreign.CUDA.Runtime.Exec.FunAttributes instance GHC.Show.Show Foreign.CUDA.Runtime.Exec.CacheConfig instance GHC.Classes.Eq Foreign.CUDA.Runtime.Exec.CacheConfig instance GHC.Enum.Enum Foreign.CUDA.Runtime.Exec.CacheConfig instance Foreign.Storable.Storable Foreign.CUDA.Runtime.Exec.FunAttributes -- | Top level bindings to the C-for-CUDA runtime API module Foreign.CUDA.Runtime -- | Top level bindings. By default, expose the C-for-CUDA runtime API -- bindings, as they are slightly more user friendly. module Foreign.CUDA -- | Requires CUDA-10 module Foreign.CUDA.Driver.Graph.Capture data Status None :: Status Active :: Status Invalidated :: Status data Mode Global :: Mode ThreadLocal :: Mode Relaxed :: Mode -- | Begin graph capture on a stream -- -- Requires CUDA-10.0 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1gea22d4496b1c8d02d0607bb05743532f start :: Stream -> Mode -> IO () -- | End graph capture on a stream -- -- Requires CUDA-10.0 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g03dab8b2ba76b00718955177a929970c stop :: Stream -> IO Graph -- | Return a stream's capture status -- -- Requires CUDA-10.0 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g37823c49206e3704ae23c7ad78560bca status :: Stream -> IO Status -- | Query the capture status of a stream and get an id for the capture -- sequence, which is unique over the lifetime of the process. -- -- Requires CUDA-10.1 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g13145ece1d79a1d79a1d22abb9663216 info :: Stream -> IO (Status, Int64)
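-- A usage sketch: record the asynchronous work submitted to a stream as a
-- task graph, which can later be instantiated and launched via
-- Foreign.CUDA.Driver.Graph.Exec (listed later in this document). The
-- Graph type is assumed to be importable from
-- Foreign.CUDA.Driver.Graph.Build.
--
-- > import Foreign.CUDA.Driver.Graph.Build   (Graph)
-- > import Foreign.CUDA.Driver.Graph.Capture
-- > import Foreign.CUDA.Driver.Stream        (Stream)
-- >
-- > captureWork :: Stream -> IO () -> IO Graph
-- > captureWork s submit = do
-- >   start s Global    -- the stream now records work instead of running it
-- >   submit            -- enqueue asynchronous operations into s
-- >   stop s            -- leave capture mode; returns the recorded graph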
-- | Set the stream capture interaction mode for this thread. Return the -- previous value. -- -- Requires CUDA-10.1 -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g378135b262f02a43a7caeab239ae493d mode :: Mode -> IO Mode instance GHC.Enum.Bounded Foreign.CUDA.Driver.Graph.Capture.Status instance GHC.Show.Show Foreign.CUDA.Driver.Graph.Capture.Status instance GHC.Classes.Eq Foreign.CUDA.Driver.Graph.Capture.Status instance GHC.Enum.Bounded Foreign.CUDA.Driver.Graph.Capture.Mode instance GHC.Show.Show Foreign.CUDA.Driver.Graph.Capture.Mode instance GHC.Classes.Eq Foreign.CUDA.Driver.Graph.Capture.Mode instance GHC.Enum.Enum Foreign.CUDA.Driver.Graph.Capture.Mode instance GHC.Enum.Enum Foreign.CUDA.Driver.Graph.Capture.Status -- | Event management for low-level driver interface module Foreign.CUDA.Driver.Event -- | Events are markers that can be inserted into the CUDA execution stream -- and later queried. newtype Event Event :: Ptr () -> Event [useEvent] :: Event -> Ptr () -- | Event creation flags data EventFlag Default :: EventFlag BlockingSync :: EventFlag DisableTiming :: EventFlag Interprocess :: EventFlag data WaitFlag -- | Create a new event -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g450687e75f3ff992fe01662a43d9d3db create :: [EventFlag] -> IO Event -- | Destroy an event -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g593ec73a8ec5a5fc031311d3e4dca1ef destroy :: Event -> IO () -- | Determine the elapsed time (in milliseconds) between two events -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1gdfb1178807353bbcaa9e245da497cf97 elapsedTime :: Event -> Event -> IO Float -- | Determines if an event has actually been recorded -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g6f0704d755066b0ee705749ae911deef query :: Event -> IO Bool -- | Record an event once all operations in the current context (or -- optionally specified stream) have completed. This operation is -- asynchronous. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g95424d3be52c4eb95d83861b70fb89d1 record :: Event -> Maybe Stream -> IO () -- | Makes all future work submitted to the (optional) stream wait until -- the given event reports completion before beginning execution. -- Synchronisation is performed on the device, including when the event -- and stream are from different device contexts. -- -- Requires CUDA-3.2. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g6a898b652dfc6aa1d5c8d97062618b2f wait :: Event -> Maybe Stream -> [WaitFlag] -> IO () -- | Wait until the event has been recorded -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EVENT.html#group__CUDA__EVENT_1g9e520d34e51af7f5375610bca4add99c block :: Event -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Event.Event instance GHC.Classes.Eq Foreign.CUDA.Driver.Event.Event instance GHC.Enum.Bounded Foreign.CUDA.Driver.Event.EventFlag instance GHC.Show.Show Foreign.CUDA.Driver.Event.EventFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Event.EventFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Event.WaitFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Event.EventFlag
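-- A usage sketch: time a queued operation with a pair of events on the
-- default stream.
--
-- > import Foreign.CUDA.Driver.Event
-- >
-- > timeIt :: IO () -> IO Float
-- > timeIt submit = do
-- >   before <- create []
-- >   after  <- create []
-- >   record before Nothing        -- mark the default stream
-- >   submit                       -- enqueue asynchronous work
-- >   record after Nothing
-- >   block after                  -- wait until 'after' has been recorded
-- >   dt <- elapsedTime before after   -- in milliseconds
-- >   destroy before
-- >   destroy after
-- >   return dt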
-- | Event management for C-for-CUDA runtime environment module Foreign.CUDA.Runtime.Event -- | Events are markers that can be inserted into the CUDA execution stream -- and later queried. data Event -- | Event creation flags data EventFlag Default :: EventFlag BlockingSync :: EventFlag DisableTiming :: EventFlag Interprocess :: EventFlag data WaitFlag -- | Create a new event create :: [EventFlag] -> IO Event -- | Destroy an event destroy :: Event -> IO () -- | Determine the elapsed time (in milliseconds) between two events elapsedTime :: Event -> Event -> IO Float -- | Determines if an event has actually been recorded query :: Event -> IO Bool -- | Record an event once all operations in the current context (or -- optionally specified stream) have completed. This operation is -- asynchronous. record :: Event -> Maybe Stream -> IO () -- | Makes all future work submitted to the (optional) stream wait until -- the given event reports completion before beginning execution. -- Synchronisation is performed on the device, including when the event -- and stream are from different device contexts. Requires CUDA-3.2. wait :: Event -> Maybe Stream -> [WaitFlag] -> IO () -- | Wait until the event has been recorded block :: Event -> IO () -- | IPC event management for low-level driver interface. -- -- Restricted to devices which support unified addressing on Linux -- operating systems. -- -- Since CUDA-4.1. module Foreign.CUDA.Driver.IPC.Event -- | A CUDA inter-process event handle. data IPCEvent -- | Create an inter-process event handle for a previously allocated event. -- The event must be created with the Interprocess and -- DisableTiming event flags. The returned handle may then be sent -- to another process and opened to allow efficient hardware -- synchronisation between GPU work in other processes. -- -- After the event has been opened in the importing process, -- record, block, wait, query may be used in -- either process. -- -- Performing operations on the imported event after the event has been -- destroyed in the exporting process is undefined. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gea02eadd12483de5305878b13288a86c export :: Event -> IO IPCEvent -- | Open an inter-process event handle for use in the current process, -- returning an event that can be used in the current process and -- behaving as a locally created event with the DisableTiming flag -- specified. -- -- The event must be freed with destroy. Performing operations on -- the imported event after the exported event has been destroyed -- in the exporting process is undefined. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf1d525918b6c643b99ca8c8e42e36c2e open :: IPCEvent -> IO Event instance GHC.Show.Show Foreign.CUDA.Driver.IPC.Event.IPCEvent instance GHC.Classes.Eq Foreign.CUDA.Driver.IPC.Event.IPCEvent -- | Memory management for low-level driver interface module Foreign.CUDA.Driver.Marshal -- | Options for host allocation data AllocFlag Portable :: AllocFlag DeviceMapped :: AllocFlag WriteCombined :: AllocFlag -- | Allocate a section of linear memory on the host which is page-locked -- and directly accessible from the device. The storage is sufficient to -- hold the given number of elements of a storable type. -- -- Note that since the amount of pageable memory is thusly reduced, -- overall system performance may suffer. This is best used sparingly to -- allocate staging areas for data exchange. -- -- Host memory allocated in this way is automatically and immediately -- accessible to all contexts on all devices which support unified -- addressing.
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gdd8311286d2c2691605362c689bc64e0 mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a) -- | As mallocHostArray, but return a ForeignPtr instead. The -- array will be deallocated automatically once the last reference to the -- ForeignPtr is dropped. mallocHostForeignPtr :: Storable a => [AllocFlag] -> Int -> IO (ForeignPtr a) -- | Free a section of page-locked host memory. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g62e0fdbe181dab6b1c90fa1a51c7b92c freeHost :: HostPtr a -> IO () -- | Page-locks the specified array (on the host) and maps it for the -- device(s) as specified by the given allocation flags. Subsequently, -- the memory is accessed directly by the device so can be read and -- written with much higher bandwidth than pageable memory that has not -- been registered. The memory range is added to the same tracking -- mechanism as mallocHostArray to automatically accelerate calls -- to functions such as pokeArray. -- -- Note that page-locking excessive amounts of memory may degrade system -- performance, since it reduces the amount of pageable memory available. -- This is best used sparingly to allocate staging areas for data -- exchange. -- -- This function has limited support on Mac OS X. OS 10.7 or later is -- required. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf0a9fe11544326dabd743b7aa6b54223 registerArray :: Storable a => [AllocFlag] -> Int -> Ptr a -> IO (HostPtr a) -- | Unmaps the memory from the given pointer, and makes it pageable again. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g63f450c8125359be87b7623b1c0b2a14 unregisterArray :: HostPtr a -> IO (Ptr a) -- | Allocate a section of linear memory on the device, and return a -- reference to it. The memory is sufficient to hold the given number of -- elements of storable type. It is suitably aligned for any type, and is -- not cleared. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb82d2a09844a58dd9e744dc31e8aa467 mallocArray :: Storable a => Int -> IO (DevicePtr a) -- | Execute a computation on the device, passing a pointer to a -- temporarily allocated block of memory sufficient to hold the given -- number of elements of storable type. The memory is freed when the -- computation terminates (normally or via an exception), so the pointer -- must not be used after this. -- -- Note that kernel launches can be asynchronous, so you may want to add -- a synchronisation point using sync as part of the continuation. allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b -- | Release a section of device memory. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g89b3f154e17cc89b6eea277dbdf5c93a free :: DevicePtr a -> IO () -- | Options for unified memory allocations data AttachFlag CuMemAttachGlobal :: AttachFlag CuMemAttachHost :: AttachFlag CuMemAttachSingle :: AttachFlag
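-- A usage sketch: a reusable page-locked staging buffer, as required on
-- the host side by the asynchronous copies below. A production version
-- would use Control.Exception.bracket so the buffer is freed even if the
-- action throws.
--
-- > import Foreign.CUDA.Ptr (HostPtr)
-- > import Foreign.CUDA.Driver.Marshal
-- >
-- > withStaging :: Int -> (HostPtr Float -> IO a) -> IO a
-- > withStaging n go = do
-- >   hp <- mallocHostArray [] n   -- page-locked, DMA-capable host memory
-- >   r  <- go hp                  -- e.g. fill hp, then pokeArrayAsync
-- >   freeHost hp
-- >   return r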
-- | Allocates memory that will be automatically managed by the Unified -- Memory system. The returned pointer is valid on the CPU and on all -- GPUs which support managed memory. All accesses to this pointer must -- obey the Unified Memory programming model. -- -- On a multi-GPU system with peer-to-peer support, where multiple GPUs -- support managed memory, the physical storage is created on the GPU -- which is active at the time mallocManagedArray is called. All -- other GPUs will access the array at reduced bandwidth via peer mapping -- over the PCIe bus. The Unified Memory system does not migrate memory -- between GPUs. -- -- On a multi-GPU system where multiple GPUs support managed memory, but -- not all pairs of such GPUs have peer-to-peer support between them, the -- physical storage is allocated in system memory (zero-copy memory) and -- all GPUs will access the data at reduced bandwidth over the PCIe bus. -- -- Requires CUDA-6.0 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gb347ded34dc326af404aa02af5388a32 mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a) -- | Pre-fetches the given number of elements to the specified destination -- device. If the specified device is Nothing, the data is pre-fetched to -- host memory. The pointer must refer to a memory range allocated with -- mallocManagedArray. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__UNIFIED.html#group__CUDA__UNIFIED_1gfe94f8b7fb56291ebcea44261aa4cb84 -- -- Requires CUDA-8.0. prefetchArrayAsync :: Storable a => DevicePtr a -> Int -> Maybe Device -> Maybe Stream -> IO () -- | Attach an array of the given number of elements to a stream -- asynchronously -- -- -- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g6e468d680e263e7eba02a56643c50533 attachArrayAsync :: forall a. Storable a => [AttachFlag] -> Stream -> DevicePtr a -> Int -> IO () -- | Copy a number of elements from the device to host memory. This is a -- synchronous operation. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g3480368ee0208a98f75019c9a8450893 peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO () -- | Copy memory from the device asynchronously, possibly associated with a -- particular stream. The destination host memory must be page-locked. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g56f30236c7c5247f8e061b59d3268362 peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the device to the host. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g27f885b30c34cc20a663a671dbf6fc27 peekArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Ptr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the device to the host asynchronously, possibly -- associated with a particular execution stream. The destination host -- memory must be page-locked. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4acf155faeb969d9d21f5433d3d0f274 peekArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> HostPtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Copy a number of elements from the device into a new Haskell list. -- Note that this requires two memory copies: firstly from the device -- into a heap allocated array, and from there marshalled into a list. peekListArray :: Storable a => Int -> DevicePtr a -> IO [a] -- | Copy a number of elements onto the device. This is a synchronous -- operation.
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4d32266788c440b0220b1a9ba5795169 pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO () -- | Copy memory onto the device asynchronously, possibly associated with a -- particular stream. The source host memory must be page-locked. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g1572263fe2597d7ba4f6964597a354a3 pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the host to the device. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g27f885b30c34cc20a663a671dbf6fc27 pokeArray2D :: Storable a => Int -> Int -> Ptr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the host to the device asynchronously, possibly -- associated with a particular execution stream. The source host memory -- must be page-locked. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4acf155faeb969d9d21f5433d3d0f274 pokeArray2DAsync :: Storable a => Int -> Int -> HostPtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Write a list of storable elements into a device array. The device -- array must be sufficiently large to hold the entire list. This -- requires two marshalling operations. pokeListArray :: Storable a => [a] -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second device (destination). The copied areas may not overlap. -- This operation is asynchronous with respect to the host, but will -- never overlap with kernel execution. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g1725774abf8b51b91945f3336b778c8b copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO () -- | Copy the given number of elements from the first device array (source) -- to the second device array (destination). The copied areas may not -- overlap. The operation is asynchronous with respect to the host, and -- can be asynchronous to other device operations by associating it with -- a particular stream. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g39ea09ba682b8eccc9c3e0c04319b5c8 copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO () -- | Copy a 2D array from the first device array (source) to the second -- device array (destination). The copied areas must not overlap. This -- operation is asynchronous with respect to the host, but will never -- overlap with kernel execution. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g27f885b30c34cc20a663a671dbf6fc27 copyArray2D :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> IO () -- | Copy a 2D array from the first device array (source) to the second -- device array (destination). The copied areas may not overlap. The -- operation is asynchronous with respect to the host, and can be -- asynchronous to other device operations by associating it with a -- particular execution stream. 
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g4acf155faeb969d9d21f5433d3d0f274 copyArray2DAsync :: Storable a => Int -> Int -> DevicePtr a -> Int -> Int -> Int -> DevicePtr a -> Int -> Int -> Int -> Maybe Stream -> IO () -- | Copies an array from device memory in one context to device memory in -- another context. Note that this function is asynchronous with respect -- to the host, but serialised with respect to all pending and future -- asynchronous work in the source and destination contexts. To avoid -- this synchronisation, use copyArrayPeerAsync instead. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1ge1f5c7771544fee150ada8853c7cbf4a copyArrayPeer :: Storable a => Int -> DevicePtr a -> Context -> DevicePtr a -> Context -> IO () -- | Copies from device memory in one context to device memory in another -- context. Note that this function is asynchronous with respect to the -- host and all work in other streams and devices. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g82fcecb38018e64b98616a8ac30112f2 copyArrayPeerAsync :: Storable a => Int -> DevicePtr a -> Context -> DevicePtr a -> Context -> Maybe Stream -> IO () -- | Write a list of storable elements into a newly allocated device array. -- This is newListArrayLen composed with fst. newListArray :: Storable a => [a] -> IO (DevicePtr a) -- | Write a list of storable elements into a newly allocated device array, -- returning the device pointer together with the number of elements that -- were written. Note that this requires two memory copies: firstly from -- a Haskell list to a heap allocated array, and from there onto the -- graphics device. The memory should be freed when no longer -- required. newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int) -- | Temporarily store a list of elements into a newly allocated device -- array. An IO action is applied to the array, the result of which is -- returned. Similar to newListArray, this requires copying the -- data twice. -- -- As with allocaArray, the memory is freed once the action -- completes, so you should not return the pointer from the action, and -- be wary of asynchronous kernel execution. withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b -- | A variant of withListArray which also supplies the number of -- elements in the array to the applied function. withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b -- | Set a number of data elements to the specified value, which may be -- either 8-, 16-, or 32-bits wide. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6e582bf866e9e2fb014297bfaf354d7b -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g7d805e610054392a4d11e8a8bf5eb35c -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g983e8d8759acd1b64326317481fbf132 memset :: Storable a => DevicePtr a -> Int -> a -> IO () -- | Set the number of data elements to the specified value, which may be -- either 8-, 16-, or 32-bits wide. The operation is asynchronous and may -- optionally be associated with a stream. -- -- Requires CUDA-3.2.
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gaef08a7ccd61112f94e82f2b30d43627 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gf731438877dd8ec875e4c43d848c878c -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g58229da5d30f1c0cdf667b320ec2c0f5 memsetAsync :: Storable a => DevicePtr a -> Int -> a -> Maybe Stream -> IO () -- | Return the device pointer associated with a mapped, pinned host -- buffer, which was allocated with the DeviceMapped option by -- mallocHostArray. -- -- Currently, no options are supported and this must be empty. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g57a39e5cba26af4d06be67fc77cc62f0 getDevicePtr :: [AllocFlag] -> HostPtr a -> IO (DevicePtr a) -- | Return the base address and allocation size of the given device -- pointer. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g64fee5711274a2a0573a789c94d8299b getBasePtr :: DevicePtr a -> IO (DevicePtr a, Int64) -- | Return the amount of free and total memory respectively available to -- the current context (bytes). -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g808f555540d0143a331cc42aa98835c0 getMemInfo :: IO (Int64, Int64) instance GHC.Enum.Bounded Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Show.Show Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Marshal.AllocFlag instance GHC.Enum.Bounded Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Show.Show Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Marshal.AttachFlag instance GHC.Enum.Enum Foreign.CUDA.Driver.Marshal.AllocFlag -- | Texture management for low-level driver interface module Foreign.CUDA.Driver.Texture -- | A texture reference newtype Texture Texture :: Ptr () -> Texture [useTexture] :: Texture -> Ptr () -- | Texture data formats data Format Word8 :: Format Word16 :: Format Word32 :: Format Int8 :: Format Int16 :: Format Int32 :: Format Half :: Format Float :: Format -- | Texture reference addressing modes data AddressMode Wrap :: AddressMode Clamp :: AddressMode Mirror :: AddressMode Border :: AddressMode -- | Texture reference filtering mode data FilterMode Point :: FilterMode Linear :: FilterMode -- | Texture read mode options data ReadMode ReadAsInteger :: ReadMode NormalizedCoordinates :: ReadMode SRGB :: ReadMode -- | Bind a linear array address of the given size (bytes) as a texture -- reference. Any previously bound references are unbound. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g44ef7e5055192d52b3d43456602b50a8 bind :: Texture -> DevicePtr a -> Int64 -> IO () -- | Bind a linear address range to the given texture reference as a -- two-dimensional arena. Any previously bound reference is unbound. Note -- that calls to setFormat can not follow a call to bind2D -- for the same texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g26f709bbe10516681913d1ffe8756ee2 bind2D :: Texture -> Format -> Int -> DevicePtr a -> (Int, Int) -> Int64 -> IO () -- | Get the addressing mode used by a texture reference, corresponding to -- the given dimension (currently the only supported dimension values are -- 0 or 1). 
-- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1gfb367d93dc1d20aab0cf8ce70d543b33 getAddressMode :: Texture -> Int -> IO AddressMode -- | Get the filtering mode used by a texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g2439e069746f69b940f2f4dbc78cdf87 getFilterMode :: Texture -> IO FilterMode -- | Get the data format and number of channel components of the bound -- texture. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g90936eb6c7c4434a609e1160c278ae53 getFormat :: Texture -> IO (Format, Int) -- | Specify the addressing mode for the given dimension of a texture -- reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g85f4a13eeb94c8072f61091489349bcb setAddressMode :: Texture -> Int -> AddressMode -> IO () -- | Specify the filtering mode to be used when reading memory through a -- texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g595d0af02c55576f8c835e4efd1f39c0 setFilterMode :: Texture -> FilterMode -> IO () -- | Specify the format of the data and number of packed components per -- element to be read by the texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g05585ef8ea2fec728a03c6c8f87cf07a setFormat :: Texture -> Format -> Int -> IO () -- | Specify additional characteristics for reading and indexing the -- texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF.html#group__CUDA__TEXREF_1g554ffd896487533c36810f2e45bb7a28 setReadMode :: Texture -> ReadMode -> IO () -- | Create a new texture reference. Once created, the application must -- call setPtr to associate the reference with allocated memory. -- Other texture reference functions are used to specify the format and -- interpretation to be used when the memory is read through this -- reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF__DEPRECATED.html#group__CUDA__TEXREF__DEPRECATED_1g0084fabe2c6d28ffcf9d9f5c7164f16c -- | Deprecated: as of CUDA version 3.2 create :: IO Texture -- | Destroy a texture reference. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXREF__DEPRECATED.html#group__CUDA__TEXREF__DEPRECATED_1gea8edbd6cf9f97e6ab2b41fc6785519d -- | Deprecated: as of CUDA version 3.2 destroy :: Texture -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Texture.Texture instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.Texture instance GHC.Show.Show Foreign.CUDA.Driver.Texture.AddressMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.AddressMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Show.Show Foreign.CUDA.Driver.Texture.Format instance GHC.Classes.Eq Foreign.CUDA.Driver.Texture.Format instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.Format instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.ReadMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.FilterMode instance GHC.Enum.Enum Foreign.CUDA.Driver.Texture.AddressMode instance Foreign.Storable.Storable Foreign.CUDA.Driver.Texture.Texture
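-- A usage sketch of the (deprecated, pre CUDA-3.2) texture reference
-- workflow above: create a reference, describe the data, and bind it to a
-- linear device allocation of n single-precision floats. Note that Float
-- in the signature is the Haskell type, while the Float passed to
-- setFormat is the Format constructor from this module.
--
-- > import Foreign.Storable (sizeOf)
-- > import Foreign.CUDA.Ptr (DevicePtr)
-- > import Foreign.CUDA.Driver.Texture
-- >
-- > setupTex :: DevicePtr Float -> Int -> IO Texture
-- > setupTex dp n = do
-- >   tex <- create
-- >   setFormat tex Float 1        -- one 32-bit float channel
-- >   setAddressMode tex 0 Clamp
-- >   setFilterMode tex Point
-- >   bind tex dp (fromIntegral (n * sizeOf (undefined :: Float)))
-- >   return tex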
-- | IPC memory management for low-level driver interface. -- -- Restricted to devices which support unified addressing on Linux -- operating systems. -- -- Since CUDA-4.0. module Foreign.CUDA.Driver.IPC.Marshal -- | A CUDA memory handle used for inter-process communication. data IPCDevicePtr a -- | Flags for controlling IPC memory access data IPCFlag LazyEnablePeerAccess :: IPCFlag -- | Create an inter-process memory handle for an existing device memory -- allocation. The handle can then be sent to another process and made -- available to that process via open. -- -- Requires CUDA-4.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1g6f1b5be767b275f016523b2ac49ebec1 export :: DevicePtr a -> IO (IPCDevicePtr a) -- | Open an inter-process memory handle exported from another process, -- returning a device pointer usable in the current process. -- -- Maps memory exported by another process with export into the current -- device address space. For contexts on different devices, open -- can attempt to enable peer access if the user called add, and -- is controlled by the LazyEnablePeerAccess flag. -- -- Each handle from a given device and context may only be opened -- by one context per device per other process. Memory returned by -- open must be freed via close. -- -- Requires CUDA-4.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1ga8bd126fcff919a0c996b7640f197b79 open :: IPCDevicePtr a -> [IPCFlag] -> IO (DevicePtr a) -- | Close and unmap memory returned by open. The original -- allocation in the exporting process as well as imported mappings in -- other processes are unaffected. -- -- Any resources used to enable peer access will be freed if this is the -- last mapping using them. -- -- Requires CUDA-4.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MEM.html#group__CUDA__MEM_1gd6f5d5bcf6376c6853b64635b0157b9e close :: DevicePtr a -> IO () instance GHC.Enum.Bounded Foreign.CUDA.Driver.IPC.Marshal.IPCFlag instance GHC.Show.Show Foreign.CUDA.Driver.IPC.Marshal.IPCFlag instance GHC.Classes.Eq Foreign.CUDA.Driver.IPC.Marshal.IPCFlag instance GHC.Show.Show (Foreign.CUDA.Driver.IPC.Marshal.IPCDevicePtr a) instance GHC.Classes.Eq (Foreign.CUDA.Driver.IPC.Marshal.IPCDevicePtr a) instance GHC.Enum.Enum Foreign.CUDA.Driver.IPC.Marshal.IPCFlag
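-- A usage sketch: share a device allocation between two processes. How
-- the opaque handle travels between them (pipe, socket, shared file, ...)
-- is up to the application and elided here.
--
-- > import Foreign.CUDA.Ptr (DevicePtr)
-- > import Foreign.CUDA.Driver.IPC.Marshal
-- >
-- > -- in the exporting process:
-- > shareOut :: DevicePtr Float -> IO (IPCDevicePtr Float)
-- > shareOut = export
-- >
-- > -- in the importing process, after receiving the handle:
-- > shareIn :: IPCDevicePtr Float -> (DevicePtr Float -> IO a) -> IO a
-- > shareIn h go = do
-- >   dp <- open h [LazyEnablePeerAccess]
-- >   r  <- go dp                -- use dp like a normal device pointer
-- >   close dp
-- >   return r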
-- | Primary context management for low-level driver interface. The primary -- context is unique per device and shared with the Runtime API. This -- allows integration with other libraries using CUDA. -- -- Since: CUDA-7.0 module Foreign.CUDA.Driver.Context.Primary -- | Get the status of the primary context. Returns whether the current -- context is active, and the flags it was (or will be) created with. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1g65f3e018721b6d90aa05cfb56250f469 status :: Device -> IO (Bool, [ContextFlag]) -- | Specify the flags that the primary context should be created with. -- Note that this is an error if the primary context is already active. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1gd779a84f17acdad0d9143d9fe719cfdf setup :: Device -> [ContextFlag] -> IO () -- | Destroy all allocations and reset all state on the primary context of -- the given device in the current process. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1g5d38802e8600340283958a117466ce12 reset :: Device -> IO () -- | Retain the primary context for the given device, creating it if -- necessary, and increasing its usage count. The caller must call -- release when done using the context. Unlike create the -- newly retained context is not pushed onto the stack. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1g9051f2d5c31501997a6cb0530290a300 retain :: Device -> IO Context -- | Release the primary context on the given device. If there are no more -- references to the primary context it will be destroyed, regardless of -- how many threads it is current to. -- -- Unlike pop this does not pop the context from the stack in any -- circumstances. -- -- Requires CUDA-7.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PRIMARY__CTX.html#group__CUDA__PRIMARY__CTX_1gf2a8bc16f8df0c88031f6a1ba3d6e8ad release :: Device -> IO () -- | Direct peer context access functions for the low-level driver -- interface. -- -- Since: CUDA-4.0 module Foreign.CUDA.Driver.Context.Peer -- | Possible option values for direct peer memory access data PeerFlag -- | Peer-to-peer attributes data PeerAttribute PerformanceRank :: PeerAttribute AccessSupported :: PeerAttribute NativeAtomicSupported :: PeerAttribute AccessAccessSupported :: PeerAttribute CudaArrayAccessSupported :: PeerAttribute -- | Queries if the first device can directly access the memory of the -- second. If direct access is possible, it can then be enabled with -- add. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g496bdaae1f632ebfb695b99d2c40f19e accessible :: Device -> Device -> IO Bool -- | If the devices of both the current and supplied contexts support -- unified addressing, then enable allocations in the supplied context to -- be accessible by the current context. -- -- Note that access is unidirectional, and in order to access memory in -- the current context from the peer context, a separate symmetric call -- to add is required. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g0889ec6728e61c05ed359551d67b3f5a add :: Context -> [PeerFlag] -> IO () -- | Disable direct memory access from the current context to the supplied -- peer context, and unregisters any registered allocations. -- -- Requires CUDA-4.0. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g5b4b6936ea868d4954ce4d841a3b4810 remove :: Context -> IO () -- | Queries attributes of the link between two devices -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__PEER__ACCESS.html#group__CUDA__PEER__ACCESS_1g4c55c60508f8eba4546b51f2ee545393 -- -- Requires CUDA-8.0 -- --
-- since 0.9.0.0 --
getAttribute :: PeerAttribute -> Device -> Device -> IO Int instance GHC.Show.Show Foreign.CUDA.Driver.Context.Peer.PeerAttribute instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Peer.PeerAttribute instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Peer.PeerAttribute instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Peer.PeerFlag -- | Context configuration for the low-level driver interface module Foreign.CUDA.Driver.Context.Config -- | Return the flags that were used to create the current context. -- -- Requires CUDA-7.0 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gf81eef983c1e3b2ef4f166d7a930c86d getFlags :: IO [ContextFlag] -- | Device limits flags data Limit StackSize :: Limit PrintfFifoSize :: Limit MallocHeapSize :: Limit DevRuntimeSyncDepth :: Limit DevRuntimePendingLaunchCount :: Limit MaxL2FetchGranularity :: Limit Max :: Limit -- | Query compute 2.0 call stack limits. -- -- Requires CUDA-3.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g9f2d47d1745752aa16da7ed0d111b6a8 getLimit :: Limit -> IO Int -- | Specify the size of the call stack, for compute 2.0 devices. -- -- Requires CUDA-3.1. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g0651954dfb9788173e60a9af7201e65a setLimit :: Limit -> Int -> IO () -- | Device cache configuration preference data Cache PreferNone :: Cache PreferShared :: Cache PreferL1 :: Cache PreferEqual :: Cache -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this function returns the preferred cache configuration for -- the current context. -- -- Requires CUDA-3.2. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g40b6b141698f76744dea6e39b9a25360 getCache :: IO Cache -- | On devices where the L1 cache and shared memory use the same hardware -- resources, this sets the preferred cache configuration for the current -- context. This is only a preference. -- -- Any function configuration set via setCacheConfigFun will be -- preferred over this context-wide setting. -- -- Requires CUDA-3.2. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g54699acf7e2ef27279d013ca2095f4a3 setCache :: Cache -> IO () -- | Device shared memory configuration preference data SharedMem DefaultBankSize :: SharedMem FourByteBankSize :: SharedMem EightByteBankSize :: SharedMem -- | Return the current size of the shared memory banks in the current -- context. On devices with configurable shared memory banks, -- setSharedMem can be used to change the configuration, so that -- subsequent kernel launches will by default use the new bank size. On -- devices without configurable shared memory, this function returns the -- fixed bank size of the hardware. -- -- Requires CUDA-4.2 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74 getSharedMem :: IO SharedMem
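-- A minimal usage sketch of the limit accessors above: kernels that call
-- malloc on the device are bounded by MallocHeapSize, so double it before
-- launching them.
--
-- > import Foreign.CUDA.Driver.Context.Config
-- >
-- > growDeviceHeap :: IO ()
-- > growDeviceHeap = do
-- >   n <- getLimit MallocHeapSize
-- >   setLimit MallocHeapSize (n * 2)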
-- | On devices with configurable shared memory banks, this function will -- set the context's shared memory bank size that will be used by default -- for subsequent kernel launches. -- -- Changing the shared memory configuration between launches may insert a -- device synchronisation. -- -- Shared memory bank size does not affect shared memory usage or kernel -- occupancy, but may have major effects on performance. Larger bank -- sizes allow for greater potential bandwidth to shared memory, but -- change the kinds of accesses which result in bank conflicts. -- -- Requires CUDA-4.2 -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692 setSharedMem :: SharedMem -> IO () -- | Priority of an execution stream. Work submitted to a higher priority -- stream may preempt execution of work already executing in a lower -- priority stream. Lower numbers represent higher priorities. type StreamPriority = Int -- | Returns the numerical values that correspond to the greatest and least -- priority execution streams in the current context respectively. Stream -- priorities follow the convention that lower numbers -- correspond to higher priorities. The range of meaningful stream -- priorities is given by the inclusive range -- [greatestPriority,leastPriority]. -- -- Requires CUDA-5.5. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g137920ab61a71be6ce67605b9f294091 getStreamPriorityRange :: IO (StreamPriority, StreamPriority) instance GHC.Show.Show Foreign.CUDA.Driver.Context.Config.Limit instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Config.Limit instance GHC.Show.Show Foreign.CUDA.Driver.Context.Config.Cache instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Config.Cache instance GHC.Show.Show Foreign.CUDA.Driver.Context.Config.SharedMem instance GHC.Classes.Eq Foreign.CUDA.Driver.Context.Config.SharedMem instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Config.SharedMem instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Config.Cache instance GHC.Enum.Enum Foreign.CUDA.Driver.Context.Config.Limit -- | Context management for low-level driver interface module Foreign.CUDA.Driver.Context -- | Unified addressing functions for the low-level driver interface -- --
-- since 0.9.0.0 --
launchKernelCooperative :: Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> Maybe Stream -> [FunParam] -> IO () -- | Specify the (x,y,z) dimensions of the thread blocks that are -- created when the given kernel function is launched. -- | Deprecated: use launchKernel instead setBlockShape :: Fun -> (Int, Int, Int) -> IO () -- | Set the number of bytes of dynamic shared memory to be available to -- each thread block when the function is launched -- | Deprecated: use launchKernel instead setSharedSize :: Fun -> Integer -> IO () -- | Set the parameters that will be specified next time the kernel is invoked -- | Deprecated: use launchKernel instead setParams :: Fun -> [FunParam] -> IO () -- | Invoke the kernel on a size (w,h) grid of blocks. Each block -- contains the number of threads specified by a previous call to -- setBlockShape. The launch may also be associated with a -- specific Stream. -- | Deprecated: use launchKernel instead launch :: Fun -> (Int, Int) -> Maybe Stream -> IO () instance GHC.Show.Show Foreign.CUDA.Driver.Exec.FunAttribute instance GHC.Classes.Eq Foreign.CUDA.Driver.Exec.FunAttribute instance Foreign.Storable.Storable Foreign.CUDA.Driver.Exec.FunParam instance GHC.Enum.Enum Foreign.CUDA.Driver.Exec.FunAttribute -- | Querying module attributes for low-level driver interface module Foreign.CUDA.Driver.Module.Query -- | Returns a function handle. -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1ga52be009b0d4045811b30c965e1cb2cf getFun :: Module -> ShortByteString -> IO Fun -- | Return a global pointer, and size of the global (in bytes). -- -- -- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1gf3e43672e26073b1081476dbf47a86ab getPtr :: Module -> ShortByteString -> IO (DevicePtr a, Int) -- | Return a handle to a texture reference. This texture reference handle -- should not be destroyed, as the texture will be destroyed -- automatically when the module is unloaded.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g9607dcbf911c16420d5264273f2b5608
getTex :: Module -> ShortByteString -> IO Texture

-- | Module management for low-level driver interface
module Foreign.CUDA.Driver.Module

-- | Device code formats that can be used for online linking
data JITInputType
Cubin :: JITInputType
PTX :: JITInputType
Fatbinary :: JITInputType
Object :: JITInputType
Library :: JITInputType
CuJitNumInputTypes :: JITInputType

-- | Online compilation fallback strategy
data JITFallback
PreferPTX :: JITFallback
PreferBinary :: JITFallback

-- | Online compilation target architecture
data JITTarget
Compute20 :: JITTarget
Compute21 :: JITTarget
Compute30 :: JITTarget
Compute32 :: JITTarget
Compute35 :: JITTarget
Compute37 :: JITTarget
Compute50 :: JITTarget
Compute52 :: JITTarget
Compute53 :: JITTarget
Compute60 :: JITTarget
Compute61 :: JITTarget
Compute62 :: JITTarget
Compute70 :: JITTarget
Compute72 :: JITTarget
Compute75 :: JITTarget

-- | Results of online compilation
data JITResult
JITResult :: !Float -> !ByteString -> !Module -> JITResult

-- | milliseconds spent compiling PTX
[jitTime] :: JITResult -> !Float

-- | information about PTX assembly
[jitInfoLog] :: JITResult -> !ByteString

-- | the compiled module
[jitModule] :: JITResult -> !Module

-- | Just-in-time compilation and linking options
data JITOption

-- | maximum number of registers per thread
MaxRegisters :: !Int -> JITOption

-- | number of threads per block to target compilation for
ThreadsPerBlock :: !Int -> JITOption

-- | level of optimisation to apply (1-4, default 4)
OptimisationLevel :: !Int -> JITOption

-- | compilation target, otherwise determined from context
Target :: !Compute -> JITOption

-- | fallback strategy if matching cubin not found
FallbackStrategy :: !JITFallback -> JITOption

-- | generate debug info (-g) (requires cuda >= 5.5)
GenerateDebugInfo :: JITOption

-- | generate line number information (-lineinfo) (requires cuda >= 5.5)
GenerateLineInfo :: JITOption

-- | verbose log messages (requires cuda >= 5.5)
Verbose :: JITOption

-- | A reference to a Module object, containing collections of device
-- functions
newtype Module
Module :: Ptr () -> Module

-- | Load the contents of the specified file (either a .ptx or .cubin file)
-- to create a new module, and load that module into the current context.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g366093bd269dafd0af21f1c7d18115d3
loadFile :: FilePath -> IO Module

-- | Load the contents of the given image into a new module, and load that
-- module into the current context. The image is (typically) the contents
-- of a cubin or PTX file.
--
-- Note that the ByteString will be copied into a temporary
-- staging area so that it can be passed to C.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g04ce266ce03720f479eab76136b90c0b
loadData :: ByteString -> IO Module

-- | As loadData, but read the image data from the given pointer.
-- The image is a NULL-terminated sequence of bytes.
loadDataFromPtr :: Ptr Word8 -> IO Module

-- | Load the contents of the given image into a module with online
-- compiler options, and load the module into the current context. The
-- image is (typically) the contents of a cubin or PTX file. The actual
-- attributes of the compiled kernel can be probed using
-- requires.
--
-- Note that the ByteString will be copied into a temporary
-- staging area so that it can be passed to C.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g9e8047e9dbf725f0cd7cafd18bfd4d12
loadDataEx :: ByteString -> [JITOption] -> IO JITResult

-- | As loadDataEx, but read the image data from the given pointer.
-- The image is a NULL-terminated sequence of bytes.
loadDataFromPtrEx :: Ptr Word8 -> [JITOption] -> IO JITResult

-- | Unload a module from the current context.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__MODULE.html#group__CUDA__MODULE_1g8ea3d716524369de3763104ced4ea57b
unload :: Module -> IO ()

-- | Graph execution functions for the low-level driver interface
--
-- Requires CUDA-10
module Foreign.CUDA.Driver.Graph.Exec
newtype Executable
Executable :: Ptr () -> Executable
[useExecutable] :: Executable -> Ptr ()

-- | Execute a graph in the given stream. Only one instance may execute at
-- a time; to execute a graph concurrently, it must be
-- instantiated into multiple executables.
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g6b2dceb3901e71a390d2bd8b0491e471
launch :: Executable -> Stream -> IO ()

-- | Instantiate the task graph description of a program into an executable
-- graph.
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g433ae118a751c9f2087f53d7add7bc2c
instantiate :: Graph -> IO Executable

-- | Destroy an executable graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1ga32ad4944cc5d408158207c978bc43a7
destroy :: Executable -> IO ()

-- | Update the parameters for a kernel node in the given executable graph
--
-- Requires CUDA-10.1
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gd84243569e4c3d6356b9f2eea20ed48c
setKernel :: Executable -> Node -> Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> [FunParam] -> IO ()

-- | Graph construction functions for the low-level driver interface
--
-- Requires CUDA-10
module Foreign.CUDA.Driver.Graph.Build
newtype Graph
Graph :: Ptr () -> Graph
[useGraph] :: Graph -> Ptr ()
newtype Node
Node :: Ptr () -> Node
[useNode] :: Node -> Ptr ()
data NodeType
Kernel :: NodeType
Memcpy :: NodeType
Memset :: NodeType
Host :: NodeType
Subgraph :: NodeType
Empty :: NodeType
Count :: NodeType

-- | Callback function executed on the host
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1g262cd3570ff5d396db4e3dabede3c355
type HostCallback = FunPtr (Ptr () -> IO ())

-- | Create an empty task graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gd885f719186010727b75c3315f865fdf
create :: [GraphFlag] -> IO Graph

-- | Destroy a graph, as well as all of its nodes
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g718cfd9681f078693d4be2426fd689c8
destroy :: Graph -> IO ()

-- | Clone a graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g3603974654e463f2231c71d9b9d1517e
clone :: Graph -> IO Graph

-- | Remove a node from the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g00ed16434d983d8f0011683eacaf19b9
remove :: Node -> IO ()
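
-- A minimal sketch of the intended workflow, combining the construction
-- operations of this module (addKernel is described below) with
-- instantiation and launch from Foreign.CUDA.Driver.Graph.Exec. The
-- kernel handles step1 and step2 and the stream are assumed to be set up
-- elsewhere, and Fun and Stream are assumed to be re-exported by the
-- umbrella Foreign.CUDA.Driver module, as in the tutorial below.
-- Dependencies are expressed here through the [Node] argument of
-- addKernel:
--
-- import Foreign.CUDA.Driver (Fun, Stream)
-- import qualified Foreign.CUDA.Driver.Graph.Build as Build
-- import qualified Foreign.CUDA.Driver.Graph.Exec as Exec
--
-- buildAndRun :: Fun -> Fun -> Stream -> IO ()
-- buildAndRun step1 step2 stream = do
--   g   <- Build.create []                                      -- empty task graph
--   n1  <- Build.addKernel g []   step1 (1,1,1) (256,1,1) 0 []  -- root node
--   _n2 <- Build.addKernel g [n1] step2 (1,1,1) (256,1,1) 0 []  -- runs after n1
--   e   <- Exec.instantiate g                                   -- compile to executable form
--   Exec.launch e stream
--   Exec.destroy e
--   Build.destroy g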

-- | Create a child graph node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g3f27c2e56e3d568b09f00d438e61ceb1
addChild :: Graph -> Graph -> [Node] -> IO Node

-- | Create an empty node and add it to the graph.
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g8a8681dbe97dbbb236ea5ebf3abe2ada
addEmpty :: Graph -> [Node] -> IO Node

-- | Create a host execution node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g1ba15c2fe1afb8897091ecec4202b597
addHost :: Graph -> [Node] -> HostCallback -> Ptr () -> IO Node

-- | Create a kernel execution node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g886a9096293238937f2f3bc7f2d57635
addKernel :: Graph -> [Node] -> Fun -> (Int, Int, Int) -> (Int, Int, Int) -> Int -> [FunParam] -> IO Node

-- | Create a memcpy node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gdd521e1437c1c3ea8822f66a32ff1f94
addMemcpy :: Graph -> [Node] -> Context -> Int -> Int -> Int -> Int -> MemoryType -> Ptr a -> Int -> Int -> Int -> Int -> Int -> Int -> MemoryType -> Ptr a -> Int -> Int -> Int -> Int -> Int -> IO Node

-- | Create a memset node and add it to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gac7f59961798f14a9f94f9f6b53cc3b7
addMemset :: Storable a => Graph -> [Node] -> Context -> DevicePtr a -> a -> Int -> Int -> Int -> IO Node

-- | Add dependency edges to the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g81bf1a6965f881be6ad8d21cfe0ee44f
addDependencies :: Graph -> [(Node, Node)] -> IO ()

-- | Remove dependency edges from the graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g8ab696a6b3ccd99db47feba7e97fb579
removeDependencies :: Graph -> [(Node, Node)] -> IO ()

-- | Return the type of a node
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gdb1776d97aa1c9d5144774b29e4b8c3e
getType :: Node -> IO NodeType

-- | Retrieve the embedded graph of a child sub-graph node
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gbe9fc9267316b3778ef0db507917b4fd
getChildGraph :: Node -> IO Graph

-- | Return a graph's dependency edges
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g2b7bd71b0b2b8521f141996e0975a0d7
getEdges :: Graph -> IO [(Node, Node)]

-- | Return a graph's nodes
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gfa35a8e2d2fc32f48dbd67ba27cf27e5
getNodes :: Graph -> IO [Node]

-- | Return the root nodes of a graph
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gf8517646bd8b39ab6359f8e7f0edffbd
getRootNodes :: Graph -> IO [Node]

-- | Return the dependencies of a node
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g048f4c0babcbba64a933fc277cd45083
getDependencies :: Node -> IO [Node]

-- | Return a node's dependent nodes
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1g4b73d9e3b386a9c0b094a452b8431f59
getDependents :: Node -> IO [Node]

-- | Find a cloned version of a node
--
-- Requires CUDA-10.0
--
-- https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GRAPH.html#group__CUDA__GRAPH_1gf21f6c968e346f028737c1118bfd41c2
findInClone :: Node -> Graph -> IO Node

-- | This module defines an interface to the CUDA driver API. The Driver
-- API is a lower-level interface to CUDA devices than that provided by
-- the Runtime API. Using the Driver API, the programmer must deal
-- explicitly with operations such as initialisation, context management,
-- and loading (kernel) modules. Although more difficult to use
-- initially, the Driver API provides more control over how CUDA is used.
-- Furthermore, since it does not require compiling and linking the
-- program with nvcc, the Driver API provides better
-- inter-language compatibility.
--
-- The following is a short tutorial on using the Driver API. The steps
-- can be copied into a file, or run directly in ghci, in which
-- case ghci should be launched with the option
-- -fno-ghci-sandbox. This is because CUDA maintains CPU-local
-- state, so operations should always be run from a bound thread.
--
-- First, we must import the library and initialise the CUDA environment:
--
-- >>> import Foreign.CUDA.Driver
--
-- >>> initialise []
--
-- Next, we must select a GPU that we will execute operations on. Each
-- GPU is assigned a unique identifier (beginning at zero). We can get a
-- handle to a compute device at a given ordinal using the device
-- operation. Given a device handle, we can query the properties of that
-- device using props. The number of available CUDA-capable
-- devices is given via count. For example:
--
-- >>> count
-- 1
--
-- >>> dev0 <- device 0
--
-- >>> props dev0
-- DeviceProperties {deviceName = "GeForce GT 650M", computeCapability = 3.0, ...}
--
--
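-- The fields of the DeviceProperties record can also be inspected
-- directly; for example (a sketch: the values shown are illustrative,
-- and the totalGlobalMem field name is an assumption about the record):
--
-- >>> computeCapability <$> props dev0
-- 3.0
--
-- >>> totalGlobalMem <$> props dev0
-- 1073741824
--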
-- This package also includes the executable 'nvidia-device-query', which
-- when executed displays the key properties of all available devices.
-- See Foreign.CUDA.Driver.Device for additional operations to
-- query the capabilities or status of a device.
--
-- Once you have chosen a device to use, the next step is to create a
-- CUDA context. A context is associated with a particular device, and
-- all operations, such as memory allocation and kernel execution, take
-- place with respect to that context. For example, to create a
-- new execution context on CUDA device 0:
--
-- >>> ctx <- create dev0 []
--
-- The second argument is a set of ContextFlags which control how
-- the context behaves in various situations, for example, whether or not
-- the CPU should actively spin when waiting for results from the GPU
-- (SchedSpin), or yield control to other threads instead
-- (SchedYield).
--
-- The newly created context is now the active context, and all
-- subsequent operations take place within that context. More than one
-- context can be created per device, but resources, such as memory
-- allocated on the GPU, are unique to each context. The module
-- Foreign.CUDA.Driver.Context contains operations for managing
-- multiple contexts. Some devices allow data to be shared between
-- contexts without copying; see Foreign.CUDA.Driver.Context.Peer
-- for more information.
--
-- Once the context is no longer needed, it should be destroyed in
-- order to free up any resources that were allocated to it.
--
-- >>> destroy ctx
--
-- Each device also has a unique context which is used by the Runtime
-- API. This context can be accessed with the module
-- Foreign.CUDA.Driver.Context.Primary.
--
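-- Since every create should be paired with a matching destroy, it can
-- be convenient to bracket the lifetime of a context. A minimal sketch
-- using Control.Exception from base (the withContext helper is
-- hypothetical, not part of this package):
--
-- import Foreign.CUDA.Driver
-- import Control.Exception (bracket)
--
-- withContext :: Device -> [ContextFlag] -> (Context -> IO a) -> IO a
-- withContext dev flags = bracket (create dev flags) destroy
--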
-- Suppose we wish to execute the following simple function, which
-- element-wise adds the values of two arrays, on the GPU:
--
-- >>> vecAdd xs ys = zipWith (+) xs ys
--
-- The following CUDA kernel can be used to implement this on the GPU:
--
-- extern "C" __global__ void vecAdd(float *xs, float *ys, float *zs, int N)
-- {
-- int ix = blockIdx.x * blockDim.x + threadIdx.x;
--
-- if ( ix < N ) {
-- zs[ix] = xs[ix] + ys[ix];
-- }
-- }
--
--
-- Here, the __global__ keyword marks the function as a
-- kernel that should be computed on the GPU in data-parallel fashion. When we
-- execute this function on the GPU, (at least) N threads will
-- execute N individual instances of the kernel function
-- vecAdd. Each thread will operate on a single element of each
-- input array to create a single value in the result. See the CUDA
-- programming guide for more details.
--
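-- In general, for n elements and a fixed number of threads per block,
-- the number of thread blocks required is a rounded-up division; for
-- example, for the 1024-element arrays used below:
--
-- >>> let numBlocks n threadsPerBlock = (n + threadsPerBlock - 1) `div` threadsPerBlock
-- >>> numBlocks 1024 256
-- 4
--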
-- We can save this to a file vector_add.cu, and compile it
-- using nvcc into a form that we can then load onto the GPU and
-- execute:
--
-- $ nvcc --ptx vector_add.cu
--
-- The module Foreign.CUDA.Driver.Module contains functions for
-- loading the resulting .ptx (or .cubin) files into the running
-- program.
--
-- >>> mdl <- loadFile "vector_add.ptx"
--
-- Once finished with the module, it is also a good idea to unload
-- it.
--
-- Modules may export kernel functions, global variables, and texture
-- references. Before we can execute our function, we need to look it up
-- in the module by name:
--
-- >>> vecAdd <- getFun mdl "vecAdd"
--
-- Given this reference to our kernel function, we are almost ready to
-- execute it on the device using launchKernel, but first we must
-- create some data for the function to operate on:
--
-- >>> let xs = [1..1024] :: [Float]
--
-- >>> let ys = [2,4..2048] :: [Float]
--
-- In CUDA, as in C, all memory management is explicit: arrays on the
-- device must be explicitly allocated and freed, and, as mentioned
-- previously, data transfer between host and device is also explicit.
-- However, we do provide convenience functions for combined allocation
-- and marshalling, as well as bracketed operations:
--
-- >>> xs_dev <- newListArray xs
--
-- >>> ys_dev <- newListArray ys
--
-- >>> zs_dev <- mallocArray 1024 :: IO (DevicePtr Float)
--
-- After executing the kernel (shown below), we transfer the result
-- back to the host, and free the memory that was allocated on the GPU:
--
-- >>> zs <- peekListArray 1024 zs_dev
--
-- >>> free xs_dev
--
-- >>> free ys_dev
--
-- >>> free zs_dev
--
-- To launch the kernel, we supply the function handle, the grid
-- dimensions (number of thread blocks), the thread block dimensions
-- (number of threads per block), the amount of dynamic shared memory, an
-- optional execution stream, and the function arguments. Here, a grid of
-- 4 blocks of 256 threads each covers all 1024 elements of the input:
--
-- >>> launchKernel vecAdd (4,1,1) (256,1,1) 0 Nothing [VArg xs_dev, VArg ys_dev, VArg zs_dev, IArg 1024]
--
-- Note that kernel execution is asynchronous, so we should also wait for
-- the operation to complete before attempting to read the results back:
--
-- >>> sync
--
-- And that's it!
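--
-- For reference, the whole tutorial as a standalone program might look
-- as follows. This is a sketch only: error handling is omitted, and the
-- kernel is assumed to have been compiled to vector_add.ptx as above.
--
-- import Foreign.CUDA.Driver
--
-- main :: IO ()
-- main = do
--   initialise []                                 -- set up the CUDA environment
--   dev    <- device 0                            -- first available device
--   ctx    <- create dev []                       -- new context on that device
--   mdl    <- loadFile "vector_add.ptx"
--   vecAdd <- getFun mdl "vecAdd"
--   let xs = [1..1024]   :: [Float]
--       ys = [2,4..2048] :: [Float]
--   xs_dev <- newListArray xs                     -- allocate and upload the inputs
--   ys_dev <- newListArray ys
--   zs_dev <- mallocArray 1024 :: IO (DevicePtr Float)
--   launchKernel vecAdd (4,1,1) (256,1,1) 0 Nothing
--     [VArg xs_dev, VArg ys_dev, VArg zs_dev, IArg 1024]
--   sync                                          -- kernel launch is asynchronous
--   zs <- peekListArray 1024 zs_dev               -- copy the result back
--   print (take 10 zs)
--   mapM_ free [xs_dev, ys_dev, zs_dev]           -- release device memory
--   unload mdl
--   destroy ctx
--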
-- since 0.9.0.0
getAttribute :: PeerAttribute -> Device -> Device -> IO Int

-- | Device shared memory configuration preference
data SharedMem
DefaultBankSize :: SharedMem
FourByteBankSize :: SharedMem
EightByteBankSize :: SharedMem

-- | Device cache configuration preference
data Cache
PreferNone :: Cache
PreferShared :: Cache
PreferL1 :: Cache
PreferEqual :: Cache

-- | Device limits flags
data Limit
StackSize :: Limit
PrintfFifoSize :: Limit
MallocHeapSize :: Limit
DevRuntimeSyncDepth :: Limit
DevRuntimePendingLaunchCount :: Limit
MaxL2FetchGranularity :: Limit
Max :: Limit

-- | Return the flags that were used to create the current context.
--
-- Requires CUDA-7.0
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1gf81eef983c1e3b2ef4f166d7a930c86d
getFlags :: IO [ContextFlag]

-- | Query compute 2.0 call stack limits.
--
-- Requires CUDA-3.1.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g9f2d47d1745752aa16da7ed0d111b6a8
getLimit :: Limit -> IO Int

-- | Specify the size of the call stack, for compute 2.0 devices.
--
-- Requires CUDA-3.1.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g0651954dfb9788173e60a9af7201e65a
setLimit :: Limit -> Int -> IO ()

-- | On devices where the L1 cache and shared memory use the same hardware
-- resources, this function returns the preferred cache configuration for
-- the current context.
--
-- Requires CUDA-3.2.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g40b6b141698f76744dea6e39b9a25360
getCache :: IO Cache

-- | On devices where the L1 cache and shared memory use the same hardware
-- resources, this sets the preferred cache configuration for the current
-- context. This is only a preference.
--
-- Any function configuration set via setCacheConfigFun will be
-- preferred over this context-wide setting.
--
-- Requires CUDA-3.2.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g54699acf7e2ef27279d013ca2095f4a3
setCache :: Cache -> IO ()

-- | Return the current size of the shared memory banks in the current
-- context. On devices with configurable shared memory banks,
-- setSharedMem can be used to change the configuration, so that
-- subsequent kernel launches will by default use the new bank size. On
-- devices without configurable shared memory, this function returns the
-- fixed bank size of the hardware.
--
-- Requires CUDA-4.2.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g17153a1b8b8c756f7ab8505686a4ad74
getSharedMem :: IO SharedMem

-- | On devices with configurable shared memory banks, this function will
-- set the context's shared memory bank size that will be used by default
-- for subsequent kernel launches.
--
-- Changing the shared memory configuration between launches may insert a
-- device synchronisation.
--
-- Shared memory bank size does not affect shared memory usage or kernel
-- occupancy, but may have major effects on performance. Larger bank
-- sizes allow for greater potential bandwidth to shared memory, but
-- change the kinds of accesses which result in bank conflicts.
--
-- Requires CUDA-4.2.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g2574235fa643f8f251bf7bc28fac3692
setSharedMem :: SharedMem -> IO ()

-- | Returns the numerical values that correspond to the greatest and least
-- priority execution streams in the current context respectively. Stream
-- priorities follow the convention that lower numerical values
-- correspond to higher priorities. The range of meaningful stream
-- priorities is given by the inclusive range
-- [greatestPriority,leastPriority].
--
-- Requires CUDA-5.5.
--
-- http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__CTX.html#group__CUDA__CTX_1g137920ab61a71be6ce67605b9f294091
getStreamPriorityRange :: IO (StreamPriority, StreamPriority)

-- | Occupancy calculations for CUDA kernels
--
-- http://developer.download.nvidia.com/compute/cuda/3_0/sdk/docs/CUDA_Occupancy_calculator.xls
--
-- Determining Registers Per Thread and Shared Memory Per Block
--
-- To determine the number of registers used per thread in your kernel,
-- simply compile the kernel code using the option
--
--   --ptxas-options=-v
--
-- to nvcc. This will output information about register, local memory,
-- shared memory, and constant memory usage for each kernel in the
-- .cu file. Alternatively, you can compile with the
-- -cubin option to nvcc. This will generate a .cubin
-- file, which you can open in a text editor. Look for the code
-- section with your kernel's name. Within the curly braces ({ ... })
-- for that code block, you will see a line with reg = X, where
-- X is the number of registers used by your kernel. You can also
-- see the amount of shared memory used as smem = Y. However, if
-- your kernel declares any external shared memory that is allocated
-- dynamically, you will need to add the number in the .cubin
-- file to the amount you dynamically allocate at run time to get the
-- correct shared memory usage.
--
-- Notes About Occupancy
--
-- Higher occupancy does not necessarily mean higher performance. If a
-- kernel is not bandwidth bound, then increasing occupancy will not
-- necessarily increase performance. If a kernel invocation is already
-- running at least one thread block per multiprocessor in the GPU, and
-- it is bottlenecked by computation and not by global memory accesses,
-- then increasing occupancy may have no effect. In fact, making changes
-- just to increase occupancy can have other effects, such as additional
-- instructions, spills to local memory (which is off chip), divergent
-- branches, etc. As with any optimisation, you should experiment to see
-- how changes affect the *wall clock time* of the kernel execution. For
-- bandwidth bound applications, on the other hand, increasing occupancy
-- can help better hide the latency of memory accesses, and therefore
-- improve performance.
module Foreign.CUDA.Analysis.Occupancy
data Occupancy
Occupancy :: !Int -> !Int -> !Int -> !Double -> Occupancy

-- | Active threads per multiprocessor
[activeThreads] :: Occupancy -> !Int

-- | Active thread blocks per multiprocessor
[activeThreadBlocks] :: Occupancy -> !Int

-- | Active warps per multiprocessor
[activeWarps] :: Occupancy -> !Int

-- | Occupancy of each multiprocessor (percent)
[occupancy100] :: Occupancy -> !Double

-- | Calculate occupancy data for a given GPU and kernel resource usage
occupancy :: DeviceProperties -> Int -> Int -> Int -> Occupancy

-- | Optimise multiprocessor occupancy as a function of thread block size
-- and resource usage. This returns the smallest satisfying block size in
-- increments of a single warp.
optimalBlockSize :: DeviceProperties -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)

-- | As optimalBlockSize, but with a generator that produces the
-- specific thread block sizes that should be tested. The generated list
-- can produce values in any order, but the last satisfying block size
-- will be returned. Hence, values should be monotonically decreasing to
-- return the smallest block size yielding maximum occupancy, and
-- vice-versa.
optimalBlockSizeOf :: DeviceProperties -> [Int] -> (Int -> Int) -> (Int -> Int) -> (Int, Occupancy)

-- | Determine the maximum number of CTAs that can be run simultaneously
-- for a given kernel / device combination.
maxResidentBlocks :: DeviceProperties -> Int -> Int -> Int -> Int

-- | Increments in powers-of-two, over the range of supported thread block
-- sizes for the given device.
incPow2 :: DeviceProperties -> [Int]

-- | Increments in the warp size of the device, over the range of supported
-- thread block sizes.
incWarp :: DeviceProperties -> [Int] -- | Decrements in powers-of-two, over the range of supported thread block -- sizes for the given device. decPow2 :: DeviceProperties -> [Int] -- | Decrements in the warp size of the device, over the range of supported -- thread block sizes. decWarp :: DeviceProperties -> [Int] instance GHC.Show.Show Foreign.CUDA.Analysis.Occupancy.Occupancy instance GHC.Classes.Ord Foreign.CUDA.Analysis.Occupancy.Occupancy instance GHC.Classes.Eq Foreign.CUDA.Analysis.Occupancy.Occupancy -- | Meta-module exporting CUDA analysis routines module Foreign.CUDA.Analysis
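
-- As an illustration of the occupancy calculator, the following sketch
-- chooses a thread block size for a kernel that uses 16 registers per
-- thread and no dynamically allocated shared memory. The resource
-- figures are illustrative only, and we assume the two function
-- arguments of optimalBlockSize map a candidate block size to registers
-- per thread and to shared memory per block, respectively:
--
-- import Foreign.CUDA.Analysis
--
-- pickBlockSize :: DeviceProperties -> (Int, Occupancy)
-- pickBlockSize dev = optimalBlockSize dev (const 16) (const 0)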