-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/

-- | FFI bindings to NVVM
--
-- The NVVM library compiles NVVM IR (a subset of LLVM IR) into PTX code
-- which can then be executed on NVIDIA GPUs. In contrast to the standard
-- NVPTX target built into the LLVM toolchain, NVVM includes a set of
-- proprietary optimisations which are otherwise only available by
-- compiling CUDA code with the nvcc compiler.
--
-- The resulting PTX code can be loaded onto the GPU and executed using
-- the cuda package:
--
-- https://hackage.haskell.org/package/cuda
--
-- The NVVM library is a compiler component available as part of the CUDA
-- toolkit:
--
-- https://developer.nvidia.com/cuda-toolkit
--
-- See the travis-ci.org build matrix for tested CUDA library versions.

@package nvvm
@version 0.9.0.0


-- | Error handling
module Foreign.NVVM.Error

-- | NVVM API function return code
data Status
Success :: Status
OutOfMemory :: Status
ProgramCreationFailure :: Status
IRVersionMismatch :: Status
InvalidInput :: Status
InvalidProgram :: Status
InvalidIR :: Status
InvalidOption :: Status
NoModuleInProgram :: Status
CompilationFailure :: Status
describe :: Status -> String

-- | Return the result of a function on successful execution, otherwise
-- throw an exception.
resultIfOk :: (Status, a) -> IO a

-- | Throw an exception on an unsuccessful return code
nothingIfOk :: Status -> IO ()
checkStatus :: CInt -> IO ()

-- | Throw an exception. Exceptions may be thrown from pure code, but can
-- only be caught in the IO monad.
nvvmError :: String -> a

-- | Raise an NVVM exception in the IO monad
nvvmErrorIO :: String -> IO a

-- | A specially formatted error message
requireSDK :: Name -> Double -> a
instance GHC.Show.Show Foreign.NVVM.Error.Status
instance GHC.Classes.Eq Foreign.NVVM.Error.Status
instance GHC.Exception.Type.Exception Foreign.NVVM.Error.NVVMException
instance GHC.Show.Show Foreign.NVVM.Error.NVVMException
instance GHC.Enum.Enum Foreign.NVVM.Error.Status


-- | General information query
module Foreign.NVVM.Info

-- | Get the version of the NVVM library
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__query.html#group__query_1gcdd062f26078d20ded68f1017e999246
nvvmVersion :: Version

-- | Get the version of NVVM IR supported by this library. The first
-- component is the NVVM IR version, and the second the version of the
-- debug metadata.
--
-- Requires: CUDA-7.0
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__query.html#group__query_1g0894677934db095b3c40d4f8e2578cc5
nvvmIRVersion :: (Version, Version)


-- | Program compilation
module Foreign.NVVM.Compile

-- | An NVVM program
data Program

-- | The result of compiling an NVVM program.
data Result
Result :: !ByteString -> !ByteString -> Result

-- | The compiled kernel, which can be loaded into the current program
-- using 'Foreign.CUDA.Driver.loadData*'
[compileResult] :: Result -> !ByteString

-- | Warning messages generated by the compiler/verifier
[compileLog] :: Result -> !ByteString

-- | Program compilation options
data CompileOption

-- | optimisation level, from 0 (disable optimisations) to 3 (default)
OptimisationLevel :: !Int -> CompileOption

-- | target architecture to compile for (default: compute 2.0)
Target :: !Compute -> CompileOption

-- | flush denormal values to zero when performing single-precision
-- floating-point operations (default: no)
FlushToZero :: CompileOption

-- | disable fused-multiply-add instructions (default: enabled)
NoFMA :: CompileOption

-- | use a fast approximation for single-precision floating-point square
-- root (default: no)
FastSqrt :: CompileOption

-- | use a fast approximation for single-precision floating-point division
-- and reciprocal (default: no)
FastDiv :: CompileOption

-- | generate debugging information (-g) (default: no)
GenerateDebugInfo :: CompileOption

-- | Compile an NVVM IR module, in either bitcode or textual
-- representation, into PTX code.
compileModule :: ShortByteString -> ByteString -> [CompileOption] -> IO Result

-- | Compile a collection of NVVM IR modules into PTX code
compileModules :: [(ShortByteString, ByteString)] -> [CompileOption] -> IO Result

-- | Create an empty Program
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g46a0ab04a063cba28bfbb41a1939e3f4
create :: IO Program

-- | Destroy a Program
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1gfba94cab1224c0152841b80690d366aa
destroy :: Program -> IO ()

-- | Add a module-level NVVM IR to a program
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g0c22d2b9be033c165bc37b16f3ed75c6
addModule :: Program -> ShortByteString -> ByteString -> IO ()

-- | As with addModule, but read the specified number of bytes from
-- the given pointer.
addModuleFromPtr :: Program -> ShortByteString -> Int -> Ptr Word8 -> IO ()

-- | Add a module-level NVVM IR to a program.
--
-- The module is loaded lazily: only symbols required by modules loaded
-- using addModule or addModuleFromPtr will be loaded.
--
-- Requires: CUDA-10.0
--
-- https://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g5356ce5063db232cd4330b666c62219b
addModuleLazy :: Program -> ShortByteString -> ByteString -> IO ()

-- | As with addModuleLazy, but read the specified number of bytes
-- from the given pointer (the symbols are loaded lazily, but the data in
-- the buffer is read immediately).
--
-- Requires: CUDA-10.0
addModuleLazyFromPtr :: Program -> ShortByteString -> Int -> Ptr Word8 -> IO ()

-- | Compile the NVVM program. Returns the log from the compiler/verifier
-- and, if successful, the compiled program.
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g76ac1e23f5d0e2240e78be0e63450346
compile :: Program -> [CompileOption] -> IO (ByteString, Maybe ByteString)

-- | Verify the NVVM program. Returns whether compilation will succeed,
-- together with any error or warning messages.
verify :: Program -> [CompileOption] -> IO (Status, ByteString)
instance GHC.Show.Show Foreign.NVVM.Compile.CompileOption
instance GHC.Classes.Eq Foreign.NVVM.Compile.CompileOption
instance GHC.Show.Show Foreign.NVVM.Compile.Program
instance GHC.Classes.Eq Foreign.NVVM.Compile.Program


-- | This module defines an interface to the libNVVM library
-- provided by NVIDIA as part of the CUDA toolkit. It compiles NVVM IR, a
-- compiler intermediate representation based on LLVM IR, into PTX code
-- suitable for execution on NVIDIA GPUs. NVVM IR is a subset of LLVM IR,
-- with a set of rules, restrictions, conventions, and intrinsic
-- functions.
--
-- NVIDIA's own nvcc compiler uses NVVM IR and libNVVM
-- internally as part of the CUDA C compilation process. In contrast to
-- the (open-source) NVPTX target included with the standard LLVM
-- toolchain (which also compiles NVVM IR into PTX code), libNVVM
-- includes a set of proprietary optimisation passes, which may
-- result in faster GPU code. More information on NVVM IR can be found
-- here:
--
-- http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html
--
-- The following is a short tutorial on using this library. The steps can
-- be copied into a file, or run directly in ghci, in which case
-- ghci should be launched with the option
-- -fno-ghci-sandbox. This is because CUDA maintains CPU-local
-- state, so operations must be run from a bound thread.
--
-- Note that the focus of this library is the generation of executable
-- PTX code from NVVM IR, so we will additionally need to use the
-- cuda package to control and execute the compiled program. In
-- this tutorial we will go over those steps quickly, but see the
-- cuda package for more information and a similar tutorial:
--
-- https://hackage.haskell.org/package/cuda
--
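-- Before we begin, it can be worth checking which version of the NVVM
-- library (and of NVVM IR) is installed. As an optional sanity check,
-- using the query functions from Foreign.NVVM.Info listed above (note
-- that nvvmIRVersion requires CUDA-7.0 or later):
--
-- >>> import Foreign.NVVM.Info
-- >>> nvvmVersion
-- >>> nvvmIRVersion
--
-- To begin, import the CUDA driver API bindings and initialise the CUDA
-- environment:
--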
-- >>> import Foreign.CUDA.Driver as CUDA
-- >>> CUDA.initialise []
--
-- Select a GPU and create an execution context for that device. Each
-- available device is given a unique numeric identifier (beginning at
-- zero). For this example we just select the first device (the default).
--
-- >>> dev0 <- CUDA.device 0
-- >>> prp0 <- CUDA.props dev0
-- >>> ctx0 <- CUDA.create dev0 []
--
-- Remember that once the context is no longer needed, it should be
-- destroyed in order to free up any resources that were allocated into
-- it (we do this at the very end of this tutorial).
--
-- As a simple example we will element-wise add two vectors, which in
-- Haskell could be written as:
--
-- >>> vecAdd xs ys = zipWith (+) xs ys
--
-- The following NVVM IR implements this for the GPU. Note that this
-- example is written using NVVM IR version 1.2 syntax (corresponding to
-- CUDA toolkit 7.5), which is based on LLVM IR version 3.4. The
-- human-readable representation of LLVM IR (and by extension NVVM IR) is
-- notorious for changing between releases, whereas the bitcode
-- representation is somewhat more stable. You may wish to keep this in
-- mind for your own programs.
--
-- target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-- target triple = "nvptx64-nvidia-cuda"
--
-- define void @vecAdd(float* %A, float* %B, float* %C) {
-- entry:
-- ; What is my ID?
-- %id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
--
-- ; Compute pointers into A, B, and C
-- %ptrA = getelementptr float* %A, i32 %id
-- %ptrB = getelementptr float* %B, i32 %id
-- %ptrC = getelementptr float* %C, i32 %id
--
-- ; Read A, B
-- %valA = load float* %ptrA, align 4
-- %valB = load float* %ptrB, align 4
--
-- ; Compute C = A + B
-- %valC = fadd float %valA, %valB
--
-- ; Store back to C
-- store float %valC, float* %ptrC, align 4
--
-- ret void
-- }
--
-- ; Intrinsic to read threadIdx.x
-- declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
--
-- !nvvm.annotations = !{!0}
-- !0 = metadata !{void (float*, float*, float*)* @vecAdd, metadata !"kernel", i64 1}
--
--
-- For reference, in CUDA this kernel would have been written as:
--
--
-- extern "C" __global__ void vecAdd(float *xs, float* ys, float *zs)
-- {
-- int ix = threadIdx.x;
--
-- zs[ix] = xs[ix] + ys[ix];
-- }
--
--
-- The NVVM IR can be stored directly in the program as a
-- ByteString, but here I will assume that it is saved to a
-- file vector_add.ll:
--
-- >>> import Data.ByteString as B
-- >>> ll <- B.readFile "vector_add.ll"
--
-- Now we can use NVVM to compile this into PTX code:
--
-- >>> import Foreign.NVVM as NVVM
-- >>> ptx <- NVVM.compileModule "vecAdd" ll [ NVVM.Target (CUDA.computeCapability prp0) ]
--
-- Notice that we asked NVVM to specialise the generated PTX code for our
-- current device. By default the code will be compiled for compute
-- capability 2.0 (the earliest supported target).
--
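-- The returned Result also carries any messages produced by the compiler
-- and verifier, and it can be worth inspecting these even when
-- compilation succeeds. For example, using the strict ByteString
-- functions imported above:
--
-- >>> B.putStr (NVVM.compileLog ptx)
--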
-- The generated PTX code can then be loaded into the current CUDA
-- execution context, from which we can extract a reference to the GPU
-- kernel that we will later execute.
--
-- >>> mdl <- CUDA.loadData (NVVM.compileResult ptx)
-- >>> vecAdd <- CUDA.getFun mdl "vecAdd"
--
-- After we are finished with the module, it is a good idea to unload it
-- in order to free any resources it used (again, we do this at the very
-- end of this tutorial).
--
-- Next we generate some input data on the host, copy it to the GPU, and
-- allocate space on the GPU for the result:
--
-- >>> let xs = [1..256] :: [Float]
-- >>> let ys = [2,4..512] :: [Float]
-- >>> xs_dev <- CUDA.newListArray xs
-- >>> ys_dev <- CUDA.newListArray ys
-- >>> zs_dev <- CUDA.mallocArray 256 :: IO (CUDA.DevicePtr Float)
--
-- For this simple kernel we execute it using a single (one-dimensional)
-- thread block, with one thread computing each element of the output.
--
-- >>> CUDA.launchKernel vecAdd (1,1,1) (256,1,1) 0 Nothing [CUDA.VArg xs_dev, CUDA.VArg ys_dev, CUDA.VArg zs_dev]
--
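-- Kernel launches are asynchronous with respect to the host. The copy in
-- the next step is synchronous and will not read the results until the
-- kernel has completed, but if you want to block the host explicitly
-- (for example, when timing the kernel), the cuda package also provides
-- a context synchronisation operation:
--
-- >>> CUDA.sync
--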
-- Finally, we can copy the results back to the host, and deallocate the
-- arrays from the GPU.
--
-- >>> zs <- CUDA.peekListArray 256 zs_dev
-- >>> CUDA.free xs_dev
-- >>> CUDA.free ys_dev
-- >>> CUDA.free zs_dev
--
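-- To finish up, we perform the cleanup promised earlier: unload the
-- module and destroy the execution context (mdl and ctx0 are the module
-- and context created above; unload and destroy come from the cuda
-- package).
--
-- >>> CUDA.unload mdl
-- >>> CUDA.destroy ctx0
--
-- As a closing aside, this tutorial used the one-shot compileModule
-- function, but the same result can be obtained with the lower-level
-- Program interface listed in Foreign.NVVM.Compile above. The following
-- is a minimal sketch only: the name compileLowLevel is purely
-- illustrative, and exception safety (bracketing create/destroy) is
-- omitted for brevity.
--
-- import Data.ByteString       ( ByteString )
-- import Data.ByteString.Short ( ShortByteString )
--
-- compileLowLevel :: ShortByteString -> ByteString -> IO (Maybe ByteString)
-- compileLowLevel name ir = do
--   prg <- NVVM.create                        -- a new, empty program
--   NVVM.addModule prg name ir                -- add an NVVM IR module to it
--   (messages, result) <- NVVM.compile prg [] -- (log, Just ptx on success)
--   NVVM.destroy prg
--   B.putStr messages                         -- print any warnings or errors
--   return result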