-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/

-- | FFI bindings to NVVM
--
-- The NVVM library compiles NVVM IR (a subset of LLVM IR) into PTX code
-- which can then be executed on NVIDIA GPUs. In contrast to the standard
-- NVPTX target built into the LLVM toolchain, NVVM includes a set of
-- proprietary optimisations which are otherwise only available by
-- compiling CUDA code with the nvcc compiler.
--
-- The resulting PTX code can be loaded onto the GPU and executed using
-- the cuda package:
--
-- https://hackage.haskell.org/package/cuda
--
-- The NVVM library is a compiler component available as part of the CUDA
-- toolkit:
--
-- https://developer.nvidia.com/cuda-toolkit
--
-- See the travis-ci.org build matrix for tested CUDA library versions.

@package nvvm
@version 0.9.0.0


-- | Error handling
module Foreign.NVVM.Error

-- | NVVM API function return code
data Status
Success :: Status
OutOfMemory :: Status
ProgramCreationFailure :: Status
IRVersionMismatch :: Status
InvalidInput :: Status
InvalidProgram :: Status
InvalidIR :: Status
InvalidOption :: Status
NoModuleInProgram :: Status
CompilationFailure :: Status
describe :: Status -> String

-- | Return the result of a function on successful execution, otherwise
-- throw an exception.
resultIfOk :: (Status, a) -> IO a

-- | Throw an exception on an unsuccessful return code
nothingIfOk :: Status -> IO ()
checkStatus :: CInt -> IO ()

-- | Throw an exception. Exceptions may be thrown from pure code, but can
-- only be caught in the IO monad.
nvvmError :: String -> a

-- | Raise an NVVM exception in the IO monad
nvvmErrorIO :: String -> IO a

-- | A specially formatted error message
requireSDK :: Name -> Double -> a
instance GHC.Show.Show Foreign.NVVM.Error.Status
instance GHC.Classes.Eq Foreign.NVVM.Error.Status
instance GHC.Exception.Type.Exception Foreign.NVVM.Error.NVVMException
instance GHC.Show.Show Foreign.NVVM.Error.NVVMException
instance GHC.Enum.Enum Foreign.NVVM.Error.Status


-- | General information query
module Foreign.NVVM.Info

-- | Get the version of the NVVM library
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__query.html#group__query_1gcdd062f26078d20ded68f1017e999246
nvvmVersion :: Version

-- | Get the version of NVVM IR supported by this library. The first
-- component is the NVVM IR version, and the second the version of the
-- debug metadata.
--
-- Requires: CUDA-7.0
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__query.html#group__query_1g0894677934db095b3c40d4f8e2578cc5
nvvmIRVersion :: (Version, Version)


-- | Program compilation
module Foreign.NVVM.Compile

-- | An NVVM program
data Program

-- | The result of compiling an NVVM program.
data Result
Result :: !ByteString -> !ByteString -> Result

-- | The compiled kernel, which can be loaded into the current program
-- using 'Foreign.CUDA.Driver.loadData*'
[compileResult] :: Result -> !ByteString

-- | Warning messages generated by the compiler/verifier
[compileLog] :: Result -> !ByteString

-- | Program compilation options
data CompileOption

-- | optimisation level, from 0 (disable optimisations) to 3 (default)
OptimisationLevel :: !Int -> CompileOption

-- | target architecture to compile for (default: compute 2.0)
Target :: !Compute -> CompileOption

-- | flush denormal values to zero when performing single-precision
-- floating-point operations (default: no)
FlushToZero :: CompileOption

-- | disable fused-multiply-add instructions (default: enabled)
NoFMA :: CompileOption

-- | use a fast approximation for single-precision floating-point square
-- root (default: no)
FastSqrt :: CompileOption

-- | use a fast approximation for single-precision floating-point division
-- and reciprocal (default: no)
FastDiv :: CompileOption

-- | generate debugging information (-g) (default: no)
GenerateDebugInfo :: CompileOption

-- | Compile an NVVM IR module, in either bitcode or textual
-- representation, into PTX code.
compileModule :: ShortByteString -> ByteString -> [CompileOption] -> IO Result

-- | Compile a collection of NVVM IR modules into PTX code
compileModules :: [(ShortByteString, ByteString)] -> [CompileOption] -> IO Result

-- | Create an empty Program
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g46a0ab04a063cba28bfbb41a1939e3f4
create :: IO Program

-- | Destroy a Program
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1gfba94cab1224c0152841b80690d366aa
destroy :: Program -> IO ()

-- | Add a module-level NVVM IR to a program
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g0c22d2b9be033c165bc37b16f3ed75c6
addModule :: Program -> ShortByteString -> ByteString -> IO ()

-- | As with addModule, but read the specified number of bytes from
-- the given pointer.
addModuleFromPtr :: Program -> ShortByteString -> Int -> Ptr Word8 -> IO ()

-- | Add a module-level NVVM IR to a program.
--
-- The module is loaded lazily: only symbols required by modules loaded
-- using addModule or addModuleFromPtr will be loaded.
--
-- Requires: CUDA-10.0
--
-- https://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g5356ce5063db232cd4330b666c62219b
addModuleLazy :: Program -> ShortByteString -> ByteString -> IO ()

-- | As with addModuleLazy, but read the specified number of bytes
-- from the given pointer (the symbols are loaded lazily, but the data in
-- the buffer is read immediately).
--
-- Requires: CUDA-10.0
addModuleLazyFromPtr :: Program -> ShortByteString -> Int -> Ptr Word8 -> IO ()

-- | Compile the NVVM program. Returns the log from the compiler/verifier
-- and, if successful, the compiled program.
--
-- http://docs.nvidia.com/cuda/libnvvm-api/group__compilation.html#group__compilation_1g76ac1e23f5d0e2240e78be0e63450346
compile :: Program -> [CompileOption] -> IO (ByteString, Maybe ByteString)

-- | Verify the NVVM program. Returns whether compilation will succeed,
-- together with any error or warning messages.
verify :: Program -> [CompileOption] -> IO (Status, ByteString)
instance GHC.Show.Show Foreign.NVVM.Compile.CompileOption
instance GHC.Classes.Eq Foreign.NVVM.Compile.CompileOption
instance GHC.Show.Show Foreign.NVVM.Compile.Program
instance GHC.Classes.Eq Foreign.NVVM.Compile.Program


-- | This module defines an interface to the libNVVM library
-- provided by NVIDIA as part of the CUDA toolkit. It compiles NVVM IR, a
-- compiler intermediate representation based on LLVM IR, into PTX code
-- suitable for execution on NVIDIA GPUs. NVVM IR is a subset of LLVM IR,
-- with a set of rules, restrictions, conventions, and intrinsic
-- functions.
--
-- NVIDIA's own nvcc compiler uses NVVM IR and libNVVM
-- internally as part of the CUDA C compilation process. In contrast to
-- the (open-source) NVPTX target included with the standard LLVM
-- toolchain (which also compiles NVVM IR into PTX code), libNVVM
-- includes a set of proprietary optimisation passes, which may
-- result in faster GPU code. More information on NVVM IR can be found
-- here:
--
-- http://docs.nvidia.com/cuda/nvvm-ir-spec/index.html
--
-- The following is a short tutorial on using this library. The steps can
-- be copied into a file, or run directly in ghci, in which case
-- ghci should be launched with the option
-- -fno-ghci-sandbox. This is because CUDA maintains CPU-local
-- state, so operations must be run from a bound thread.
--
-- Note that the focus of this library is the generation of executable
-- PTX code from NVVM IR, so we will additionally need to use the
-- cuda package to control and execute the compiled program. In
-- this tutorial we will go over those steps quickly, but see the
-- cuda package for more information and a similar tutorial:
--
-- https://hackage.haskell.org/package/cuda
--
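-- Before we begin, it can be worth checking which version of the NVVM
-- library (and of NVVM IR) is installed. As an optional sanity check,
-- using the query functions from Foreign.NVVM.Info listed above (note
-- that nvvmIRVersion requires CUDA-7.0 or later):
--
-- >>> import Foreign.NVVM.Info
-- >>> nvvmVersion
-- >>> nvvmIRVersion
--
-- To begin, import the CUDA driver API bindings and initialise the CUDA
-- environment:
--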
-- >>> import Foreign.CUDA.Driver as CUDA
-- >>> CUDA.initialise []
--
-- Select a GPU and create an execution context for that device. Each
-- available device is given a unique numeric identifier (beginning at
-- zero). For this example we just select the first device (the default).
--
-- >>> dev0 <- CUDA.device 0
-- >>> prp0 <- CUDA.props dev0
-- >>> ctx0 <- CUDA.create dev0 []
--
-- Remember that once the context is no longer needed, it should be
-- destroyed in order to free up any resources that were allocated into
-- it (we do this at the very end of this tutorial).
--
-- As a simple example we will element-wise add two vectors, which in
-- Haskell could be written as:
--
-- >>> vecAdd xs ys = zipWith (+) xs ys
--
-- The following NVVM IR implements this for the GPU. Note that this
-- example is written using NVVM IR version 1.2 syntax (corresponding to
-- CUDA toolkit 7.5), which is based on LLVM IR version 3.4. The
-- human-readable representation of LLVM IR (and by extension NVVM IR) is
-- notorious for changing between releases, whereas the bitcode
-- representation is somewhat more stable. You may wish to keep this in
-- mind for your own programs.
--
-- target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-- target triple = "nvptx64-nvidia-cuda"
--
-- define void @vecAdd(float* %A, float* %B, float* %C) {
-- entry:
-- ; What is my ID?
-- %id = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
--
-- ; Compute pointers into A, B, and C
-- %ptrA = getelementptr float* %A, i32 %id
-- %ptrB = getelementptr float* %B, i32 %id
-- %ptrC = getelementptr float* %C, i32 %id
--
-- ; Read A, B
-- %valA = load float* %ptrA, align 4
-- %valB = load float* %ptrB, align 4
--
-- ; Compute C = A + B
-- %valC = fadd float %valA, %valB
--
-- ; Store back to C
-- store float %valC, float* %ptrC, align 4
--
-- ret void
-- }
--
-- ; Intrinsic to read threadIdx.x
-- declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() readnone nounwind
--
-- !nvvm.annotations = !{!0}
-- !0 = metadata !{void (float*, float*, float*)* @vecAdd, metadata !"kernel", i64 1}
--
--
-- For reference, in CUDA this kernel would have been written as:
--
--
-- extern "C" __global__ void vecAdd(float *xs, float* ys, float *zs)
-- {
-- int ix = threadIdx.x;
--
-- zs[ix] = xs[ix] + ys[ix];
-- }
--
--
-- The NVVM IR can be stored directly in the program as a
-- ByteString, but here I will assume that it is saved to a
-- file vector_add.ll:
--
-- >>> import Data.ByteString as B
-- >>> ll <- B.readFile "vector_add.ll"
--
-- Now we can use NVVM to compile this into PTX code:
--
-- >>> import Foreign.NVVM as NVVM
-- >>> ptx <- NVVM.compileModule "vecAdd" ll [ NVVM.Target (CUDA.computeCapability prp0) ]
--
-- Notice that we asked NVVM to specialise the generated PTX code for our
-- current device. By default the code will be compiled for compute
-- capability 2.0 (the earliest supported target).
--
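-- The returned Result also carries any messages produced by the compiler
-- and verifier, and it can be worth inspecting these even when
-- compilation succeeds. For example, using the strict ByteString
-- functions imported above:
--
-- >>> B.putStr (NVVM.compileLog ptx)
--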
-- The generated PTX code can then be loaded into the current CUDA
-- execution context, from which we can extract a reference to the GPU
-- kernel that we will later execute.
--
-- >>> mdl <- CUDA.loadData (NVVM.compileResult ptx)
-- >>> vecAdd <- CUDA.getFun mdl "vecAdd"
--
-- After we are finished with the module, it is a good idea to unload it
-- in order to free any resources it used (again, we do this at the very
-- end of this tutorial).
--
-- Next we generate some input data on the host, copy it to the GPU, and
-- allocate space on the GPU for the result:
--
-- >>> let xs = [1..256] :: [Float]
-- >>> let ys = [2,4..512] :: [Float]
-- >>> xs_dev <- CUDA.newListArray xs
-- >>> ys_dev <- CUDA.newListArray ys
-- >>> zs_dev <- CUDA.mallocArray 256 :: IO (CUDA.DevicePtr Float)
--
-- For this simple kernel we execute it using a single (one-dimensional)
-- thread block, with one thread computing each element of the output.
--
-- >>> CUDA.launchKernel vecAdd (1,1,1) (256,1,1) 0 Nothing [CUDA.VArg xs_dev, CUDA.VArg ys_dev, CUDA.VArg zs_dev]
--
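-- Kernel launches are asynchronous with respect to the host. The copy in
-- the next step is synchronous and will not read the results until the
-- kernel has completed, but if you want to block the host explicitly
-- (for example, when timing the kernel), the cuda package also provides
-- a context synchronisation operation:
--
-- >>> CUDA.sync
--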
-- Finally, we can copy the results back to the host, and deallocate the
-- arrays from the GPU.
--
-- >>> zs <- CUDA.peekListArray 256 zs_dev
-- >>> CUDA.free xs_dev
-- >>> CUDA.free ys_dev
-- >>> CUDA.free zs_dev
--
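-- To finish up, we perform the cleanup promised earlier: unload the
-- module and destroy the execution context (mdl and ctx0 are the module
-- and context created above; unload and destroy come from the cuda
-- package).
--
-- >>> CUDA.unload mdl
-- >>> CUDA.destroy ctx0
--
-- As a closing aside, this tutorial used the one-shot compileModule
-- function, but the same result can be obtained with the lower-level
-- Program interface listed in Foreign.NVVM.Compile above. The following
-- is a minimal sketch only: the name compileLowLevel is purely
-- illustrative, and exception safety (bracketing create/destroy) is
-- omitted for brevity.
--
-- import Data.ByteString       ( ByteString )
-- import Data.ByteString.Short ( ShortByteString )
--
-- compileLowLevel :: ShortByteString -> ByteString -> IO (Maybe ByteString)
-- compileLowLevel name ir = do
--   prg <- NVVM.create                        -- a new, empty program
--   NVVM.addModule prg name ir                -- add an NVVM IR module to it
--   (messages, result) <- NVVM.compile prg [] -- (log, Just ptx on success)
--   NVVM.destroy prg
--   B.putStr messages                         -- print any warnings or errors
--   return result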