accelerate-llvm-ptx- Accelerate backend for NVIDIA GPUs

Copyright[2016..2017] Trevor L. McDonell
MaintainerTrevor L. McDonell <>
Portabilitynon-portable (GHC extensions)
Safe HaskellNone







data ForeignAcc f where Source #


ForeignAcc :: String -> (Stream -> a -> LLVM PTX b) -> ForeignAcc (a -> b) 


Foreign ForeignAcc Source # 


strForeign :: ForeignAcc args -> String

liftForeign :: ForeignAcc args -> Q (TExp (ForeignAcc args))

data ForeignExp f where Source #


ForeignExp :: String -> IRFun1 PTX () (x -> y) -> ForeignExp (x -> y) 


Foreign ForeignExp Source # 


strForeign :: ForeignExp args -> String

liftForeign :: ForeignExp args -> Q (TExp (ForeignExp args))

data LLVM target a :: * -> * -> * #


MonadState target (LLVM target) 


get :: LLVM target target #

put :: target -> LLVM target () #

state :: (target -> (a, target)) -> LLVM target a #

Execute arch => ExecuteAfun arch (LLVM arch b) 

Associated Types

type ExecAfunR arch (LLVM arch b) :: *


executeOpenAfun :: ExecOpenAfun arch aenv (ExecAfunR arch (LLVM arch b)) -> LLVM arch (AvalR arch aenv) -> LLVM arch b

Monad (LLVM target) 


(>>=) :: LLVM target a -> (a -> LLVM target b) -> LLVM target b #

(>>) :: LLVM target a -> LLVM target b -> LLVM target b #

return :: a -> LLVM target a #

fail :: String -> LLVM target a #

Functor (LLVM target) 


fmap :: (a -> b) -> LLVM target a -> LLVM target b #

(<$) :: a -> LLVM target b -> LLVM target a #

Applicative (LLVM target) 


pure :: a -> LLVM target a #

(<*>) :: LLVM target (a -> b) -> LLVM target a -> LLVM target b #

liftA2 :: (a -> b -> c) -> LLVM target a -> LLVM target b -> LLVM target c #

(*>) :: LLVM target a -> LLVM target b -> LLVM target b #

(<*) :: LLVM target a -> LLVM target b -> LLVM target a #

MonadIO (LLVM target) 


liftIO :: IO a -> LLVM target a #

MonadThrow (LLVM target) 


throwM :: Exception e => e -> LLVM target a #

MonadCatch (LLVM target) 


catch :: Exception e => LLVM target a -> (e -> LLVM target a) -> LLVM target a #

MonadMask (LLVM target) 


mask :: ((forall a. LLVM target a -> LLVM target a) -> LLVM target b) -> LLVM target b #

uninterruptibleMask :: ((forall a. LLVM target a -> LLVM target a) -> LLVM target b) -> LLVM target b #

type ExecAfunR arch (LLVM arch b) 
type ExecAfunR arch (LLVM arch b) = b
type RemotePtr (LLVM PTX) 
type RemotePtr (LLVM PTX) = DevicePtr

data PTX Source #

The PTX execution target for NVIDIA GPUs.

The execution target carries state specific for the current execution context. The data here --- device memory and execution streams --- are implicitly tied to this CUDA execution context.

Don't store anything here that is independent of the context, for example state related to [persistent] kernel caching should _not_ go here.





Target PTX Source # 
type StreamR PTX 
type StreamR PTX = Stream
type EventR PTX 
type EventR PTX = Event
data KernelMetadata PTX 
data KernelMetadata PTX = KM_PTX LaunchConfig
type ArgR PTX 
type ArgR PTX = FunParam
data ObjectR PTX 
data ObjectR PTX = ObjectR {}
data ExecutableR PTX 
data ExecutableR PTX = PTXR {}
type RemotePtr (LLVM PTX) 
type RemotePtr (LLVM PTX) = DevicePtr

data Context Source #

An execution context, which is tied to a specific device and CUDA execution context.




liftIO :: MonadIO m => forall a. IO a -> m a #

Lift a computation from the IO monad.

withDevicePtr :: (ArrayElt e, ArrayPtrs e ~ Ptr a, Typeable e, Typeable a, Storable a) => ArrayData e -> (DevicePtr a -> LLVM PTX (Maybe Event, r)) -> LLVM PTX r Source #

Lookup the device memory associated with a given host array and do something with it.

copyToHostLazy :: Arrays arrs => arrs -> LLVM PTX arrs Source #

Copy an array from the remote device to the host. Although the Accelerate program is hyper-strict and will evaluate the computation as soon as any part of it is demanded, the individual array payloads are copied back to the host _only_ as they are demanded by the Haskell program. This has several consequences:

  1. If the device has multiple memcpy engines, only one will be used. The transfers are however associated with a non-default stream.
  2. Using seq to force an Array to head-normal form will initiate the computation, but not transfer the results back to the host. Requesting an array element or using deepseq to force to normal form is required to actually transfer the data.

cloneArrayAsync :: (Shape sh, Elt e) => Stream -> Array sh e -> LLVM PTX (Array sh e) Source #

Clone an array into a newly allocated array on the device.

type Async a = AsyncR PTX a Source #

type Stream = Lifetime Stream Source #

A Stream represents an independent sequence of computations executed on the GPU. Operations in different streams may be executed concurrently with each other, but operations in the same stream can never overlap. Events can be used for efficient cross-stream synchronisation.

type Event = Lifetime Event Source #

Events can be used for efficient device-side synchronisation between execution streams and between the host.

Orphan instances

Foreign PTX Source # 


foreignAcc :: (Foreign asm, Typeable * a, Typeable * b) => PTX -> asm (a -> b) -> Maybe (StreamR PTX -> a -> LLVM PTX b)

foreignExp :: (Foreign asm, Typeable * x, Typeable * y) => PTX -> asm (x -> y) -> Maybe (IRFun1 PTX () (x -> y))