Safe Haskell | Safe-Inferred |
---|---|
Language | GHC2021 |
Synopsis
- data KernelConstants = KernelConstants {
- kernelGlobalThreadId :: TExp Int32
- kernelLocalThreadId :: TExp Int32
- kernelGroupId :: TExp Int32
- kernelGlobalThreadIdVar :: VName
- kernelLocalThreadIdVar :: VName
- kernelGroupIdVar :: VName
- kernelNumGroupsCount :: Count NumGroups SubExp
- kernelGroupSizeCount :: Count GroupSize SubExp
- kernelNumGroups :: TExp Int64
- kernelGroupSize :: TExp Int64
- kernelNumThreads :: TExp Int32
- kernelWaveSize :: TExp Int32
- kernelLocalIdMap :: Map [SubExp] [TExp Int32]
- kernelChunkItersMap :: Map [SubExp] (TExp Int32)
- threadOperations :: Operations GPUMem KernelEnv KernelOp
- keyWithEntryPoint :: Maybe Name -> Name -> Name
- type CallKernelGen = ImpM GPUMem HostEnv HostOp
- type InKernelGen = ImpM GPUMem KernelEnv KernelOp
- data Locks = Locks {
- locksArray :: VName
- locksCount :: Int
- data HostEnv = HostEnv {}
- data Target
- data KernelEnv = KernelEnv {}
- groupReduce :: TExp Int32 -> Lambda GPUMem -> [VName] -> InKernelGen ()
- groupScan :: Maybe (TExp Int32 -> TExp Int32 -> TExp Bool) -> TExp Int64 -> TExp Int64 -> Lambda GPUMem -> [VName] -> InKernelGen ()
- groupLoop :: IntExp t => TExp t -> (TExp t -> InKernelGen ()) -> InKernelGen ()
- isActive :: [(VName, SubExp)] -> TExp Bool
- sKernel :: Operations GPUMem KernelEnv KernelOp -> (KernelConstants -> TExp Int32) -> String -> VName -> KernelAttrs -> InKernelGen () -> CallKernelGen ()
- sKernelThread :: String -> VName -> KernelAttrs -> InKernelGen () -> CallKernelGen ()
- data KernelAttrs = KernelAttrs {}
- defKernelAttrs :: Count NumGroups SubExp -> Count GroupSize SubExp -> KernelAttrs
- lvlKernelAttrs :: SegLevel -> CallKernelGen KernelAttrs
- allocLocal :: AllocCompiler GPUMem r KernelOp
- kernelAlloc :: Pat LetDecMem -> SubExp -> Space -> InKernelGen ()
- compileThreadResult :: SegSpace -> PatElem LetDecMem -> KernelResult -> InKernelGen ()
- virtualiseGroups :: SegVirt -> TExp Int32 -> (TExp Int32 -> InKernelGen ()) -> InKernelGen ()
- kernelLoop :: IntExp t => TExp t -> TExp t -> TExp t -> (TExp t -> InKernelGen ()) -> InKernelGen ()
- groupCoverSpace :: IntExp t => [TExp t] -> ([TExp t] -> InKernelGen ()) -> InKernelGen ()
- fenceForArrays :: [VName] -> InKernelGen Fence
- updateAcc :: VName -> [SubExp] -> [SubExp] -> InKernelGen ()
- genZeroes :: String -> Int -> CallKernelGen VName
- sReplicate :: VName -> SubExp -> CallKernelGen ()
- sIota :: VName -> TExp Int64 -> Exp -> Exp -> IntType -> CallKernelGen ()
- type AtomicBinOp = BinOp -> Maybe (VName -> VName -> Count Elements (TExp Int64) -> Exp -> AtomicOp)
- atomicUpdateLocking :: AtomicBinOp -> Lambda GPUMem -> AtomicUpdate GPUMem KernelEnv
- data Locking = Locking {
- lockingArray :: VName
- lockingIsUnlocked :: TExp Int32
- lockingToLock :: TExp Int32
- lockingToUnlock :: TExp Int32
- lockingMapping :: [TExp Int64] -> [TExp Int64]
- data AtomicUpdate rep r
- = AtomicPrim (DoAtomicUpdate rep r)
- | AtomicCAS (DoAtomicUpdate rep r)
- | AtomicLocking (Locking -> DoAtomicUpdate rep r)
- type DoAtomicUpdate rep r = Space -> [VName] -> [TExp Int64] -> ImpM rep r KernelOp ()
Documentation
data KernelConstants Source #
KernelConstants | |
|
Information about the locks available for accumulators.
Locks | |
|
HostEnv | |
|
Which target are we ultimately generating code for? While most of the kernel code is the same, there are some cases where we generate special code based on the ultimate low-level API we are targeting.
groupReduce :: TExp Int32 -> Lambda GPUMem -> [VName] -> InKernelGen () Source #
groupScan :: Maybe (TExp Int32 -> TExp Int32 -> TExp Bool) -> TExp Int64 -> TExp Int64 -> Lambda GPUMem -> [VName] -> InKernelGen () Source #
groupLoop :: IntExp t => TExp t -> (TExp t -> InKernelGen ()) -> InKernelGen () Source #
Assign iterations of a for-loop to threads in the workgroup. The
passed-in function is invoked with the (symbolic) iteration. For
multidimensional loops, use groupCoverSpace.
sKernel :: Operations GPUMem KernelEnv KernelOp -> (KernelConstants -> TExp Int32) -> String -> VName -> KernelAttrs -> InKernelGen () -> CallKernelGen () Source #
sKernelThread :: String -> VName -> KernelAttrs -> InKernelGen () -> CallKernelGen () Source #
data KernelAttrs Source #
Various extra configuration of the kernel being generated.
KernelAttrs | |
|
defKernelAttrs :: Count NumGroups SubExp -> Count GroupSize SubExp -> KernelAttrs Source #
The default kernel attributes.
lvlKernelAttrs :: SegLevel -> CallKernelGen KernelAttrs Source #
Compute kernel attributes from SegLevel, including synthesising
group-size and thread count if no grid is provided.
kernelAlloc :: Pat LetDecMem -> SubExp -> Space -> InKernelGen () Source #
compileThreadResult :: SegSpace -> PatElem LetDecMem -> KernelResult -> InKernelGen () Source #
virtualiseGroups :: SegVirt -> TExp Int32 -> (TExp Int32 -> InKernelGen ()) -> InKernelGen () Source #
For many kernels, we may not have enough physical groups to cover the logical iteration space. Some groups thus have to perform double duty; we put an outer loop to accomplish this. The advantage over just launching a bazillion threads is that the cost of memory expansion should be proportional to the number of *physical* threads (hardware parallelism), not the amount of application parallelism.
kernelLoop :: IntExp t => TExp t -> TExp t -> TExp t -> (TExp t -> InKernelGen ()) -> InKernelGen () Source #
Assign iterations of a for-loop to all threads in the kernel.
The passed-in function is invoked with the (symbolic) iteration.
The body must contain thread-level code. For multidimensional
loops, use groupCoverSpace.
groupCoverSpace :: IntExp t => [TExp t] -> ([TExp t] -> InKernelGen ()) -> InKernelGen () Source #
Iterate collectively through a multidimensional space, such that all threads in the group participate. The passed-in function is invoked with a (symbolic) point in the index space.
fenceForArrays :: [VName] -> InKernelGen Fence Source #
If we are touching these arrays, which kind of fence do we need?
genZeroes :: String -> Int -> CallKernelGen VName Source #
Generate a constant device array of 32-bit integer zeroes with the given number of elements. Initialised with a replicate.
Host-level bulk operations
sReplicate :: VName -> SubExp -> CallKernelGen () Source #
Perform a Replicate with a kernel.
sIota :: VName -> TExp Int64 -> Exp -> Exp -> IntType -> CallKernelGen () Source #
Perform an Iota with a kernel.
Atomics
type AtomicBinOp = BinOp -> Maybe (VName -> VName -> Count Elements (TExp Int64) -> Exp -> AtomicOp) Source #
atomicUpdateLocking :: AtomicBinOp -> Lambda GPUMem -> AtomicUpdate GPUMem KernelEnv Source #
Do an atomic update corresponding to a binary operator lambda.
Locking strategy used for an atomic update.
Locking | |
|
data AtomicUpdate rep r Source #
The mechanism that will be used for performing the atomic update. Approximates how efficient it will be. Ordered from most to least efficient.
AtomicPrim (DoAtomicUpdate rep r) | Supported directly by primitive. |
AtomicCAS (DoAtomicUpdate rep r) | Can be done by efficient swaps. |
AtomicLocking (Locking -> DoAtomicUpdate rep r) | Requires explicit locking. |