module Data.Array.Accelerate.CUDA.CodeGen.Reduction (
mkFold, mkFold1, mkFoldSeg, mkFold1Seg,
) where
import Foreign.CUDA.Analysis
import Language.C.Quote.CUDA
import qualified Language.C.Syntax as C
import Data.Array.Accelerate.Analysis.Shape
import Data.Array.Accelerate.Array.Sugar ( Array, Shape, Elt, Z(..), (:.)(..) )
import Data.Array.Accelerate.Error ( internalError )
import Data.Array.Accelerate.Type ( IsIntegral )
import Data.Array.Accelerate.CUDA.AST
import Data.Array.Accelerate.CUDA.CodeGen.Base
import Data.Array.Accelerate.CUDA.CodeGen.Type
mkFold :: forall aenv sh e. (Shape sh, Elt e)
=> DeviceProperties
-> Gamma aenv
-> CUFun2 aenv (e -> e -> e)
-> CUExp aenv e
-> CUDelayedAcc aenv (sh :. Int) e
-> [CUTranslSkel aenv (Array sh e)]
mkFold dev aenv f z a
| expDim (undefined :: Exp aenv sh) > 0 = mkFoldDim dev aenv f (Just z) a
| otherwise = mkFoldAll dev aenv f (Just z) a
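{- Note: 'mkFold' reduces along the innermost dimension, taking an array of
   shape (sh :. Int) to one of shape sh. When the result is rank zero (the
   input was a vector) we need the multi-block 'mkFoldAll' scheme; otherwise
   each inner interval is reduced independently by 'mkFoldDim'. A usage
   sketch in terms of the public frontend combinators (illustrative only):

   > rows :: Acc (Array DIM2 Float) -> Acc (Vector Float)
   > rows = fold (+) 0            -- rank > 0 result: compiles via mkFoldDim

   > total :: Acc (Vector Float) -> Acc (Scalar Float)
   > total = fold (+) 0           -- rank 0 result: compiles via mkFoldAll
-}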
mkFold1 :: forall aenv sh e. (Shape sh, Elt e)
=> DeviceProperties
-> Gamma aenv
-> CUFun2 aenv (e -> e -> e)
-> CUDelayedAcc aenv (sh :. Int) e
-> [ CUTranslSkel aenv (Array sh e) ]
mkFold1 dev aenv f a
| expDim (undefined :: Exp aenv sh) > 0 = mkFoldDim dev aenv f Nothing a
| otherwise = mkFoldAll dev aenv f Nothing a
mkFoldAll
:: forall aenv sh e. (Shape sh, Elt e)
=> DeviceProperties
-> Gamma aenv
-> CUFun2 aenv (e -> e -> e)
-> Maybe (CUExp aenv e)
-> CUDelayedAcc aenv (sh :. Int) e
-> [ CUTranslSkel aenv (Array sh e) ]
mkFoldAll dev aenv f z a
= let (_, rec) = readArray "Rec" (undefined :: Array (sh:.Int) e)
in
[ mkFoldAll' False dev aenv f z a
, mkFoldAll' True dev aenv f z rec ]
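{- A vector is reduced in rounds: the first kernel collapses the delayed
   input to one partial result per thread block, and the second kernel
   (reading the "Rec" array of partials) is then launched repeatedly until a
   single element remains; the seed of an exclusive fold is combined only on
   that final round (gridDim.x == 1, see 'exclusive_finish'). A minimal
   sketch of the host-side driver loop this pair of skeletons implies, with
   hypothetical helpers 'blocksFor', 'launch', 'firstKernel' and 'recKernel'
   that are not part of this module:

   > foldAllDriver :: Int -> IO ()
   > foldAllDriver n = do
   >   b <- blocksFor n
   >   launch firstKernel b n           -- one partial result per block
   >   let go m | m > 1 = do            -- keep reducing the partials ...
   >                b' <- blocksFor m
   >                launch recKernel b' m
   >                go b'
   >            | otherwise = return () -- ... until one element remains
   >   go b
-}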
mkFoldAll'
:: forall aenv sh e. (Shape sh, Elt e)
=> Bool
-> DeviceProperties
-> Gamma aenv
-> CUFun2 aenv (e -> e -> e)
-> Maybe (CUExp aenv e)
-> CUDelayedAcc aenv (sh :. Int) e
-> CUTranslSkel aenv (Array sh e)
mkFoldAll' recursive dev aenv fun@(CUFun2 _ _ combine) mseed (CUDelayed (CUExp shIn) _ (CUFun1 _ get))
= CUTranslSkel foldAll [cunit|
$esc:("#include <accelerate_cuda.h>")
$edecls:texIn
extern "C" __global__ void
$id:foldAll
(
$params:argIn,
$params:argOut,
$params:argRec
)
{
$decls:smem
$decls:declx
$decls:decly
$decls:declz
$items:(sh .=. shIn)
const int shapeSize = $exp:(csize sh);
const int gridSize = $exp:(gridSize dev);
int ix = $exp:(threadIdx dev);
/*
* Reduce multiple elements per thread. The number is determined by the
* number of active thread blocks (via gridDim). More blocks will result in
* a larger `gridSize', and hence fewer elements per thread
*
* The loop stride of `gridSize' is used to maintain coalescing.
*
* Note that we can't simply kill threads that won't participate in the
* reduction, as exclusive reductions of empty arrays then won't be
* initialised with their seed element.
*/
if ( ix < shapeSize )
{
/*
* Initialise the local sum, then ...
*/
$items:(y .=. get ix)
/*
* ... continue striding the array, reading new values into 'x' and
* combining into the local accumulator 'y'. The non-idiomatic
* structure of the loop below reflects that 'y' has already been
* initialised above.
*/
for ( ix += gridSize; ix < shapeSize; ix += gridSize )
{
$items:(x .=. get ix)
$items:(z .=. combine y x)
$items:(y .=. z)
}
}
/*
* Each thread puts its local sum into shared memory, then threads
* cooperatively reduce the shared array to a single value.
*/
$items:(sdata "threadIdx.x" .=. y)
ix = min(shapeSize - blockIdx.x * blockDim.x, blockDim.x);
$items:(reduceBlock dev fun x y z sdata (cvar "ix"))
/*
* Write the results of this block back to global memory. If we are the last
* phase of a recursive multiblock reduction, include the seed element.
*/
if ( threadIdx.x == 0 )
{
$items:(maybe inclusive_finish exclusive_finish mseed)
}
}
|]
where
foldAll = maybe "fold1All" (const "foldAll") mseed
(texIn, argIn) = environment dev aenv
(argOut, _, setOut) = writeArray "Out" (undefined :: Array (sh :. Int) e)
(argRec, _)
| recursive = readArray "Rec" (undefined :: Array (sh :. Int) e)
| otherwise = ([], undefined)
(_, x, declx) = locals "x" (undefined :: e)
(_, y, decly) = locals "y" (undefined :: e)
(_, z, declz) = locals "z" (undefined :: e)
(sh, _, _) = locals "sh" (undefined :: sh :. Int)
ix = [cvar "ix"]
(smem, sdata) = shared (undefined :: e) "sdata" [cexp| blockDim.x |] Nothing
inclusive_finish = setOut "blockIdx.x" .=. y
exclusive_finish (CUExp seed) = [[citem|
if ( shapeSize > 0 ) {
if ( gridDim.x == 1 ) {
$items:(x .=. seed)
$items:(z .=. combine y x)
$items:(y .=. z)
}
$items:(setOut "blockIdx.x" .=. y)
}
else {
$items:(setOut "blockIdx.x" .=. seed)
}
|]]
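{- The exclusive finish covers two boundary cases the main loop cannot: on
   the final round the seed is folded into the block's result, and for an
   empty input the seed alone is written out. Semantically this is what makes
   the following hold (frontend notation, illustrative only):

   > fold (+) 0 (use (fromList (Z :. 0) [] :: Vector Int))
   >   -- evaluates to a Scalar holding 0: the seed survives even though
   >   -- no input element was ever read
-}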
mkFoldDim
:: forall aenv sh e. (Shape sh, Elt e)
=> DeviceProperties
-> Gamma aenv
-> CUFun2 aenv (e -> e -> e)
-> Maybe (CUExp aenv e)
-> CUDelayedAcc aenv (sh :. Int) e
-> [ CUTranslSkel aenv (Array sh e) ]
mkFoldDim dev aenv fun@(CUFun2 _ _ combine) mseed (CUDelayed (CUExp shIn) _ (CUFun1 _ get))
= return
$ CUTranslSkel fold [cunit|
$esc:("#include <accelerate_cuda.h>")
$edecls:texIn
extern "C" __global__ void
$id:fold
(
$params:argIn,
$params:argOut
)
{
$decls:smem
$decls:declx
$decls:decly
$decls:declz
$items:(sh .=. shIn)
const int numIntervals = $exp:(csize (cindexTail sh));
const int intervalSize = $exp:(cindexHead sh);
int ix;
int seg;
/*
* If the intervals of an exclusive fold are empty, use all threads to
* map the seed value to the output array and exit.
*/
$items:(maybe [] mapseed mseed)
/*
* Threads in a block cooperatively reduce all elements in an interval.
*/
for ( seg = blockIdx.x
; seg < numIntervals
; seg += gridDim.x )
{
const int start = seg * intervalSize;
const int end = start + intervalSize;
const int n = min(end - start, blockDim.x);
/*
* Kill threads that will not participate to avoid invalid reads.
* Take advantage of the fact that the array is rectangular.
*/
if ( threadIdx.x >= n )
return;
/*
* Ensure aligned access to global memory, and that each thread
* initialises its local sum
*/
ix = start - (start & (warpSize - 1));
if ( ix == start || intervalSize > blockDim.x )
{
ix += threadIdx.x;
if ( ix >= start )
{
$items:(y .=. get ix)
}
if ( ix + blockDim.x < end )
{
$items:(x .=. get [cvar "ix + blockDim.x"])
if ( ix >= start ) {
$items:(z .=. combine y x)
$items:(y .=. z)
}
else {
$items:(y .=. x)
}
}
/*
* Now, iterate collecting a local sum
*/
for ( ix += 2 * blockDim.x; ix < end; ix += blockDim.x )
{
$items:(x .=. get ix)
$items:(z .=. combine y x)
$items:(y .=. z)
}
}
else
{
$items:(y .=. get [cvar "start + threadIdx.x"])
}
/*
* Each thread puts its local sum into shared memory, and
* cooperatively reduces this to a single value.
*/
$items:(sdata "threadIdx.x" .=. y)
$items:(reduceBlock dev fun x y z sdata (cvar "n"))
/*
* Finally, the first thread writes the result for this segment. For
* exclusive reductions, we also combine with the seed element here.
*/
if ( threadIdx.x == 0 ) {
$items:(maybe [] exclusive_finish mseed)
$items:(setOut "seg" .=. y)
}
}
}
|]
where
fold = maybe "fold1" (const "fold") mseed
(texIn, argIn) = environment dev aenv
(argOut, shOut, setOut) = writeArray "Out" (undefined :: Array sh e)
(_, x, declx) = locals "x" (undefined :: e)
(_, y, decly) = locals "y" (undefined :: e)
(_, z, declz) = locals "z" (undefined :: e)
(sh, _, _) = locals "sh" (undefined :: sh :. Int)
ix = [cvar "ix"]
(smem, sdata) = shared (undefined :: e) "sdata" [cexp| blockDim.x |] Nothing
mapseed (CUExp seed)
= [citem| if ( intervalSize == 0 || numIntervals == 0 ) {
const int gridSize = $exp:(gridSize dev);
for ( ix = $exp:(threadIdx dev)
; ix < $exp:(csize shOut)
; ix += gridSize )
{
$items:(setOut "ix" .=. seed)
}
return;
} |] :[]
exclusive_finish (CUExp seed)
= concat [ x .=. seed
, z .=. combine y x
, y .=. z ]
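{- Worked example of the interval decomposition: folding an array of extent
   Z :. 4 :. 6 gives a result of extent Z :. 4. Here numIntervals = 4 (the
   size of the tail of the shape), intervalSize = 6 (the head), and thread
   block b reduces elements [b*6 .. b*6+5], striding over further intervals
   by gridDim.x:

   > input  :: Array DIM2 Float   -- extent Z :. 4 :. 6: four rows of six
   > output :: Array DIM1 Float   -- extent Z :. 4: one element per row
-}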
mkFoldSeg
:: (Shape sh, Elt e, Elt i, IsIntegral i)
=> DeviceProperties
-> Gamma aenv
-> CUFun2 aenv (e -> e -> e)
-> CUExp aenv e
-> CUDelayedAcc aenv (sh :. Int) e
-> CUDelayedAcc aenv (Z :. Int) i
-> [CUTranslSkel aenv (Array (sh :. Int) e)]
mkFoldSeg dev aenv f z a s = [ mkFoldSeg' dev aenv f (Just z) a s ]
mkFold1Seg
:: (Shape sh, Elt e, Elt i, IsIntegral i)
=> DeviceProperties
-> Gamma aenv
-> CUFun2 aenv (e -> e -> e)
-> CUDelayedAcc aenv (sh :. Int) e
-> CUDelayedAcc aenv (Z :. Int) i
-> [CUTranslSkel aenv (Array (sh :. Int) e)]
mkFold1Seg dev aenv f a s = [ mkFoldSeg' dev aenv f Nothing a s ]
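{- Semantics of the segmented variants: the segment descriptor gives the
   length of each segment of the innermost dimension, for example
   (illustrative frontend code):

   > xs   = fromList (Z :. 6) [10,20,30,40,50,60] :: Vector Int
   > segs = fromList (Z :. 3) [2,3,1]             :: Segments Int32
   > foldSeg (+) 0 (use xs) (use segs)
   >   -- => [10+20, 30+40+50, 60] = [30, 120, 60]

   By the time 'mkFoldSeg'' runs, the segment lengths have been converted
   into start offsets (a prefix sum), which is what its 'offset' accessor
   reads: lanes 0 and 1 fetch offset[s] and offset[s+1] as the [start,end)
   bounds of segment s.
-}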
mkFoldSeg'
:: forall aenv sh e i. (Shape sh, Elt e, Elt i, IsIntegral i)
=> DeviceProperties
-> Gamma aenv
-> CUFun2 aenv (e -> e -> e)
-> Maybe (CUExp aenv e)
-> CUDelayedAcc aenv (sh :. Int) e
-> CUDelayedAcc aenv (Z :. Int) i
-> CUTranslSkel aenv (Array (sh :. Int) e)
mkFoldSeg' dev aenv fun@(CUFun2 _ _ combine) mseed
(CUDelayed (CUExp shIn) _ (CUFun1 _ get))
(CUDelayed _ _ (CUFun1 _ offset))
= CUTranslSkel foldSeg [cunit|
$esc:("#include <accelerate_cuda.h>")
$edecls:texIn
extern "C"
__global__ void
$id:foldSeg
(
$params:argIn,
$params:argOut
)
{
const int vectors_per_block = blockDim.x / warpSize;
const int num_vectors = $exp:(umul24 dev vectors_per_block gridDim);
const int thread_id = $exp:(threadIdx dev);
const int vector_id = thread_id / warpSize;
const int thread_lane = threadIdx.x & (warpSize - 1);
const int vector_lane = threadIdx.x / warpSize;
const int num_segments = $exp:(cindexHead shOut);
const int total_segments = $exp:(csize shOut);
int seg;
int ix;
extern volatile __shared__ int s_ptrs[][2];
$decls:smem
$decls:declx
$decls:decly
$decls:declz
$items:(sh .=. shIn)
/*
* Threads in a warp cooperatively reduce a segment
*/
for ( seg = vector_id
; seg < total_segments
; seg += num_vectors )
{
const int s = seg % num_segments;
const int base = (seg / num_segments) * $exp:(cindexHead sh);
/*
* Use two threads to fetch the indices of the start and end of this
* segment. This results in a single coalesced global read.
*/
if ( thread_lane < 2 ) {
$items:([cvar "s_ptrs[vector_lane][thread_lane]"] .=. offset [cvar "s + thread_lane"])
}
const int start = base + s_ptrs[vector_lane][0];
const int end = base + s_ptrs[vector_lane][1];
const int num_elements = end - start;
/*
* Each thread reads in values of this segment, accumulating a local sum
*/
if ( num_elements > warpSize )
{
/*
* Ensure aligned access to global memory
*/
ix = start - (start & (warpSize - 1)) + thread_lane;
if ( ix >= start )
{
$items:(y .=. get ix)
}
/*
* Subsequent reads to global memory are aligned, but make sure all
* threads have initialised their local sum.
*/
if ( ix + warpSize < end )
{
$items:(x .=. get [cvar "ix + warpSize"])
if ( ix >= start ) {
$items:(z .=. combine y x)
$items:(y .=. z)
}
else {
$items:(y .=. x)
}
}
/*
* Now, iterate along the innermost dimension collecting a local sum
*/
for ( ix += 2 * warpSize; ix < end; ix += warpSize )
{
$items:(x .=. get ix)
$items:(z .=. combine y x)
$items:(y .=. z)
}
}
else if ( start + thread_lane < end )
{
$items:(y .=. get [cvar "start + thread_lane"])
}
/*
* Store local sums into shared memory and reduce to a single value
*/
ix = min(num_elements, warpSize);
$items:(sdata "threadIdx.x" .=. y)
$items:(reduceWarp dev fun x y z sdata (cvar "ix") (cvar "thread_lane"))
/*
* Finally, the first thread writes the result for this segment
*/
if ( thread_lane == 0 )
{
$items:(maybe [] exclusive_finish mseed)
$items:(setOut "seg" .=. y)
}
}
}
|]
where
foldSeg = maybe "fold1Seg" (const "foldSeg") mseed
(texIn, argIn) = environment dev aenv
(argOut, shOut, setOut) = writeArray "Out" (undefined :: Array (sh :. Int) e)
(_, x, declx) = locals "x" (undefined :: e)
(_, y, decly) = locals "y" (undefined :: e)
(_, z, declz) = locals "z" (undefined :: e)
(sh, _, _) = locals "sh" (undefined :: sh :. Int)
(smem, sdata) = shared (undefined :: e) "sdata" [cexp| blockDim.x |] (Just $ [cexp| &s_ptrs[vectors_per_block][2] |])
ix = [cvar "ix"]
vectors_per_block = cvar "vectors_per_block"
gridDim = cvar "gridDim.x"
exclusive_finish (CUExp seed)
= [[citem| if ( num_elements > 0 ) {
$items:(x .=. seed)
$items:(z .=. combine y x)
$items:(y .=. z)
} else {
$items:(y .=. seed)
} |]]
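{- Work distribution in 'mkFoldSeg'': one warp (a "vector") per segment. With
   blockDim.x = 128 and warpSize = 32, say, each block carries
   vectors_per_block = 4 warps, and warp w of block b handles segments
   b*4 + w, b*4 + w + num_vectors, and so on. Because the output has extent
   sh :. num_segments, the flat segment counter is split as

   > s    = seg `mod` num_segments                 -- segment within this row
   > base = (seg `div` num_segments) * innerExtent -- innerExtent is
   >                                               -- $exp:(cindexHead sh)

   mirroring the '%' and '/' in the generated code above. The s_ptrs array
   holds each warp's [start,end) offsets; 'sdata' is laid out after it in the
   same dynamic shared memory allocation (hence the base-pointer argument
   passed to 'shared').
-}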
reduceWarp
:: forall aenv e. Elt e
=> DeviceProperties
-> CUFun2 aenv (e -> e -> e)
-> [C.Exp] -> [C.Exp] -> [C.Exp]
-> (Name -> [C.Exp])
-> C.Exp
-> C.Exp
-> [C.BlockItem]
reduceWarp dev fun x0 x1 x2 sdata n tid
| shflOK dev (undefined :: e) = return
$ reduceWarpShfl dev fun x0 x1 n tid
| otherwise = reduceWarpTree dev fun x0 x1 x2 sdata n tid
reduceBlock
:: forall aenv e. Elt e
=> DeviceProperties
-> CUFun2 aenv (e -> e -> e)
-> [C.Exp] -> [C.Exp] -> [C.Exp]
-> (Name -> [C.Exp])
-> C.Exp
-> [C.BlockItem]
reduceBlock dev fun x0 x1 x2 sdata n
| shflOK dev (undefined :: e) = reduceBlockShfl dev fun x0 x1 sdata n
| otherwise = reduceBlockTree dev fun x0 x1 x2 sdata n
reduceWarpTree
:: Elt e
=> DeviceProperties
-> CUFun2 aenv (e -> e -> e)
-> [C.Exp] -> [C.Exp] -> [C.Exp]
-> (Name -> [C.Exp])
-> C.Exp
-> C.Exp
-> [C.BlockItem]
reduceWarpTree dev (CUFun2 _ _ f) x0 x1 x2 sdata n tid
= map (reduce . pow2) [v, v-1 .. 0]
where
v = floor (logBase 2 (fromIntegral $ warpSize dev :: Double))
pow2 :: Int -> Int
pow2 x = 2 ^ x
reduce :: Int -> C.BlockItem
reduce 0
= [citem| if ( $exp:tid + 1 < $exp:n ) {
$items:(x0 .=. sdata "threadIdx.x + 1")
$items:(x2 .=. f x1 x0)
$items:(x1 .=. x2)
} |]
reduce i
= [citem| if ( $exp:tid + $int:i < $exp:n ) {
$items:(x0 .=. sdata ("threadIdx.x + " ++ show i))
$items:(x2 .=. f x1 x0)
$items:(x1 .=. x2)
$items:(sdata "threadIdx.x" .=. x1)
} |]
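{- With warpSize = 32 we have v = 5, so 'reduceWarpTree' emits strides

   > map (2^) [5,4..0] == [32,16,8,4,2,1]

   Each step has a thread combine its running value with the shared-memory
   slot at threadIdx.x + stride and write the result back; the final step
   (the 'reduce 0' case, stride 1) omits the write-back since no thread reads
   that slot afterwards. Strides start at 32 rather than 16 because, when
   called from 'reduceBlockTree', up to 64 valid partial sums remain in
   shared memory when the single-warp phase takes over.
-}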
reduceBlockTree
:: Elt e
=> DeviceProperties
-> CUFun2 aenv (e -> e -> e)
-> [C.Exp] -> [C.Exp] -> [C.Exp]
-> (Name -> [C.Exp])
-> C.Exp
-> [C.BlockItem]
reduceBlockTree dev fun@(CUFun2 _ _ f) x0 x1 x2 sdata n
= flip (foldr1 (.)) []
$ map (reduce . pow2) [u-1, u-2 .. v]
where
u = floor (logBase 2 (fromIntegral $ maxThreadsPerBlock dev :: Double))
v = floor (logBase 2 (fromIntegral $ warpSize dev :: Double))
pow2 :: Int -> Int
pow2 x = 2 ^ x
reduce :: Int -> [C.BlockItem] -> [C.BlockItem]
reduce i rest
| i > warpSize dev
= [citem| __syncthreads(); |]
: [citem| if ( threadIdx.x + $int:i < $exp:n ) {
$items:(x0 .=. sdata ("threadIdx.x + " ++ show i))
$items:(x2 .=. f x1 x0)
$items:(x1 .=. x2)
} |]
: [citem| __syncthreads(); |]
: (sdata "threadIdx.x" .=. x1)
++ rest
| otherwise
= [citem| __syncthreads(); |]
: [citem| if ( threadIdx.x < $int:(warpSize dev) ) {
$items:(reduceWarpTree dev fun x0 x1 x2 sdata n (cvar "threadIdx.x"))
} |]
: rest
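{- For a device with maxThreadsPerBlock = 1024 and warpSize = 32 we get
   u = 10, v = 5, and block-level strides

   > map (2^) [9,8..5] == [512,256,128,64,32]

   Strides greater than the warp size are fenced with __syncthreads(); at
   stride 32 the whole tail of the reduction is delegated to
   'reduceWarpTree', whose single warp runs in lock-step and therefore emits
   no further barriers.
-}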
shflOK :: Elt e => DeviceProperties -> e -> Bool
shflOK _dev _ = False
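{- The shuffle path is disabled unconditionally for now. If it were enabled,
   a plausible guard (an assumption, not something this module currently
   ships) would require the __shfl family of intrinsics (compute capability
   3.0, i.e. Kepler) and element types built from 32- or 64-bit scalars,
   matching the cases 'shfl_xor' knows how to generate:

   > shflOK dev _ = computeCapability dev >= Compute 3 0
   >             && all (`elem` [4,8]) (eltSizeOf (undefined :: e))
-}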
reduceWarpShfl
:: forall aenv e. Elt e
=> DeviceProperties
-> CUFun2 aenv (e -> e -> e)
-> [C.Exp] -> [C.Exp]
-> C.Exp
-> C.Exp
-> C.BlockItem
reduceWarpShfl _dev (CUFun2 _ _ f) x0 x1 n tid
= [citem| for ( int z = warpSize/2; z >= 1; z /= 2 ) {
$items:(x0 .=. shfl_xor x1)
if ( $exp:tid + z < $exp:n ) {
$items:(x1 .=. f x1 x0)
}
} |]
where
sizeof = eltSizeOf (undefined :: e)
shfl_xor = zipWith (\s x -> ccall (shfl s) [ x, cvar "z" ]) sizeof
where
shfl 4 = "shfl_xor32"
shfl 8 = "shfl_xor64"
shfl _ = $internalError "shfl_xor" "I only know about 32- and 64-bit types"
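{- Butterfly reduction: in round z, every lane exchanges values with lane
   (laneId `xor` z) via __shfl_xor, so after log2 warpSize rounds each
   participating lane holds the combined result without touching shared
   memory. For instance, in the first round (z = 16):

   > import Data.Bits (xor)
   > [ (l, l `xor` 16) | l <- [0..3] ] :: [(Int,Int)]
   >   -- => [(0,16),(1,17),(2,18),(3,19)]

   The guard (tid + z < n) stops lanes whose partner lies beyond the valid
   prefix of the data from folding garbage into their accumulator.
-}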
reduceBlockShfl
:: forall aenv e. Elt e
=> DeviceProperties
-> CUFun2 aenv (e -> e -> e)
-> [C.Exp] -> [C.Exp]
-> (Name -> [C.Exp])
-> C.Exp
-> [C.BlockItem]
reduceBlockShfl dev fun x0 x1 sdata n
= reduceWarpShfl dev fun x0 x1 n (cvar "threadIdx.x")
: [citem| if ( (threadIdx.x & (warpSize - 1)) == 0 ) {
$items:(sdata "threadIdx.x / warpSize" .=. x1)
} |]
: [citem| __syncthreads(); |]
: [citem| if ( threadIdx.x < warpSize ) {
$items:(x1 .=. sdata "threadIdx.x")
$exp:n = ($exp:n + warpSize - 1) / warpSize;
$item:(reduceWarpShfl dev fun x0 x1 n (cvar "threadIdx.x"))
} |]
: []
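{- 'reduceBlockShfl' composes the warp primitive twice: every warp first
   reduces its own values in registers, lane 0 of each warp deposits the warp
   total in sdata[warpId], and after a barrier the first warp reduces those
   per-warp totals (at most blockDim.x / warpSize <= 32 of them, so one warp
   suffices). The element count is rounded up accordingly: e.g. n = 70 valid
   partial sums leave (70 + 31) / 32 = 3 warp totals for the second pass.
-}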