/* -----------------------------------------------------------------------------
 *
 * Kernel      : FoldSeg
 * Copyright   : [2008..2011] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell
 * License     : BSD3
 *
 * Maintainer  : Trevor L. McDonell
 * Stability   : experimental
 *
 * Reduce segments along the innermost dimension of a multidimensional array to
 * a single value for each segment, using a binary associative function
 *
 * ---------------------------------------------------------------------------*/

#include "reduce.inl"

/*
 * Each segment of the vector is assigned to a single warp, which computes the
 * reduction of that segment in parallel.
 *
 * This division of work implies that the data arrays are accessed in a
 * contiguous manner (if not necessarily aligned). For devices of compute
 * capability 1.2 and later, these accesses will be coalesced: a single
 * transaction is issued if all of the addresses for a half-warp fall within a
 * single 128-byte segment, with extra transactions issued to cover any spill.
 * The same applies for 2.x devices, except that all widths are doubled since
 * transactions occur on a per-warp basis.
 *
 * Since an entire 32-thread warp is assigned to each segment, many threads
 * will remain idle when the segments are very small. This code relies on
 * implicit synchronisation among threads in a warp.
 *
 * The offset array contains the starting index for each segment in the input
 * array. The i-th warp reduces values in the input array at indices
 * [d_offset[i], d_offset[i+1]).
 */
extern "C"
__global__ void
foldSeg
(
    ArrOut              d_out,
    const ArrIn0        d_in0,
    const Int*          d_offset,
    const DimOut        shOut,
    const DimIn0        shIn0
)
{
    const Ix vectors_per_block  = blockDim.x / warpSize;
    const Ix num_vectors        = vectors_per_block * gridDim.x;
    const Ix thread_id          = blockDim.x * blockIdx.x + threadIdx.x;
    const Ix vector_id          = thread_id / warpSize;
    const Ix thread_lane        = threadIdx.x & (warpSize - 1);
    const Ix vector_lane        = threadIdx.x / warpSize;

    const Ix num_segments       = indexHead(shOut);
    const Ix total_segments     = size(shOut);

    /*
     * Manually partition (dynamically-allocated) shared memory
     */
    extern volatile __shared__ Ix s_ptrs[][2];
    ArrOut s_data = partition((void*) &s_ptrs[vectors_per_block][2], blockDim.x);

    for (Ix seg = vector_id; seg < total_segments; seg += num_vectors)
    {
        const Ix s    = seg % num_segments;
        const Ix base = (seg / num_segments) * indexHead(shIn0);

        /*
         * Use two threads to fetch the indices of the start and end of this
         * segment. This results in a single coalesced global read, instead of
         * two separate transactions.
         */
        if (thread_lane < 2)
            s_ptrs[vector_lane][thread_lane] = d_offset[s + thread_lane];

        const Ix start        = base + s_ptrs[vector_lane][0];
        const Ix end          = base + s_ptrs[vector_lane][1];
        const Ix num_elements = end - start;
        TyOut    sum;

        /*
         * Each thread reads in values of this segment, accumulating a local sum
         */
        if (num_elements > warpSize)
        {
            /*
             * Ensure aligned access to global memory
             */
            Ix i = start - (start & (warpSize - 1)) + thread_lane;
            if (i >= start)
                sum = get0(d_in0, i);

            /*
             * Subsequent reads to global memory are aligned, but make sure all
             * threads have initialised their local sum.
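             *
             * For example, with warpSize == 32 and a segment beginning at
             * start == 70, the aligned base index is 64: lanes 6..31 load
             * indices 70..95 on the first read, while lanes 0..5 skip it and
             * pick up their first value at indices 96..101 on the second read
             * below. (The concrete numbers here are illustrative only.)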
             */
            if (i + warpSize < end)
            {
                TyOut tmp = get0(d_in0, i + warpSize);

                if (i >= start) sum = apply(sum, tmp);
                else            sum = tmp;
            }

            for (i += 2 * warpSize; i < end; i += warpSize)
                sum = apply(sum, get0(d_in0, i));
        }
        else if (start + thread_lane < end)
        {
            sum = get0(d_in0, start + thread_lane);
        }

        /*
         * Store local sums into shared memory and reduce to a single value
         */
        set(s_data, threadIdx.x, sum);
        sum = reduce_warp_n(s_data, sum, min(num_elements, warpSize));

        /*
         * Finally, the first thread writes the result for this segment
         */
        if (thread_lane == 0)
        {
#ifndef INCLUSIVE
            sum = num_elements > 0 ? apply(sum, identity()) : identity();
#endif
            set(d_out, seg, sum);
        }
    }
}
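
/*
 * A minimal launch sketch, for reference only. The concrete instantiations of
 * ArrOut, ArrIn0, TyOut and Ix, as well as the launch configuration, are
 * chosen by the Accelerate code generator at runtime, so the figures below are
 * assumptions rather than the values the library actually uses.
 *
 *   const int threads = 128;                    // multiple of warpSize
 *   const int warps   = threads / 32;           // segments handled per block
 *   const int blocks  = (total_segments + warps - 1) / warps;
 *
 *   // shared memory: two Ix per warp for s_ptrs, one TyOut per thread for
 *   // s_data, plus whatever alignment slack partition() requires
 *   const size_t smem = (2 * warps + 2) * sizeof(Ix) + threads * sizeof(TyOut);
 *
 *   foldSeg<<<blocks, threads, smem>>>(d_out, d_in0, d_offset, shOut, shIn0);
 */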