/* -----------------------------------------------------------------------------
 *
 * Kernel      : Fold
 * Copyright   : [2008..2010] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell
 * License     : BSD3
 *
 * Maintainer  : Trevor L. McDonell
 * Stability   : experimental
 *
 * Reduce an array to a single value with a binary associative function
 *
 * ---------------------------------------------------------------------------*/

#ifndef LENGTH_IS_POW_2
#define LENGTH_IS_POW_2 0
#endif

/*
 * Compute multiple elements per thread sequentially. This reduces the overall
 * cost of the algorithm while keeping the work complexity O(n) and the step
 * complexity O(log n); cf. Brent's Theorem optimisation.
 */
extern "C"
__global__ void
fold
(
    ArrOut              d_out,
    const ArrIn0        d_in0,
    const Ix            shape
)
{
    /*
     * The `volatile' qualifier ensures that the unsynchronised warp-level
     * reads and writes at the end of the reduction are not cached in
     * registers by the compiler.
     */
    extern volatile __shared__ void* s_ptr[];
    ArrOut s_data = partition(s_ptr, blockDim.x);

    /*
     * Calculate the first level of the reduction, accumulating into registers
     * before writing each thread's local result into shared memory.
     */
    Ix       i;
    TyOut    sum      = identity();
    const Ix tid      = threadIdx.x;
    const Ix gridSize = blockDim.x * 2 * gridDim.x;

    /*
     * Reduce multiple elements per thread. The number is determined by the
     * number of active thread blocks (via gridDim). More blocks will result
     * in a larger `gridSize', and hence fewer elements per thread.
     *
     * The loop stride of `gridSize' is used to maintain coalescing.
     */
    for (i = blockIdx.x * blockDim.x * 2 + tid; i < shape; i += gridSize)
    {
        sum = apply(sum, get0(d_in0, i));

        /*
         * Ensure we don't read out of bounds. This check is optimised away
         * when the input length is a power of two.
         */
        if (LENGTH_IS_POW_2 || i + blockDim.x < shape)
            sum = apply(sum, get0(d_in0, i + blockDim.x));
    }

    /*
     * Each thread puts its local sum into shared memory, then the threads
     * cooperatively reduce the shared array to a single value.
     */
    set(s_data, tid, sum);
    __syncthreads();

    if (blockDim.x >= 512) { if (tid < 256) { sum = apply(sum, get0(s_data, tid+256)); set(s_data, tid, sum); } __syncthreads(); }
    if (blockDim.x >= 256) { if (tid < 128) { sum = apply(sum, get0(s_data, tid+128)); set(s_data, tid, sum); } __syncthreads(); }
    if (blockDim.x >= 128) { if (tid <  64) { sum = apply(sum, get0(s_data, tid+ 64)); set(s_data, tid, sum); } __syncthreads(); }

    if (tid < 32)
    {
        /*
         * Use an extra warp's worth of elements of shared memory, to let
         * threads index beyond the input data without using any branch
         * instructions. The threads of a warp execute in lock-step, so no
         * further __syncthreads() barriers are required here.
         */
        sum = apply(sum, get0(s_data, tid+32)); set(s_data, tid, sum);
        sum = apply(sum, get0(s_data, tid+16)); set(s_data, tid, sum);
        sum = apply(sum, get0(s_data, tid+ 8)); set(s_data, tid, sum);
        sum = apply(sum, get0(s_data, tid+ 4)); set(s_data, tid, sum);
        sum = apply(sum, get0(s_data, tid+ 2)); set(s_data, tid, sum);
        sum = apply(sum, get0(s_data, tid+ 1));
    }

    /*
     * Write the result of this block back to global memory
     */
    if (tid == 0)
        set(d_out, blockIdx.x, sum);
}
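
/* -----------------------------------------------------------------------------
 * A single launch of the kernel above leaves one partial result per thread
 * block in d_out, so the host must apply it repeatedly until only one value
 * remains. The sketch below illustrates that driver loop; it is NOT part of
 * this kernel. For concreteness it assumes a generated instance that reduces
 * `float' with (+) -- i.e. ArrOut/ArrIn0 are plain `float*' and Ix is `int'.
 * The name `reduceToScalar' and the fixed sizing policy are inventions of the
 * sketch; the real backend derives launch parameters from the device and
 * selects the LENGTH_IS_POW_2 specialisation itself.
 * ---------------------------------------------------------------------------*/
#if 0
static void
reduceToScalar(float* d_out, const float* d_in, int shape)
{
    const int threads = 128;    /* assumed block size (>= 64, so the shared
                                   memory reads at tid+32 stay in bounds)   */
    while (shape > 1)
    {
        /* Each block consumes at least (2 * threads) elements per pass */
        int blocks = (shape + threads * 2 - 1) / (threads * 2);
        if (blocks > 64) blocks = 64;

        /* One element of shared memory per thread */
        size_t smem = threads * sizeof(float);

        fold<<<blocks, threads, smem>>>(d_out, d_in, shape);

        /* The per-block partial sums become the input to the next pass.
         * With the cap of 64 blocks, every pass after the first runs as a
         * single block, so reading and writing d_out in place is safe.   */
        d_in  = d_out;
        shape = blocks;
    }
    /* The final value is now in d_out[0] */
}
#endif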