/* -----------------------------------------------------------------------------
 *
 * Module    : Fold
 * Copyright : (c) [2009..2010] Trevor L. McDonell
 * License   : BSD
 *
 * Reduce an array to a single value with a binary associative function
 *
 * ---------------------------------------------------------------------------*/

/*
 * We require block sizes are a power of two. This value gives maximum occupancy
 * for both 1.x and 2.x class devices.
 */
#ifndef BLOCK_SIZE
#define BLOCK_SIZE              256
#endif

#ifndef LENGTH_IS_POW_2
#define LENGTH_IS_POW_2         0
#endif


/*
 * Compute multiple elements per thread sequentially. This reduces the overall
 * cost of the algorithm while keeping the work complexity O(n) and the step
 * complexity O(log n). c.f. Brent's Theorem optimisation.
 */
extern "C"
__global__ void
fold
(
    ArrOut              d_out,
    const ArrIn0        d_in0,
    const Ix            shape
)
{
    extern volatile __shared__ TyOut s_data[];

    /*
     * Calculate first level of reduction reading into shared memory
     */
    Ix       i;
    TyOut    sum      = identity();
    const Ix tid      = threadIdx.x;
    const Ix gridSize = BLOCK_SIZE * 2 * gridDim.x;

    /*
     * Reduce multiple elements per thread. The number is determined by the
     * number of active thread blocks (via gridDim). More blocks will result in
     * a larger `gridSize', and hence fewer elements per thread
     *
     * The loop stride of `gridSize' is used to maintain coalescing.
     */
    for (i =  blockIdx.x * BLOCK_SIZE * 2 + tid; i <  shape; i += gridSize)
    {
        sum = apply(sum, get0(d_in0, i));

        /*
         * Ensure we don't read out of bounds. This is optimised away if the
         * input length is a power of two
         */
        if (LENGTH_IS_POW_2 || i + BLOCK_SIZE < shape)
            sum = apply(sum, get0(d_in0, i+BLOCK_SIZE));
    }

    /*
     * Each thread puts its local sum into shared memory, then threads
     * cooperatively reduce the shared array to a single value.
     */
    s_data[tid] = sum;
    __syncthreads();

    if (BLOCK_SIZE >= 512) { if (tid < 256) { s_data[tid] = sum = apply(sum, s_data[tid+256]); } __syncthreads(); }
    if (BLOCK_SIZE >= 256) { if (tid < 128) { s_data[tid] = sum = apply(sum, s_data[tid+128]); } __syncthreads(); }
    if (BLOCK_SIZE >= 128) { if (tid <  64) { s_data[tid] = sum = apply(sum, s_data[tid+ 64]); } __syncthreads(); }

    if (tid < 32)
    {
        /*
         * Use an extra warps worth of elements of shared memory, to let threads
         * index beyond the input data without using any branch instructions.
         */
        s_data[tid] = sum = apply(sum, s_data[tid + 32]);
        s_data[tid] = sum = apply(sum, s_data[tid + 16]);
        s_data[tid] = sum = apply(sum, s_data[tid +  8]);
        s_data[tid] = sum = apply(sum, s_data[tid +  4]);
        s_data[tid] = sum = apply(sum, s_data[tid +  2]);
        s_data[tid] = sum = apply(sum, s_data[tid +  1]);
    }

    /*
     * Write the results of this block back to global memory
     */
    if (tid == 0)
        set(d_out, blockIdx.x, sum);
}