/*
 *  Copyright 2008-2010 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */


/*
 * Kernel entry point that hands the whole job to scan_intervals().
 *
 * The only work done here is to declare the block's dynamically-sized shared
 * buffer and pass it, followed by every kernel argument in its original order,
 * to scan_intervals(). Because the buffer is declared `extern` with no extent,
 * its size is not fixed in this file and has to come from the kernel launch.
 */
extern "C" __global__ void inclusive_scan(
    ArrOut       d_out,
    const ArrIn0 d_in0,
    ArrIn0       d_block_results,
    const Ix     N,
    const Ix     interval_size)
{
    // Dynamically-sized shared scratch space, forwarded as the first argument.
    extern __shared__ TyOut sdata[];

    scan_intervals(sdata, d_out, d_in0, d_block_results, N, interval_size);
}


/*
 * Rewrite d_out in place so that every element holds the carry-adjusted value
 * of its predecessor within the interval (its successor when REVERSE is set).
 *
 * Layout: block b (x dimension only) owns elements
 * [interval_size * b, min(interval_size * (b + 1), N)) and walks them in tiles
 * of blockDim.x. The dynamic shared buffer is indexed up to blockDim.x - 1, so
 * it must hold at least blockDim.x elements of TyOut.
 *
 * The carry for a block is identity() combined with the d_in0 entry of the
 * neighbouring block (b - 1, or b + 1 under REVERSE); edge blocks with no such
 * neighbour use identity() alone.
 *
 * Haskell-style scans make the result one element longer, which shifts where
 * results land: under REVERSE each value is stored one slot to the right, and
 * the overall result read from d_sum[0] is stored into d_out (index 0 under
 * REVERSE, index N otherwise) instead of being written back to d_sum[0].
 */
extern "C" __global__ void exclusive_update(
    ArrOut   d_out,
    ArrIn0   d_in0,
    ArrIn0   d_sum,
    const Ix N,
    const Ix interval_size)
{
    extern __shared__ TyOut sdata[];

    // Half-open element range [lo, hi) owned by this block, clamped to N.
    const Ix lo = interval_size * blockIdx.x;
    const Ix hi = min(lo + interval_size, N);

    // Extra slots to skip when storing; non-zero only for reversed
    // Haskell-style scans.
#if REVERSE && HASKELL_STYLE
    const Ix dst_shift = 1;
#else
    const Ix dst_shift = 0;
#endif

    // Offset combined into every element of this interval.
    TyOut carry = identity();

#if REVERSE
    if (blockIdx.x + 1 != gridDim.x)
    {
        TyOut neighbour = get0(d_in0, blockIdx.x + 1);
        carry           = apply(carry, neighbour);
    }
#else
    if (blockIdx.x > 0)
    {
        TyOut neighbour = get0(d_in0, blockIdx.x - 1);
        carry           = apply(carry, neighbour);
    }
#endif

    const bool lane0 = (threadIdx.x == 0);

    // Element this thread handles in the current tile. Reversed scans start at
    // the top of the interval and walk downwards.
    Ix pos = REVERSE ? hi - threadIdx.x - 1 : lo + threadIdx.x;

    // Value this thread will store next. Thread 0 starts from the carry and
    // afterwards inherits the last staged value of the previous tile.
    TyOut shifted = carry;

    for (Ix tile = lo; tile < hi; tile += blockDim.x)
    {
        const Ix   i        = tile + threadIdx.x;
        const bool in_range = (i < hi);

        // Stage the carry-adjusted value of this thread's element.
        if (in_range)
        {
            TyOut current      = get0(d_out, pos);
            sdata[threadIdx.x] = apply(carry, current);
        }
        __syncthreads();

        // Every thread but the first picks up the value staged by the thread
        // just before it; that is what shifts the results by one position.
        if (!lane0)
            shifted = sdata[threadIdx.x - 1];

        if (in_range)
        {
            set(d_out, pos + dst_shift, shifted);
        }

        // Thread 0 must store its current value (above) before replacing it
        // with the seed for the next tile. On a partial final tile this slot
        // was not written in this iteration, but the loop then ends and the
        // value is never used.
        if (lane0)
            shifted = sdata[blockDim.x - 1];

        // Keep the next iteration from overwriting sdata while it is still
        // being read.
        __syncthreads();

        // Step to this thread's element in the next tile.
#if REVERSE
        pos -= blockDim.x;
#else
        pos += blockDim.x;
#endif
    }

    // Exactly one thread of the whole grid stores the overall scan result.
    if (blockIdx.x != 0 || threadIdx.x != 0)
        return;

    TyOut total = apply(identity(), get0(d_sum, 0));

#if HASKELL_STYLE
    set(d_out, REVERSE ? 0 : N, total);
#else
    set(d_sum, 0, total);
#endif
}