{-# LANGUAGE GADTs               #-}
{-# LANGUAGE OverloadedStrings   #-}
{-# LANGUAGE RebindableSyntax    #-}
{-# LANGUAGE RecordWildCards     #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
{-# LANGUAGE TypeApplications    #-}
{-# LANGUAGE TypeOperators       #-}
{-# LANGUAGE ViewPatterns        #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.PTX.CodeGen.Fold
-- Copyright   : [2016..2020] The Accelerate Team
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.PTX.CodeGen.Fold
  where

import Data.Array.Accelerate.Representation.Array
import Data.Array.Accelerate.Representation.Elt
import Data.Array.Accelerate.Representation.Shape           hiding ( size )
import Data.Array.Accelerate.Representation.Type

import Data.Array.Accelerate.LLVM.CodeGen.Arithmetic        as A
import Data.Array.Accelerate.LLVM.CodeGen.Array
import Data.Array.Accelerate.LLVM.CodeGen.Base
import Data.Array.Accelerate.LLVM.CodeGen.Constant
import Data.Array.Accelerate.LLVM.CodeGen.Environment
import Data.Array.Accelerate.LLVM.CodeGen.Exp
import Data.Array.Accelerate.LLVM.CodeGen.IR
import Data.Array.Accelerate.LLVM.CodeGen.Loop              as Loop
import Data.Array.Accelerate.LLVM.CodeGen.Monad
import Data.Array.Accelerate.LLVM.CodeGen.Sugar
import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch
import Data.Array.Accelerate.LLVM.PTX.CodeGen.Base
import Data.Array.Accelerate.LLVM.PTX.CodeGen.Generate
import Data.Array.Accelerate.LLVM.PTX.Target

import LLVM.AST.Type.Representation

import qualified Foreign.CUDA.Analysis                      as CUDA

import Control.Monad                                        ( (>=>) )
import Control.Monad.State                                  ( gets )
import Data.String                                          ( fromString )
import Data.Bits                                            as P
import Prelude                                              as P


-- Reduce an array along the innermost dimension. The reduction function must be
-- associative to allow for an efficient parallel implementation, but the
-- initial element does /not/ need to be a neutral element of the operator.
--
-- TODO: Specialise for commutative operations (such as (+)) and those with
--       a neutral element {(+), 0}
--
mkFold
    :: forall aenv sh e.
       Gamma aenv
    -> ArrayR (Array sh e)
    -> IRFun2 PTX aenv (e -> e -> e)
    -> Maybe (IRExp PTX aenv e)
    -> MIRDelayed PTX aenv (Array (sh, Int) e)
    -> CodeGen PTX (IROpenAcc PTX aenv (Array sh e))
mkFold aenv repr f z acc = case z of
  Just z' -> (+++) <$> codeFold <*> mkFoldFill aenv repr z'
  Nothing -> codeFold
  where
    codeFold = case repr of
      ArrayR ShapeRz tp -> mkFoldAll aenv tp   f z acc
      _                 -> mkFoldDim aenv repr f z acc
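
-- As an illustration of the 'Maybe' seed parameter (assuming the standard
-- frontend mapping; 'sumRows' and 'maxRows' are not definitions from this
-- module): a seeded 'fold' arrives here with 'Just' its initial element,
-- whereas 'fold1' passes 'Nothing' and requires the reduced dimension to be
-- non-empty.
--
-- > sumRows = fold  (+) 0      -- compiled via mkFold ... (Just seed) ...
-- > maxRows = fold1 max        -- compiled via mkFold ... Nothing     ...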

-- Reduce an array to a single element.
--
-- Since reductions consume arrays that have been fused into them, parallel
-- reduction requires two separate kernels. As an example, take vector dot
-- product:
--
-- > dotp xs ys = fold (+) 0 (zipWith (*) xs ys)
--
-- 1. The first pass reads in the fused array data, in this case corresponding
--    to the function (\i -> (xs!i) * (ys!i)).
--
-- 2. The second pass reads in the manifest array data from the first step and
--    directly reduces the array. This can be done recursively in-place until
--    only a single element remains.
--
-- In both phases, thread blocks cooperatively reduce a stripe of the input (one
-- element per thread) to a single element, which is stored to the output array.
--
mkFoldAll
    :: forall aenv e.
       Gamma aenv                                      -- ^ array environment
    -> TypeR e
    -> IRFun2 PTX aenv (e -> e -> e)                   -- ^ combination function
    -> MIRExp PTX aenv e                               -- ^ (optional) initial element for exclusive reductions
    -> MIRDelayed PTX aenv (Vector e)                  -- ^ input data
    -> CodeGen PTX (IROpenAcc PTX aenv (Scalar e))
mkFoldAll aenv tp combine mseed macc = do
  dev <- liftCodeGen $ gets ptxDeviceProperties
  foldr1 (+++) <$> sequence [ mkFoldAllS  dev aenv tp combine mseed macc
                            , mkFoldAllM1 dev aenv tp combine       macc
                            , mkFoldAllM2 dev aenv tp combine mseed
                            ]


-- Reduction of an array to a single element, for small arrays which can be
-- processed by a single thread block.
--
mkFoldAllS
    :: forall aenv e.
       DeviceProperties                                -- ^ properties of the target GPU
    -> Gamma aenv                                      -- ^ array environment
    -> TypeR e
    -> IRFun2 PTX aenv (e -> e -> e)                   -- ^ combination function
    -> MIRExp PTX aenv e                               -- ^ (optional) initial element for exclusive reductions
    -> MIRDelayed PTX aenv (Vector e)                  -- ^ input data
    -> CodeGen PTX (IROpenAcc PTX aenv (Scalar e))
mkFoldAllS dev aenv tp combine mseed marr =
  let
      (arrOut, paramOut)  = mutableArray (ArrayR dim0 tp) "out"
      (arrIn,  paramIn)   = delayedArray "in" marr
      paramEnv            = envParam aenv
      --
      config              = launchConfig dev (CUDA.incWarp dev) smem multipleOf multipleOfQ
      smem n              = warps * (1 + per_warp) * bytes
        where
          ws        = CUDA.warpSize dev
          warps     = n `P.quot` ws
          per_warp  = ws + ws `P.quot` 2
          bytes     = bytesElt tp
  in
  makeOpenAccWith config "foldAllS" (paramOut ++ paramIn ++ paramEnv) $ do

    tid     <- threadIdx
    bd      <- blockDim

    sh      <- delayedExtent arrIn
    end     <- shapeSize dim1 sh

    -- We can assume that there is only a single thread block
    start'  <- return (liftInt32 0)
    end'    <- i32 end
    i0      <- A.add numType start' tid
    sz      <- A.sub numType end' start'
    when (A.lt singleType i0 sz) $ do

      -- Thread reads initial element and then participates in block-wide
      -- reduction.
      x0 <- app1 (delayedLinearIndex arrIn) =<< int i0
      r0 <- if (tp, A.eq singleType sz bd)
              then reduceBlockSMem dev tp combine Nothing   x0
              else reduceBlockSMem dev tp combine (Just sz) x0

      when (A.eq singleType tid (liftInt32 0)) $
        writeArray TypeInt32 arrOut tid =<<
          case mseed of
            Nothing -> return r0
            Just z  -> flip (app2 combine) r0 =<< z   -- Note: initial element on the left

    return_


-- Reduction of an entire array to a single element. This kernel implements step
-- one for reducing large arrays which must be processed by multiple thread
-- blocks.
--
mkFoldAllM1
    :: forall aenv e.
       DeviceProperties                                -- ^ properties of the target GPU
    -> Gamma aenv                                      -- ^ array environment
    -> TypeR e
    -> IRFun2 PTX aenv (e -> e -> e)                   -- ^ combination function
    -> MIRDelayed PTX aenv (Vector e)                  -- ^ input data
    -> CodeGen PTX (IROpenAcc PTX aenv (Scalar e))
mkFoldAllM1 dev aenv tp combine marr =
  let
      (arrTmp, paramTmp)  = mutableArray (ArrayR dim1 tp) "tmp"
      (arrIn,  paramIn)   = delayedArray "in" marr
      paramEnv            = envParam aenv
      start               = liftInt 0
      --
      config              = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||]
      smem n              = warps * (1 + per_warp) * bytes
        where
          ws        = CUDA.warpSize dev
          warps     = n `P.quot` ws
          per_warp  = ws + ws `P.quot` 2
          bytes     = bytesElt tp
  in
  makeOpenAccWith config "foldAllM1" (paramTmp ++ paramIn ++ paramEnv) $ do

    -- Each thread block cooperatively reduces a stripe of the input and stores
    -- that value into a temporary array at a corresponding index. Since the
    -- order of operations remains fixed, this method supports non-commutative
    -- reductions.
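    --
    -- For instance (illustrative numbers only): with a 256-thread block,
    -- segment 3 covers the linear indices [3*256, min sz (4*256)) = [768, 1024),
    -- one element per thread, and its partial result is written to "tmp" at
    -- index 3.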
    --
    tid   <- threadIdx
    bd    <- int =<< blockDim
    sz    <- indexHead <$> delayedExtent arrIn
    end   <- shapeSize dim1 (irArrayShape arrTmp)

    imapFromTo start end $ \seg -> do

      -- Wait for all threads to catch up before beginning the stripe
      __syncthreads

      -- Bounds of the input array we will reduce between
      from  <- A.mul numType seg  bd
      step  <- A.add numType from bd
      to    <- A.min singleType sz step

      -- Threads cooperatively reduce this stripe
      reduceFromTo dev tp from to combine
        (app1 (delayedLinearIndex arrIn))
        (when (A.eq singleType tid (liftInt32 0)) . writeArray TypeInt arrTmp seg)

    return_


-- Reduction of an array to a single element, (recursive) step 2 of multi-block
-- reduction algorithm.
--
mkFoldAllM2
    :: forall aenv e.
       DeviceProperties
    -> Gamma aenv
    -> TypeR e
    -> IRFun2 PTX aenv (e -> e -> e)
    -> MIRExp PTX aenv e
    -> CodeGen PTX (IROpenAcc PTX aenv (Scalar e))
mkFoldAllM2 dev aenv tp combine mseed =
  let
      (arrTmp, paramTmp)  = mutableArray (ArrayR dim1 tp) "tmp"
      (arrOut, paramOut)  = mutableArray (ArrayR dim1 tp) "out"
      paramEnv            = envParam aenv
      start               = liftInt 0
      --
      config              = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||]
      smem n              = warps * (1 + per_warp) * bytes
        where
          ws        = CUDA.warpSize dev
          warps     = n `P.quot` ws
          per_warp  = ws + ws `P.quot` 2
          bytes     = bytesElt tp
  in
  makeOpenAccWith config "foldAllM2" (paramTmp ++ paramOut ++ paramEnv) $ do

    -- Threads cooperatively reduce a stripe of the input (temporary) array
    -- output from the first phase, storing the results into another temporary.
    -- When only a single thread block remains, we have reached the final
    -- reduction step and add the initial element (for exclusive reductions).
    --
    tid   <- threadIdx
    gd    <- gridDim
    bd    <- int =<< blockDim
    sz    <- return $ indexHead (irArrayShape arrTmp)
    end   <- shapeSize dim1 (irArrayShape arrOut)

    imapFromTo start end $ \seg -> do

      -- Wait for all threads to catch up before beginning the stripe
      __syncthreads

      -- Bounds of the input we will reduce between
      from  <- A.mul numType seg  bd
      step  <- A.add numType from bd
      to    <- A.min singleType sz step

      -- Threads cooperatively reduce this stripe
      reduceFromTo dev tp from to combine (readArray TypeInt arrTmp) $ \r ->
        when (A.eq singleType tid (liftInt32 0)) $
          writeArray TypeInt arrOut seg =<<
            case mseed of
              Nothing -> return r
              Just z  -> if (tp, A.eq singleType gd (liftInt32 1))
                           then flip (app2 combine) r =<< z   -- Note: initial element on the left
                           else return r

    return_


-- Reduce an array of arbitrary rank along the innermost dimension only.
--
-- For simplicity, each element of the output (reduction along an
-- innermost-dimension index) is computed by a single thread block, meaning we
-- don't have to worry about inter-block synchronisation. A more balanced method
-- would be a segmented reduction (specialised, since the length of each segment
-- is known a priori).
--
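-- As a worked illustration (numbers assumed, not taken from the code): folding
-- a (1000, 300) array with 256-thread blocks produces 1000 output elements, one
-- per segment of 300 consecutive input values. Thread blocks grid-stride over
-- those segments; within a segment the block reduces the first 256 elements in
-- shared memory (Step 1 below) and then the remaining 44 in a bounds-checked
-- second stripe (Step 2 below).
--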
mkFoldDim
    :: forall aenv sh e.
       Gamma aenv                                      -- ^ array environment
    -> ArrayR (Array sh e)
    -> IRFun2 PTX aenv (e -> e -> e)                   -- ^ combination function
    -> MIRExp PTX aenv e                               -- ^ (optional) seed element, if this is an exclusive reduction
    -> MIRDelayed PTX aenv (Array (sh, Int) e)         -- ^ input data
    -> CodeGen PTX (IROpenAcc PTX aenv (Array sh e))
mkFoldDim aenv repr@(ArrayR shr tp) combine mseed marr = do
  dev <- liftCodeGen $ gets ptxDeviceProperties
  --
  let
      (arrOut, paramOut)  = mutableArray repr "out"
      (arrIn,  paramIn)   = delayedArray "in" marr
      paramEnv            = envParam aenv
      --
      config              = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||]
      smem n              = warps * (1 + per_warp) * bytes
        where
          ws        = CUDA.warpSize dev
          warps     = n `P.quot` ws
          per_warp  = ws + ws `P.quot` 2
          bytes     = bytesElt tp
  --
  makeOpenAccWith config "fold" (paramOut ++ paramIn ++ paramEnv) $ do

    -- If the innermost dimension is smaller than the number of threads in the
    -- block, those threads will never contribute to the output.
    tid   <- threadIdx
    sz    <- indexHead <$> delayedExtent arrIn
    sz'   <- i32 sz

    when (A.lt singleType tid sz') $ do

      start <- return (liftInt 0)
      end   <- shapeSize shr (irArrayShape arrOut)

      -- Thread blocks iterate over the outer dimensions, each thread block
      -- cooperatively reducing along each outermost index to a single value.
      --
      imapFromTo start end $ \seg -> do

        -- Wait for threads to catch up before starting this segment. We could
        -- also place this at the bottom of the loop, but placing it here allows
        -- threads to exit quickly on the last iteration.
        __syncthreads

        -- Step 1: initialise local sums
        from  <- A.mul numType seg  sz      -- first linear index this block will reduce
        to    <- A.add numType from sz      -- last linear index this block will reduce (exclusive)

        i0    <- A.add numType from =<< int tid
        x0    <- app1 (delayedLinearIndex arrIn) i0
        bd    <- blockDim
        r0    <- if (tp, A.gte singleType sz' bd)
                   then reduceBlockSMem dev tp combine Nothing    x0
                   else reduceBlockSMem dev tp combine (Just sz') x0

        -- Step 2: keep walking over the input
        bd'   <- int bd
        next  <- A.add numType from bd'
        r     <- iterFromStepTo tp next bd' to r0 $ \offset r -> do

                   -- Wait for all threads to catch up before starting the next stripe
                   __syncthreads

                   -- Threads cooperatively reduce this stripe of the input
                   i   <- A.add numType offset =<< int tid
                   v'  <- A.sub numType to offset
                   r'  <- if (tp, A.gte singleType v' bd')
                            -- All threads of the block are valid, so we can avoid
                            -- bounds checks.
                            then do
                              x <- app1 (delayedLinearIndex arrIn) i
                              y <- reduceBlockSMem dev tp combine Nothing x
                              return y

                            -- Otherwise, we require bounds checks when reading the input
                            -- and during the reduction. Note that even though only the
                            -- valid threads will contribute useful work in the
                            -- reduction, we must still have all threads enter the
                            -- reduction procedure to avoid synchronisation divergence.
                            else do
                              x <- if (tp, A.lt singleType i to)
                                     then app1 (delayedLinearIndex arrIn) i
                                     else let
                                              go :: TypeR a -> Operands a
                                              go TupRunit       = OP_Unit
                                              go (TupRpair a b) = OP_Pair (go a) (go b)
                                              go (TupRsingle t) = ir t (undef t)
                                          in
                                          return $ go tp
                              v <- i32 v'
                              y <- reduceBlockSMem dev tp combine (Just v) x
                              return y

                   if (tp, A.eq singleType tid (liftInt32 0))
                     then app2 combine r r'
                     else return r'

        -- Step 3: Thread 0 writes the aggregate reduction of this dimension to
        -- memory. If this is an exclusive fold, combine with the initial element.
        --
        when (A.eq singleType tid (liftInt32 0)) $
          writeArray TypeInt arrOut seg =<<
            case mseed of
              Nothing -> return r
              Just z  -> flip (app2 combine) r =<< z   -- Note: initial element on the left

    return_


-- Exclusive reductions over empty arrays (of any dimension) fill the lower
-- dimensions with the initial element.
--
mkFoldFill
    :: Gamma aenv
    -> ArrayR (Array sh e)
    -> IRExp PTX aenv e
    -> CodeGen PTX (IROpenAcc PTX aenv (Array sh e))
mkFoldFill aenv repr seed =
  mkGenerate aenv repr (IRFun1 (const seed))


-- Efficient threadblock-wide reduction using the specified operator. The
-- aggregate reduction value is stored in thread zero. Supports non-commutative
-- operators.
--
-- Requires dynamically allocated memory: (#warps * (1 + 1.5 * warp size)).
--
-- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/block/specializations/block_reduce_warp_reductions.cuh
--
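-- As a concrete (purely illustrative) instance of that requirement: with warp
-- size 32 and a 256-thread block, the 'smem' functions defined in the kernels
-- above request
--
-- >   warps    = 256 `quot` 32              =   8
-- >   per_warp = 32 + 32 `quot` 2           =  48
-- >   smem 256 = 8 * (1 + 48) * bytesElt tp = 392 * bytesElt tp
--
-- that is, 1568 bytes of dynamic shared memory for a reduction over Float.
--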
reduceBlockSMem
    :: forall aenv e.
       DeviceProperties                                -- ^ properties of the target device
    -> TypeR e
    -> IRFun2 PTX aenv (e -> e -> e)                   -- ^ combination function
    -> Maybe (Operands Int32)                          -- ^ number of valid elements (may be less than block size)
    -> Operands e                                      -- ^ calling thread's input element
    -> CodeGen PTX (Operands e)                        -- ^ thread-block-wide reduction using the specified operator (lane 0 only)
reduceBlockSMem dev tp combine size = warpReduce >=> warpAggregate
  where
    int32 :: Integral a => a -> Operands Int32
    int32 = liftInt32 . P.fromIntegral

    -- Temporary storage required for each warp
    bytes           = bytesElt tp
    warp_smem_elems = CUDA.warpSize dev + (CUDA.warpSize dev `P.quot` 2)

    -- Step 1: Reduction in every warp
    --
    warpReduce :: Operands e -> CodeGen PTX (Operands e)
    warpReduce input = do
      -- Allocate (1.5 * warpSize) elements of shared memory for each warp
      wid   <- warpId
      skip  <- A.mul numType wid (int32 (warp_smem_elems * bytes))
      smem  <- dynamicSharedMem tp TypeInt32 (int32 warp_smem_elems) skip

      -- Are we doing bounds checking for this warp?
      --
      case size of
        -- The entire thread block is valid, so skip bounds checks.
        Nothing ->
          reduceWarpSMem dev tp combine smem Nothing input

        -- Otherwise check how many elements are valid for this warp. If it is
        -- full then we can still skip bounds checks for it.
        Just n -> do
          offset <- A.mul numType wid (int32 (CUDA.warpSize dev))
          valid  <- A.sub numType n offset
          if (tp, A.gte singleType valid (int32 (CUDA.warpSize dev)))
            then reduceWarpSMem dev tp combine smem Nothing      input
            else reduceWarpSMem dev tp combine smem (Just valid) input

    -- Step 2: Aggregate per-warp reductions
    --
    warpAggregate :: Operands e -> CodeGen PTX (Operands e)
    warpAggregate input = do
      -- Allocate #warps elements of shared memory
      bd    <- blockDim
      warps <- A.quot integralType bd (int32 (CUDA.warpSize dev))
      skip  <- A.mul numType warps (int32 (warp_smem_elems * bytes))
      smem  <- dynamicSharedMem tp TypeInt32 warps skip

      -- Share the per-warp aggregates
      wid   <- warpId
      lane  <- laneId
      when (A.eq singleType lane (liftInt32 0)) $ do
        writeArray TypeInt32 smem wid input

      -- Wait for each warp to finish its local reduction
      __syncthreads

      -- Update the total aggregate. Thread 0 just does this sequentially (as is
      -- done in CUB), but we could also do this cooperatively (better for
      -- larger thread blocks?)
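      -- (For instance, with a 256-thread block and the usual warp size of 32,
      -- this amounts to thread 0 starting from its own warp's aggregate and
      -- folding in the values stored by the other 7 warps.)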
      tid   <- threadIdx
      if (tp, A.eq singleType tid (liftInt32 0))
        then do
          steps <- case size of
                     Nothing -> return warps
                     Just n  -> do
                       a <- A.add numType n (int32 (CUDA.warpSize dev - 1))
                       b <- A.quot integralType a (int32 (CUDA.warpSize dev))
                       return b
          iterFromStepTo tp (liftInt32 1) (liftInt32 1) steps input $ \step x ->
            app2 combine x =<< readArray TypeInt32 smem step
        else
          return input


-- Efficient warp-wide reduction using shared memory. The aggregate reduction
-- value for the warp is stored in thread lane zero.
--
-- Each warp requires 48 (1.5 x warp size) elements of shared memory. The
-- routine assumes that it is allocated individually per-warp (i.e. can be
-- indexed in the range [0,warp size)).
--
-- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/warp/specializations/warp_reduce_smem.cuh#L128
--
reduceWarpSMem
    :: forall aenv e.
       DeviceProperties                                -- ^ properties of the target device
    -> TypeR e
    -> IRFun2 PTX aenv (e -> e -> e)                   -- ^ combination function
    -> IRArray (Vector e)                              -- ^ temporary storage array in shared memory (1.5 warp size elements)
    -> Maybe (Operands Int32)                          -- ^ number of items that will be reduced by this warp, otherwise all lanes are valid
    -> Operands e                                      -- ^ calling thread's input element
    -> CodeGen PTX (Operands e)                        -- ^ warp-wide reduction using the specified operator (lane 0 only)
reduceWarpSMem dev tp combine smem size = reduce 0
  where
    log2 :: Double -> Double
    log2 = P.logBase 2

    -- Number of steps required to reduce the warp
    steps = P.floor . log2 . P.fromIntegral . CUDA.warpSize $ dev

    -- Return whether the index is valid. Assume that constant branches are
    -- optimised away.
    valid i =
      case size of
        Nothing -> return (liftBool True)
        Just n  -> A.lt singleType i n

    -- Unfold the reduction as a recursive code generation function.
    reduce :: Int -> Operands e -> CodeGen PTX (Operands e)
    reduce step x
      | step >= steps = return x
      | otherwise     = do
          let offset = liftInt32 (1 `P.shiftL` step)

          -- share input through buffer
          lane <- laneId
          writeArray TypeInt32 smem lane x
          __syncwarp

          -- update input if in range
          i   <- A.add numType lane offset
          x'  <- if (tp, valid i)
                   then app2 combine x =<< readArray TypeInt32 smem i
                   else return x
          __syncwarp

          reduce (step+1) x'
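
-- For the default warp size of 32, the recursion above unrolls into
-- floor (log2 32) = 5 sharing steps with offsets 1, 2, 4, 8 and 16; after the
-- final step, lane 0 holds the combination of all 32 lanes (or of just the
-- valid lanes when bounds checking is enabled).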

-- Efficient warp reduction using __shfl_up instruction (compute >= 3.0)
--
-- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/warp/specializations/warp_reduce_shfl.cuh#L310
--
-- reduceWarpShfl
--     :: IRFun2 PTX aenv (e -> e -> e)                -- ^ combination function
--     -> Operands e                                   -- ^ this thread's input value
--     -> CodeGen (Operands e)                         -- ^ final result
-- reduceWarpShfl combine input =
--   error "TODO: PTX.reduceWarpShfl"


-- Reduction loops
-- ---------------

reduceFromTo
    :: DeviceProperties
    -> TypeR a
    -> Operands Int                                    -- ^ starting index
    -> Operands Int                                    -- ^ final index (exclusive)
    -> (IRFun2 PTX aenv (a -> a -> a))                 -- ^ combination function
    -> (Operands Int -> CodeGen PTX (Operands a))      -- ^ function to retrieve element at index
    -> (Operands a -> CodeGen PTX ())                  -- ^ what to do with the value
    -> CodeGen PTX ()
reduceFromTo dev tp from to combine get set = do

  tid   <- int =<< threadIdx
  bd    <- int =<< blockDim

  valid <- A.sub numType to from
  i     <- A.add numType from tid

  _     <- if (TupRunit, A.gte singleType valid bd)
             then do
               -- All threads in the block will participate in the reduction, so
               -- we can avoid bounds checks
               x <- get i
               r <- reduceBlockSMem dev tp combine Nothing x
               set r

               return (lift TupRunit ())
             else do
               -- Only in-bounds threads can read their input and participate in
               -- the reduction
               when (A.lt singleType i to) $ do
                 x <- get i
                 v <- i32 valid
                 r <- reduceBlockSMem dev tp combine (Just v) x
                 set r

               return (lift TupRunit ())

  return ()


-- Utilities
-- ---------

i32 :: Operands Int -> CodeGen PTX (Operands Int32)
i32 = A.fromIntegral integralType numType

int :: Operands Int32 -> CodeGen PTX (Operands Int)
int = A.fromIntegral integralType numType

imapFromTo
    :: Operands Int
    -> Operands Int
    -> (Operands Int -> CodeGen PTX ())
    -> CodeGen PTX ()
imapFromTo start end body = do
  bid <- int =<< blockIdx
  gd  <- int =<< gridDim
  i0  <- A.add numType start bid
  imapFromStepTo i0 gd end body
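
-- For example (illustrative): with a grid of four thread blocks, 'imapFromTo
-- start end body' above has block 2 apply 'body' to indices start+2, start+6,
-- start+10, ... up to (but excluding) 'end', while the other blocks cover the
-- remaining residues, so the grid as a whole tiles [start, end) with a
-- block-level stride of 'gridDim'.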