module Data.Repa.Eval.Generic.Seq.Cursored
        (fillCursoredBlock2)
where
import Data.Repa.Eval.Elt
import GHC.Exts


-- | Fill a block in a rank-2 array, sequentially.
--
--   * Blockwise filling can be more cache-efficient than linear filling for
--     rank-2 arrays.
--
--   * Using cursor functions can help to expose inter-element indexing
--     computations to the GHC and LLVM optimisers.
--
--   * Coordinates given are of the filled edges of the block.
--
--   * The block is filled in row major order from top to bottom.
-- 
--   * We need the `Elt` constraint so that we can use its `touch` function
--     to provide an order of evaluation ammenable to the LLVM optimiser.
--     You should compile your Haskell program with @-fllvm -optlo-O3@ to
--     enable LLVM's Global Value Numbering optimisation.
--
fillCursoredBlock2
        :: Elt a
        => (Int# -> a -> IO ())
                -- ^ Update function to write into result buffer.
        -> (Int# -> Int# -> cursor)
                -- ^ Make a cursor to a particular element from an (x, y) index.
        -> (Int# -> Int# -> cursor -> cursor) 
                -- ^ Shift the cursor by an (x, y) offset.
        -> (cursor -> a) -- ^ Function to evaluate an element at the given index.
        -> Int#          -- ^ Width of the whole array.
        -> Int#          -- ^ x0 lower left corner of block to fill.
        -> Int#          -- ^ y0
        -> Int#          -- ^ w0 width of block to fill
        -> Int#          -- ^ h0 height of block to fill
        -> IO ()

fillCursoredBlock2
        write
        makeCursor shiftCursor getElem
        !imageWidth !x0 !y0 !w0 h0

 = do   fillBlock y0
 where  !x1     = x0 +# w0
        !y1     = y0 +# h0

        fillBlock !y
         | 1# <- y >=# y1 = return ()
         | otherwise
         = do   fillLine4 x0
                fillBlock (y +# 1#)

         where  fillLine4 !x
                 | 1# <- x +# 4# >=# x1 = fillLine1 x
                 | otherwise
                 = do   -- Compute each source cursor based on the previous one
                        -- so that the variable live ranges in the generated
                        -- code are shorter.
                        let srcCur0     = makeCursor  x  y 
                        let srcCur1     = shiftCursor 1# 0# srcCur0
                        let srcCur2     = shiftCursor 1# 0# srcCur1
                        let srcCur3     = shiftCursor 1# 0# srcCur2

                        -- Get the result value for each cursor.
                        let val0        = getElem srcCur0
                        let val1        = getElem srcCur1
                        let val2        = getElem srcCur2
                        let val3        = getElem srcCur3

                        -- Ensure that we've computed each of the result values
                        -- before we write into the array. If the backend code
                        -- generator can't tell our destination array doesn't
                        -- alias with the source then writing to it can prevent
                        -- the sharing of intermediate computations.
                        touch val0
                        touch val1
                        touch val2
                        touch val3

                        -- Compute row-major index into destination array.
                        let !dstCur0    = x +# (y *# imageWidth)
                        write  dstCur0        val0
                        write (dstCur0 +# 1#) val1
                        write (dstCur0 +# 2#) val2
                        write (dstCur0 +# 3#) val3
                        fillLine4 (x +# 4#)
                {-# INLINE fillLine4 #-}
                
                fillLine1 !x
                 | 1# <- x >=# x1 = return ()
                 | otherwise
                 = do   let val0  = getElem $ makeCursor x y
                        write (x +# (y *# imageWidth)) val0
                        fillLine1 (x +# 1#)
                {-# INLINE fillLine1 #-}
        {-# INLINE fillBlock #-}
{-# INLINE [0] fillCursoredBlock2 #-}