{-# LANGUAGE MagicHash #-}
-- | Evaluate an array by dividing it into rectangular blocks and filling
--   each block in parallel.
module Data.Array.Repa.Eval.Cursored
	( fillBlock2P
	, fillBlock2S
	, fillCursoredBlock2P
	, fillCursoredBlock2S )
where
import Data.Array.Repa.Index
import Data.Array.Repa.Shape
import Data.Array.Repa.Eval.Elt
import Data.Array.Repa.Eval.Gang
import GHC.Base
import Prelude                          as P

-- Non-cursored interface -----------------------------------------------------
-- | Fill a block in a rank-2 array in parallel.
--
--   * Blockwise filling can be more cache-efficient than linear filling for
--    rank-2 arrays.
--
--   * Coordinates given are of the filled edges of the block.
-- 
--   * We divide the block into columns, and give one column to each thread.
-- 
--   * Each column is filled in row major order from top to bottom.
--
fillBlock2P 
        :: Elt a
	=> (Int -> a -> IO ())	-- ^ Update function to write into result buffer.
        -> (DIM2 -> a)          -- ^ Function to evaluate the element at an index.
	-> Int#			-- ^ Width of the whole array.
	-> Int#			-- ^ x0 lower left corner of block to fill
	-> Int#			-- ^ y0 
	-> Int#			-- ^ x1 upper right corner of block to fill
	-> Int#			-- ^ y1
        -> IO ()

{-# INLINE [0] fillBlock2P #-}
fillBlock2P !write !getElem !imageWidth !x0 !y0 !x1 !y1
 = fillCursoredBlock2P 
        write id addDim getElem 
        imageWidth x0 y0 x1 y1


-- | Fill a block in a rank-2 array sequentially.
--
--   * Blockwise filling can be more cache-efficient than linear filling for
--    rank-2 arrays.
--
--   * Coordinates given are of the filled edges of the block.
-- 
--   * The block is filled in row major order from top to bottom.
--
fillBlock2S
        :: Elt a
	=> (Int -> a -> IO ())	-- ^ Update function to write into result buffer.
        -> (DIM2 -> a)          -- ^ Function to evaluate the element at an index.
	-> Int#			-- ^ Width of the whole array.
	-> Int#			-- ^ x0 lower left corner of block to fill
	-> Int#			-- ^ y0
	-> Int#			-- ^ x1 upper right corner of block to fill
	-> Int#			-- ^ y1
        -> IO ()

{-# INLINE [0] fillBlock2S #-}
fillBlock2S !write !getElem imageWidth x0 y0 x1 y1
 = fillCursoredBlock2S
        write id addDim getElem 
        imageWidth x0 y0 x1 y1


-- Block filling ----------------------------------------------------------------------------------
-- | Fill a block in a rank-2 array in parallel.
-- 
--   * Blockwise filling can be more cache-efficient than linear filling for rank-2 arrays.
--
--   * Using cursor functions can help to expose inter-element indexing computations to
--     the GHC and LLVM optimisers.
--
--   * Coordinates given are of the filled edges of the block.
-- 
--   * We divide the block into columns, and give one column to each thread.
-- 
--   * Each column is filled in row major order from top to bottom.
--
fillCursoredBlock2P
	:: Elt a
	=> (Int -> a -> IO ())		-- ^ Update function to write into result buffer.
	-> (DIM2   -> cursor)		-- ^ Make a cursor to a particular element.
	-> (DIM2   -> cursor -> cursor)	-- ^ Shift the cursor by an offset.
	-> (cursor -> a)		-- ^ Function to evaluate the element at an index.
	-> Int#			-- ^ Width of the whole array.
	-> Int#			-- ^ x0 lower left corner of block to fill
	-> Int#			-- ^ y0
	-> Int#			-- ^ x1 upper right corner of block to fill
	-> Int#			-- ^ y1
	-> IO ()

{-# INLINE [0] fillCursoredBlock2P #-}
fillCursoredBlock2P
	!write
	!makeCursorFCB !shiftCursorFCB !getElemFCB
	!imageWidth !x0 !y0 !x1 !y1
 = 	gangIO theGang fillBlock
 where	!(I# threads)  = gangSize theGang
	!blockWidth    = x1 -# x0 +# 1#

	-- All columns have at least this many pixels.
	!colChunkLen   = I# (blockWidth `quotInt#` threads)

	-- Extra pixels that we have to divide between some of the threads.
	!colChunkSlack = I# (blockWidth `remInt#` threads)

	-- Get the starting pixel of a column in the image.
	{-# INLINE colIx #-}
	colIx !ix
	 | ix < colChunkSlack = (I# x0) +  ix * (colChunkLen  + 1)
	 | otherwise	      = (I# x0) + (ix * colChunkLen) + colChunkSlack

	-- Give one column to each thread
	{-# INLINE fillBlock #-}
	fillBlock :: Int -> IO ()
	fillBlock !ix
	 = let	!(I# x0')	= colIx ix
		!(I# x1')	= colIx (ix + 1) - 1
		!y0'	  = y0
		!y1'	  = y1
	   in	fillCursoredBlock2S
			write
			makeCursorFCB shiftCursorFCB getElemFCB
			imageWidth x0' y0' x1' y1'


-- | Fill a block in a rank-2 array, sequentially.
--
--   * Blockwise filling can be more cache-efficient than linear filling for rank-2 arrays.
--
--   * Using cursor functions can help to expose inter-element indexing computations to
--     the GHC and LLVM optimisers.
--
--   * Coordinates given are of the filled edges of the block.
--
--   * The block is filled in row major order from top to bottom.
--
fillCursoredBlock2S
	:: Elt a
	=> (Int -> a -> IO ())		-- ^ Update function to write into result buffer.
	-> (DIM2   -> cursor)		-- ^ Make a cursor to a particular element.
	-> (DIM2   -> cursor -> cursor)	-- ^ Shift the cursor by an offset.
	-> (cursor -> a)		-- ^ Function to evaluate an element at the given index.
	-> Int#				-- ^ Width of the whole array.
	-> Int#				-- ^ x0 lower left corner of block to fill.
	-> Int#				-- ^ y0
	-> Int#				-- ^ x1 upper right corner of block to fill.
	-> Int#				-- ^ y1
	-> IO ()

{-# INLINE [0] fillCursoredBlock2S #-}
fillCursoredBlock2S
	!write
	!makeCursor !shiftCursor !getElem
	!imageWidth !x0 !y0 !x1 !y1

 = fillBlock y0

 where	{-# INLINE fillBlock #-}
	fillBlock !y
	 | y ># y1	= return ()
	 | otherwise
	 = do	fillLine4 x0
		fillBlock (y +# 1#)

	 where	{-# INLINE fillLine4 #-}
		fillLine4 !x
 	   	 | x +# 4# ># x1 	= fillLine1 x
	   	 | otherwise
	   	 = do	-- Compute each source cursor based on the previous one so that
			-- the variable live ranges in the generated code are shorter.
			let srcCur0	= makeCursor  (Z :. (I# y) :. (I# x))
			let srcCur1	= shiftCursor (Z :. 0 :. 1) srcCur0
			let srcCur2	= shiftCursor (Z :. 0 :. 1) srcCur1
			let srcCur3	= shiftCursor (Z :. 0 :. 1) srcCur2

			-- Get the result value for each cursor.
			let val0	= getElem srcCur0
			let val1	= getElem srcCur1
			let val2	= getElem srcCur2
			let val3	= getElem srcCur3

			-- Ensure that we've computed each of the result values before we
			-- write into the array. If the backend code generator can't tell
			-- our destination array doesn't alias with the source then writing
			-- to it can prevent the sharing of intermediate computations.
			touch val0
			touch val1
			touch val2
			touch val3

			-- Compute cursor into destination array.
			let !dstCur0	= x +# (y *# imageWidth)
			write (I# dstCur0)         val0
			write (I# (dstCur0 +# 1#)) val1
			write (I# (dstCur0 +# 2#)) val2
			write (I# (dstCur0 +# 3#)) val3
			fillLine4 (x +# 4#)

		{-# INLINE fillLine1 #-}
		fillLine1 !x
 	   	 | x ># x1		= return ()
	   	 | otherwise
	   	 = do	write (I# (x +# (y *# imageWidth)))
                              (getElem $ makeCursor (Z :. (I# y) :. (I# x)))
			fillLine1 (x +# 1#)