-- | Evaluate a vector by breaking it up into linear chunks and filling each chunk
--   in parallel.
{-# LANGUAGE BangPatterns #-}
module Data.Array.Repa.Internals.EvalChunked
	( fillChunkedS
	, fillChunkedP)
where
import Data.Array.Repa.Internals.Gang
import GHC.Base					(remInt, quotInt)
import Prelude					as P


-- | Fill something sequentially.
fillChunkedS
	:: Int                  -- ^ Number of elements
	-> (Int -> a -> IO ())	-- ^ Update function to write into result buffer
	-> (Int -> a)	        -- ^ Fn to get the value at a given index.
	-> IO ()

{-# INLINE [0] fillChunkedS #-}
fillChunkedS !len !write !getElem
 = fill 0
 where	fill !ix
	 | ix >= len	= return ()
	 | otherwise
	 = do	write ix (getElem ix)
		fill (ix + 1)


-- | Fill something in parallel.
fillChunkedP
        :: Int                  -- ^ Number of elements
	-> (Int -> a -> IO ())	-- ^ Update function to write into result buffer
	-> (Int -> a)	        -- ^ Fn to get the value at a given index.
	-> IO ()

{-# INLINE [0] fillChunkedP #-}
fillChunkedP !len !write !getElem
 = 	gangIO theGang
	 $  \thread -> fill (splitIx thread) (splitIx (thread + 1))

 where
	-- Decide now to split the work across the threads.
	-- If the length of the vector doesn't divide evenly among the threads,
	-- then the first few get an extra element.
	!threads 	= gangSize theGang
	!chunkLen 	= len `quotInt` threads
	!chunkLeftover	= len `remInt`  threads

	{-# INLINE splitIx #-}
	splitIx thread
	 | thread < chunkLeftover = thread * (chunkLen + 1)
	 | otherwise		  = thread * chunkLen  + chunkLeftover

	-- Evaluate the elements of a single chunk.
	{-# INLINE fill #-}
	fill !ix !end
	 | ix >= end		= return ()
	 | otherwise
	 = do	write ix (getElem ix)
		fill (ix + 1) end