module Data.Repa.Eval.Generic.Par.Cursored
        ( fillBlock2
        , fillCursoredBlock2)
where
import Data.Repa.Eval.Elt
import Data.Repa.Eval.Gang
import qualified Data.Repa.Eval.Generic.Seq.Cursored      as Seq
import GHC.Exts


-- Non-cursored interface -----------------------------------------------------
-- | Fill a block in a rank-2 array in parallel.
--
--   * Blockwise filling can be more cache-efficient than linear filling for
--     rank-2 arrays.
--
--   * Coordinates given are of the filled edges of the block.
-- 
--   * We divide the block into columns, and give one column to each thread.
-- 
--   * Each column is filled in row major order from top to bottom.
--
fillBlock2 
        :: Elt a
        => Gang
        -> (Int# -> a -> IO ()) 
                        -- ^ Update function to write into result buffer.
        -> (Int# -> Int# -> a)  
                        -- ^ Function to evaluate the element at an (x, y) index.
        -> Int#         -- ^ Width of the whole array.
        -> Int#         -- ^ x0 lower left corner of block to fill
        -> Int#         -- ^ y0 
        -> Int#         -- ^ w0 width of block to fill.
        -> Int#         -- ^ h0 height of block to fill.
        -> IO ()

fillBlock2 :: forall a.
Elt a =>
Gang
-> (Int# -> a -> IO ())
-> (Int# -> Int# -> a)
-> Int#
-> Int#
-> Int#
-> Int#
-> Int#
-> IO ()
fillBlock2 Gang
gang Int# -> a -> IO ()
write Int# -> Int# -> a
getElem !Int#
imageWidth !Int#
x0 !Int#
y0 !Int#
w0 Int#
h0
 = Gang
-> (Int# -> a -> IO ())
-> (Int# -> Int# -> DIM2)
-> (Int# -> Int# -> DIM2 -> DIM2)
-> (DIM2 -> a)
-> Int#
-> Int#
-> Int#
-> Int#
-> Int#
-> IO ()
forall a cursor.
Elt a =>
Gang
-> (Int# -> a -> IO ())
-> (Int# -> Int# -> cursor)
-> (Int# -> Int# -> cursor -> cursor)
-> (cursor -> a)
-> Int#
-> Int#
-> Int#
-> Int#
-> Int#
-> IO ()
fillCursoredBlock2
        Gang
gang Int# -> a -> IO ()
write
        Int# -> Int# -> DIM2
makeCursor Int# -> Int# -> DIM2 -> DIM2
shiftCursor DIM2 -> a
loadCursor
        Int#
imageWidth Int#
x0 Int#
y0 Int#
w0 Int#
h0

 where  makeCursor :: Int# -> Int# -> DIM2
makeCursor Int#
x Int#
y
                = Int# -> Int# -> DIM2
DIM2 Int#
x Int#
y
        {-# INLINE makeCursor #-}

        shiftCursor :: Int# -> Int# -> DIM2 -> DIM2
shiftCursor Int#
x' Int#
y' (DIM2 Int#
x Int#
y) 
                = Int# -> Int# -> DIM2
DIM2 (Int#
x Int# -> Int# -> Int#
+# Int#
x') (Int#
y Int# -> Int# -> Int#
+# Int#
y')
        {-# INLINE shiftCursor #-}

        loadCursor :: DIM2 -> a
loadCursor (DIM2 Int#
x Int#
y)
                = Int# -> Int# -> a
getElem Int#
x Int#
y
        {-# INLINE loadCursor #-}

{-# INLINE [0] fillBlock2 #-}

data DIM2 
        = DIM2 Int# Int#


-- Block filling --------------------------------------------------------------
-- | Fill a block in a rank-2 array in parallel.
-- 
--   * Blockwise filling can be more cache-efficient than linear filling for
--     rank-2 arrays.
--
--   * Using cursor functions can help to expose inter-element indexing
--     computations to the GHC and LLVM optimisers.
--
--   * Coordinates given are of the filled edges of the block.
--
--   * We divide the block into columns, and give one column to each thread.
-- 
--   * We need the `Elt` constraint so that we can use its `touch` function
--     to provide an order of evaluation ammenable to the LLVM optimiser.
--     You should compile your Haskell program with @-fllvm -optlo-O3@ to
--     enable LLVM's Global Value Numbering optimisation.
--
fillCursoredBlock2
        :: Elt a
        => Gang -- ^ Gang to run the operation on.
        -> (Int# -> a -> IO ())          
                -- ^ Update function to write into result buffer.
        -> (Int# -> Int# -> cursor)           
                -- ^ Make a cursor from an (x, y) index.
        -> (Int# -> Int# -> cursor -> cursor) 
                -- ^ Shift the cursor by an (x, y) offset.
        -> (cursor -> a) -- ^ Function to evaluate the element at an index.
        -> Int#          -- ^ Width of the whole array.
        -> Int#          -- ^ x0 lower left corner of block to fill
        -> Int#          -- ^ y0
        -> Int#          -- ^ w0 width of block to fill
        -> Int#          -- ^ h0 height of block to fill
        -> IO ()

fillCursoredBlock2 :: forall a cursor.
Elt a =>
Gang
-> (Int# -> a -> IO ())
-> (Int# -> Int# -> cursor)
-> (Int# -> Int# -> cursor -> cursor)
-> (cursor -> a)
-> Int#
-> Int#
-> Int#
-> Int#
-> Int#
-> IO ()
fillCursoredBlock2
        Gang
gang Int# -> a -> IO ()
write
        Int# -> Int# -> cursor
makeCursorFCB Int# -> Int# -> cursor -> cursor
shiftCursorFCB cursor -> a
getElemFCB
        !Int#
imageWidth !Int#
x0 !Int#
y0 !Int#
w0 !Int#
h0
 =      Gang -> (Int# -> IO ()) -> IO ()
gangIO Gang
gang Int# -> IO ()
fillBlock
 where  
        !threads :: Int#
threads        = Gang -> Int#
gangSize Gang
gang

        -- All columns have at least this many pixels.
        !colChunkLen :: Int#
colChunkLen   = Int#
w0 Int# -> Int# -> Int#
`quotInt#` Int#
threads

        -- Extra pixels that we have to divide between some of the threads.
        !colChunkSlack :: Int#
colChunkSlack = Int#
w0 Int# -> Int# -> Int#
`remInt#` Int#
threads

        -- Get the starting pixel of a column in the image.
        colIx :: Int# -> Int#
colIx !Int#
ix
         | Int#
1# <- Int#
ix Int# -> Int# -> Int#
<# Int#
colChunkSlack = Int#
x0 Int# -> Int# -> Int#
+# (Int#
ix Int# -> Int# -> Int#
*# (Int#
colChunkLen Int# -> Int# -> Int#
+# Int#
1#))
         | Bool
otherwise                 = Int#
x0 Int# -> Int# -> Int#
+# (Int#
ix Int# -> Int# -> Int#
*# Int#
colChunkLen) Int# -> Int# -> Int#
+# Int#
colChunkSlack
        {-# INLINE colIx #-}

        -- Give one column to each thread
        fillBlock :: Int# -> IO ()
        fillBlock :: Int# -> IO ()
fillBlock !Int#
ix
         = let  !x0' :: Int#
x0'      = Int# -> Int#
colIx Int#
ix
                !w0' :: Int#
w0'      = Int# -> Int#
colIx (Int#
ix Int# -> Int# -> Int#
+# Int#
1#) Int# -> Int# -> Int#
-# Int#
x0'
                !y0' :: Int#
y0'      = Int#
y0
                !h0' :: Int#
h0'      = Int#
h0
           in   (Int# -> a -> IO ())
-> (Int# -> Int# -> cursor)
-> (Int# -> Int# -> cursor -> cursor)
-> (cursor -> a)
-> Int#
-> Int#
-> Int#
-> Int#
-> Int#
-> IO ()
forall a cursor.
Elt a =>
(Int# -> a -> IO ())
-> (Int# -> Int# -> cursor)
-> (Int# -> Int# -> cursor -> cursor)
-> (cursor -> a)
-> Int#
-> Int#
-> Int#
-> Int#
-> Int#
-> IO ()
Seq.fillCursoredBlock2
                        Int# -> a -> IO ()
write
                        Int# -> Int# -> cursor
makeCursorFCB Int# -> Int# -> cursor -> cursor
shiftCursorFCB cursor -> a
getElemFCB
                        Int#
imageWidth Int#
x0' Int#
y0' Int#
w0' Int#
h0'
        {-# INLINE fillBlock #-}

{-# INLINE [0] fillCursoredBlock2 #-}