module Data.Repa.Eval.Generic.Par.Cursored
  ( fillBlock2
  , fillCursoredBlock2
  ) where

import Data.Repa.Eval.Elt
import Data.Repa.Eval.Gang
import qualified Data.Repa.Eval.Generic.Seq.Cursored as Seq
import GHC.Exts


-- Non-cursored interface -----------------------------------------------------

-- | Fill a block in a rank-2 array in parallel.
--
--   * Blockwise filling can be more cache-efficient than linear filling for
--     rank-2 arrays.
--
--   * Coordinates given are of the filled edges of the block.
--
--   * We divide the block into columns, and give one column to each thread.
--
--   * Each column is filled in row major order from top to bottom.
--
--   This is a thin wrapper over 'fillCursoredBlock2' in which the cursor is
--   simply the (x, y) position of the element.
--
fillBlock2
  :: Elt a
  => Gang                  -- ^ Gang to run the operation on.
  -> (Int# -> a -> IO ())  -- ^ Update function to write into result buffer.
  -> (Int# -> Int# -> a)   -- ^ Function to evaluate the element at an (x, y) index.
  -> Int#                  -- ^ Width of the whole array.
  -> Int#                  -- ^ x0 lower left corner of block to fill
  -> Int#                  -- ^ y0
  -> Int#                  -- ^ w0 width of block to fill.
  -> Int#                  -- ^ h0 height of block to fill.
  -> IO ()
fillBlock2 gang write getElem !imageWidth !x0 !y0 !w0 h0
  = fillCursoredBlock2
      gang write
      newCursor moveCursor readCursor
      imageWidth x0 y0 w0 h0
  where
    -- The cursor for a position is the position itself.
    newCursor col row = DIM2 col row
    {-# INLINE newCursor #-}

    -- Shifting a cursor adds the offset to each coordinate.
    moveCursor dx dy (DIM2 col row) = DIM2 (col +# dx) (row +# dy)
    {-# INLINE moveCursor #-}

    -- Reading a cursor evaluates the element function at its coordinates.
    readCursor (DIM2 col row) = getElem col row
    {-# INLINE readCursor #-}
{-# INLINE [0] fillBlock2 #-}


-- | Pair of unboxed coordinates used as the cursor type by 'fillBlock2'.
data DIM2 = DIM2 Int# Int#


-- Block filling --------------------------------------------------------------

-- | Fill a block in a rank-2 array in parallel.
--
--   * Blockwise filling can be more cache-efficient than linear filling for
--     rank-2 arrays.
--
--   * Using cursor functions can help to expose inter-element indexing
--     computations to the GHC and LLVM optimisers.
--
--   * Coordinates given are of the filled edges of the block.
--
--   * We divide the block into columns, and give one column to each thread.
--
--   * We need the `Elt` constraint so that we can use its `touch` function
--     to provide an order of evaluation amenable to the LLVM optimiser.
--     You should compile your Haskell program with @-fllvm -optlo-O3@ to
--     enable LLVM's Global Value Numbering optimisation.
--
--   * When the block width is not a multiple of the gang size, the leftmost
--     columns are each one pixel wider than the rest, so that the columns
--     together cover exactly @w0@ pixels.
--
fillCursoredBlock2
  :: Elt a
  => Gang                                -- ^ Gang to run the operation on.
  -> (Int# -> a -> IO ())                -- ^ Update function to write into result buffer.
  -> (Int# -> Int# -> cursor)            -- ^ Make a cursor from an (x, y) index.
  -> (Int# -> Int# -> cursor -> cursor)  -- ^ Shift the cursor by an (x, y) offset.
  -> (cursor -> a)                       -- ^ Function to evaluate the element at a cursor.
  -> Int#                                -- ^ Width of the whole array.
  -> Int#                                -- ^ x0 lower left corner of block to fill
  -> Int#                                -- ^ y0
  -> Int#                                -- ^ w0 width of block to fill
  -> Int#                                -- ^ h0 height of block to fill
  -> IO ()
fillCursoredBlock2
    gang write
    makeCursorFCB shiftCursorFCB getElemFCB
    !imageWidth !x0 !y0 !w0 !h0
  = gangIO gang fillColumn
  where
    -- Number of columns the block is cut into: one per gang member.
    !columns   = gangSize gang

    -- Width that every column is guaranteed to have.
    !baseWidth = w0 `quotInt#` columns

    -- Left-over pixels; the first 'spareCols' columns take one extra each.
    !spareCols = w0 `remInt#` columns

    -- X position in the image at which the given column begins.
    -- Columns before 'spareCols' are all (baseWidth + 1) wide; later columns
    -- are 'baseWidth' wide and sit after all of the spare pixels.
    columnStart !col
      = case col <# spareCols of
          1# -> x0 +# (col *# (baseWidth +# 1#))
          _  -> x0 +# (col *# baseWidth) +# spareCols
    {-# INLINE columnStart #-}

    -- Work for a single gang member: sequentially fill the column with the
    -- given index. Every column spans the full height of the block.
    fillColumn :: Int# -> IO ()
    fillColumn !col
      = Seq.fillCursoredBlock2
          write
          makeCursorFCB shiftCursorFCB getElemFCB
          imageWidth left width y0 h0
      where
        !left  = columnStart col
        -- The column ends where the next one begins.
        !width = columnStart (col +# 1#) -# left
    {-# INLINE fillColumn #-}
{-# INLINE [0] fillCursoredBlock2 #-}