{-# LANGUAGE GADTs #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE ScopedTypeVariables #-} -- | -- Module : Data.Array.Accelerate.CUDA.CodeGen.IndexSpace -- Copyright : [2008..2010] Manuel M T Chakravarty, Gabriele Keller, Sean Lee -- [2009..2012] Manuel M T Chakravarty, Gabriele Keller, Trevor L. McDonell -- License : BSD3 -- -- Maintainer : Trevor L. McDonell -- Stability : experimental -- Portability : non-portable (GHC extensions) -- module Data.Array.Accelerate.CUDA.CodeGen.IndexSpace ( -- Array construction mkGenerate, -- Permutations mkTransform, mkPermute, ) where import Data.List import Language.C.Quote.CUDA import Foreign.CUDA.Analysis.Device import qualified Language.C.Syntax as C import Data.Array.Accelerate.Array.Sugar ( Array, Shape, Elt ) import Data.Array.Accelerate.Analysis.Shape import Data.Array.Accelerate.CUDA.AST ( Gamma, Exp ) import Data.Array.Accelerate.CUDA.CodeGen.Type import Data.Array.Accelerate.CUDA.CodeGen.Base -- Construct a new array by applying a function to each index. Each thread -- processes multiple elements, striding the array by the grid size. -- -- generate :: (Shape ix, Elt e) -- => Exp ix -- dimension of the result -- -> (Exp ix -> Exp a) -- function to apply at each index -- -> Acc (Array ix a) -- mkGenerate :: forall aenv sh e. (Shape sh, Elt e) => DeviceProperties -> Gamma aenv -> CUFun1 aenv (sh -> e) -> [CUTranslSkel aenv (Array sh e)] mkGenerate dev aenv (CUFun1 _ f) = return $ CUTranslSkel "generate" [cunit| $esc:("#include ") $edecl:(cdim "DimOut" dim) $edecls:texIn extern "C" __global__ void generate ( $params:argIn, $params:argOut ) { const int shapeSize = size(shOut); const int gridSize = $exp:(gridSize dev); int ix; for ( ix = $exp:(threadIdx dev) ; ix < shapeSize ; ix += gridSize ) { const typename DimOut sh = fromIndex(shOut, ix); $items:(setOut "ix" .=. f sh) } } |] where dim = expDim (undefined :: Exp aenv sh) sh = cshape dim (cvar "sh") (texIn, argIn) = environment dev aenv (argOut, setOut) = setters "Out" (undefined :: Array sh e) -- A combination map/backpermute, where the index and value transformations have -- been separated. -- -- transform :: (Elt a, Elt b, Shape sh, Shape sh') -- => PreExp acc aenv sh' -- dimension of the result -- -> PreFun acc aenv (sh' -> sh) -- index permutation function -- -> PreFun acc aenv (a -> b) -- function to apply at each element -- -> acc aenv (Array sh a) -- source array -- -> PreOpenAcc acc aenv (Array sh' b) -- mkTransform :: forall aenv sh sh' a b. (Shape sh, Shape sh', Elt a, Elt b) => DeviceProperties -> Gamma aenv -> CUFun1 aenv (sh' -> sh) -> CUFun1 aenv (a -> b) -> CUDelayedAcc aenv sh a -> [CUTranslSkel aenv (Array sh' b)] mkTransform dev aenv perm fun arr | CUFun1 _ p <- perm , CUFun1 dce f <- fun , CUDelayed _ (CUFun1 _ get) _ <- arr = return $ CUTranslSkel "transform" [cunit| $esc:("#include ") $edecl:(cdim "DimOut" dimOut) $edecl:(cdim "DimIn" dimIn) $edecls:texIn extern "C" __global__ void transform ( $params:argIn, $params:argOut ) { const int shapeSize = size(shOut); const int gridSize = $exp:(gridSize dev); int ix; for ( ix = $exp:(threadIdx dev) ; ix < shapeSize ; ix += gridSize ) { const typename DimOut sh_ = fromIndex(shOut, ix); $items:(sh .=. p sh_) $items:(dce x0 .=. get sh) $items:(setOut "ix" .=. f x0) } } |] where dimIn = expDim (undefined :: Exp aenv sh) dimOut = expDim (undefined :: Exp aenv sh') sh_ = cshape dimOut (cvar "sh_") (texIn, argIn) = environment dev aenv (argOut, setOut) = setters "Out" (undefined :: Array sh' b) (x0, _, _) = locals "x" (undefined :: a) (sh, _, _) = locals "sh" (undefined :: sh) -- Forward permutation specified by an index mapping that determines for each -- element in the source array where it should go in the target. The resultant -- array is initialised with the given defaults and any further values that are -- permuted into the result array are added to the current value using the given -- combination function. -- -- The combination function must be associative. Extents that are mapped to the -- magic value 'ignore' by the permutation function are dropped. -- -- permute :: (Shape ix, Shape ix', Elt a) -- => (Exp a -> Exp a -> Exp a) -- combination function -- -> Acc (Array ix' a) -- array of default values -- -> (Exp ix -> Exp ix') -- permutation -- -> Acc (Array ix a) -- permuted array -- -> Acc (Array ix' a) -- mkPermute :: forall aenv sh sh' e. (Shape sh, Shape sh', Elt e) => DeviceProperties -> Gamma aenv -> CUFun2 aenv (e -> e -> e) -> CUFun1 aenv (sh -> sh') -> CUDelayedAcc aenv sh e -> [CUTranslSkel aenv (Array sh' e)] mkPermute dev aenv (CUFun2 dcex dcey combine) (CUFun1 _ prj) arr | CUDelayed (CUExp shIn) _ (CUFun1 _ get) <- arr = return $ CUTranslSkel "permute" [cunit| $esc:("#include ") $edecl:(cdim "DimOut" dimOut) $edecl:(cdim "DimIn" dimIn) $edecls:texIn extern "C" __global__ void permute ( $params:argIn, $params:argOut ) { $items:(sh .=. shIn) const typename DimIn shIn = $exp:(ccall "shape" (map rvalue sh)); const int shapeSize = $exp:(shapeSize sh); const int gridSize = $exp:(gridSize dev); int ix; for ( ix = $exp:(threadIdx dev) ; ix < shapeSize ; ix += gridSize ) { typename DimOut dst; const typename DimIn src = fromIndex( shIn, ix ); $items:(dst .=. prj src) if ( !ignore(dst) ) { $decls:decly $decls:decly' const int jx = toIndex(shOut, dst); $items:(dcex x .=. get ix) $items:(dcey y .=. arrOut "jx") $items:write } } } |] where dimIn = expDim (undefined :: Exp aenv sh) dimOut = expDim (undefined :: Exp aenv sh') sizeof = eltSizeOf (undefined::e) (texIn, argIn) = environment dev aenv (argOut, arrOut) = setters "Out" (undefined :: Array sh' e) (sh, _, _) = locals "sh" (undefined :: sh) (x, _, _) = locals "x" (undefined :: e) (_, y, decly) = locals "y" (undefined :: e) (_, y', decly') = locals "_y" (undefined :: e) ix = [cvar "ix"] src = cshape dimIn (cvar "src") dst = cshape dimOut (cvar "dst") sm = computeCapability dev -- Apply the combining function between old and new values. If multiple -- threads attempt to write to the same location, the hardware -- write-combining mechanism will accept one transaction and all other -- updates will be lost. -- -- If the hardware supports it, we can use atomicCAS (compare-and-swap) to -- work around this. This requires at least compute 1.1 for 32-bit values, -- and compute 1.2 for 64-bit values. If hardware support is not available, -- write the result as normal and hope for the best. -- -- Each element of a tuple is necessarily written individually, so the tuple -- as a whole is not stored atomically. -- write = env ++ zipWith6 apply sizeof (arrOut "jx") fun x (dcey y) y' (env, fun) = combine x y apply size out f x1 (used,y1) y1' | used , Just atomicCAS <- reinterpret size = C.BlockStm [cstm| do { $exp:y1' = $exp:y1; $exp:y1 = $exp:atomicCAS ( & $exp:out, $exp:y1', $exp:f ); } while ( $exp:y1 != $exp:y1' ); |] | otherwise = C.BlockStm [cstm| $exp:out = $exp:(rvalue x1); |] -- reinterpret :: Int -> Maybe C.Exp reinterpret 4 | sm >= Compute 1 1 = Just [cexp| $id:("atomicCAS32") |] reinterpret 8 | sm >= Compute 1 2 = Just [cexp| $id:("atomicCAS64") |] reinterpret _ = Nothing