{-# LANGUAGE GADTs               #-}
{-# LANGUAGE OverloadedStrings   #-}
{-# LANGUAGE RebindableSyntax    #-}
{-# LANGUAGE RecordWildCards     #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
{-# LANGUAGE TypeOperators       #-}
{-# LANGUAGE ViewPatterns        #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.PTX.CodeGen.Fold
-- Copyright   : [2016..2017] Trevor L. McDonell
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.PTX.CodeGen.Fold
  where

-- accelerate
import Data.Array.Accelerate.Analysis.Match
import Data.Array.Accelerate.Analysis.Type
import Data.Array.Accelerate.Array.Sugar                  ( Array, Scalar, Vector, Shape, Z, (:.), Elt(..) )

-- accelerate-llvm-*
import Data.Array.Accelerate.LLVM.Analysis.Match
import Data.Array.Accelerate.LLVM.CodeGen.Arithmetic      as A
import Data.Array.Accelerate.LLVM.CodeGen.Array
import Data.Array.Accelerate.LLVM.CodeGen.Base
import Data.Array.Accelerate.LLVM.CodeGen.Environment
import Data.Array.Accelerate.LLVM.CodeGen.Exp
import Data.Array.Accelerate.LLVM.CodeGen.IR
import Data.Array.Accelerate.LLVM.CodeGen.Loop            as Loop
import Data.Array.Accelerate.LLVM.CodeGen.Monad
import Data.Array.Accelerate.LLVM.CodeGen.Sugar
import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch
import Data.Array.Accelerate.LLVM.PTX.CodeGen.Base
import Data.Array.Accelerate.LLVM.PTX.CodeGen.Generate
import Data.Array.Accelerate.LLVM.PTX.Context
import Data.Array.Accelerate.LLVM.PTX.Target

import LLVM.AST.Type.Representation

-- cuda
import qualified Foreign.CUDA.Analysis                    as CUDA

import Control.Applicative                                ( (<$>), (<*>) )
import Control.Monad                                      ( (>=>), (<=<) )
import Data.String                                        ( fromString )
import Data.Bits                                          as P
import Prelude                                            as P


-- Reduce an array along the innermost dimension. The reduction function must be
-- associative to allow for an efficient parallel implementation, but the
-- initial element does /not/ need to be a neutral element of the operator.
--
-- TODO: Specialise for commutative operations (such as (+)) and those with
--       a neutral element {(+), 0}
--
mkFold
    :: forall aenv sh e. (Shape sh, Elt e)
    => PTX
    -> Gamma aenv
    -> IRFun2 PTX aenv (e -> e -> e)
    -> IRExp PTX aenv e
    -> IRDelayed PTX aenv (Array (sh :. Int) e)
    -> CodeGen (IROpenAcc PTX aenv (Array sh e))
mkFold ptx@(deviceProperties . ptxContext -> dev) aenv f z acc
  | Just Refl <- matchShapeType (undefined::sh) (undefined::Z)
  = (+++) <$> mkFoldAll dev aenv f (Just z) acc
          <*> mkFoldFill ptx aenv z

  | otherwise
  = (+++) <$> mkFoldDim dev aenv f (Just z) acc
          <*> mkFoldFill ptx aenv z


-- Reduce a non-empty array along the innermost dimension. The reduction
-- function must be associative to allow for an efficient parallel
-- implementation.
--
-- TODO: Specialise for commutative operations (such as (+)) and those with
--       a neutral element {(+), 0}
--
mkFold1
    :: forall aenv sh e. (Shape sh, Elt e)
    => PTX
    -> Gamma aenv
    -> IRFun2 PTX aenv (e -> e -> e)
    -> IRDelayed PTX aenv (Array (sh :. Int) e)
    -> CodeGen (IROpenAcc PTX aenv (Array sh e))
mkFold1 (deviceProperties . ptxContext -> dev) aenv f acc
  | Just Refl <- matchShapeType (undefined::sh) (undefined::Z)
  = mkFoldAll dev aenv f Nothing acc

  | otherwise
  = mkFoldDim dev aenv f Nothing acc
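
-- For reference, both branches above arise from the same user-level 'fold'
-- combinator; only the rank of the argument differs. A minimal sketch using
-- the user-facing Data.Array.Accelerate frontend (the names 'sumAll' and
-- 'sumRows' are illustrative only, not part of this module):
--
-- > sumAll :: Acc (Vector Float) -> Acc (Scalar Float)
-- > sumAll = fold (+) 0              -- rank-1 input: compiled via mkFoldAll
-- >
-- > sumRows :: Acc (Array DIM2 Float) -> Acc (Vector Float)
-- > sumRows = fold (+) 0             -- higher rank: compiled via mkFoldDim
--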

-- Reduce an array to a single element.
--
-- Since reductions consume arrays that have been fused into them, parallel
-- reduction requires two separate kernels. As an example, take vector dot
-- product:
--
-- > dotp xs ys = fold (+) 0 (zipWith (*) xs ys)
--
-- 1. The first pass reads in the fused array data, in this case corresponding
--    to the function (\i -> (xs!i) * (ys!i)).
--
-- 2. The second pass reads in the manifest array data from the first step and
--    directly reduces the array. This can be done recursively in-place until
--    only a single element remains.
--
-- In both phases, thread blocks cooperatively reduce a stripe of the input (one
-- element per thread) to a single element, which is stored to the output array.
--
mkFoldAll
    :: forall aenv e. Elt e
    => DeviceProperties                           -- ^ properties of the target GPU
    -> Gamma aenv                                 -- ^ array environment
    -> IRFun2 PTX aenv (e -> e -> e)              -- ^ combination function
    -> Maybe (IRExp PTX aenv e)                   -- ^ seed element, if this is an exclusive reduction
    -> IRDelayed PTX aenv (Vector e)              -- ^ input data
    -> CodeGen (IROpenAcc PTX aenv (Scalar e))
mkFoldAll dev aenv combine mseed acc =
  foldr1 (+++) <$> sequence [ mkFoldAllS  dev aenv combine mseed acc
                            , mkFoldAllM1 dev aenv combine       acc
                            , mkFoldAllM2 dev aenv combine mseed
                            ]


-- Reduction of an array to a single element, for small arrays which can be
-- processed by a single thread block.
--
mkFoldAllS
    :: forall aenv e. Elt e
    => DeviceProperties                           -- ^ properties of the target GPU
    -> Gamma aenv                                 -- ^ array environment
    -> IRFun2 PTX aenv (e -> e -> e)              -- ^ combination function
    -> Maybe (IRExp PTX aenv e)                   -- ^ seed element, if this is an exclusive reduction
    -> IRDelayed PTX aenv (Vector e)              -- ^ input data
    -> CodeGen (IROpenAcc PTX aenv (Scalar e))
mkFoldAllS dev aenv combine mseed IRDelayed{..} =
  let
      (start, end, paramGang)   = gangParam
      (arrOut, paramOut)        = mutableArray ("out" :: Name (Scalar e))
      paramEnv                  = envParam aenv
      --
      config                    = launchConfig dev (CUDA.incWarp dev) smem multipleOf multipleOfQ
      smem n                    = warps * (1 + per_warp) * bytes
        where
          ws        = CUDA.warpSize dev
          warps     = n `P.quot` ws
          per_warp  = ws + ws `P.quot` 2
          bytes     = sizeOf (eltType (undefined :: e))
  in
  makeOpenAccWith config "foldAllS" (paramGang ++ paramOut ++ paramEnv) $ do

    tid <- threadIdx
    bd  <- blockDim

    -- We can assume that there is only a single thread block
    i0  <- A.add numType start tid
    sz  <- A.sub numType end start
    when (A.lt scalarType i0 sz) $ do

      -- Thread reads initial element and then participates in block-wide
      -- reduction.
      x0 <- app1 delayedLinearIndex =<< A.fromIntegral integralType numType i0
      r0 <- if A.eq scalarType sz bd
              then reduceBlockSMem dev combine Nothing   x0
              else reduceBlockSMem dev combine (Just sz) x0

      when (A.eq scalarType tid (lift 0)) $
        writeArray arrOut tid =<<
          case mseed of
            Nothing -> return r0
            Just z  -> flip (app2 combine) r0 =<< z   -- Note: initial element on the left

    return_
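
-- A worked example of the dynamic shared memory calculation above, assuming a
-- 32-wide warp and 4-byte elements (e.g. Float), for a 256-thread block:
--
-- > smem 256 = (256 `quot` 32) * (1 + 32 + 16) * 4
-- >          = 8 * 49 * 4
-- >          = 1568 bytes
--
-- That is, 48 elements of per-warp scratch space for the warp-level reduction,
-- plus one element per warp to hold that warp's partial result (see
-- 'reduceBlockSMem' below).
--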

-- Reduction of an entire array to a single element. This kernel implements step
-- one for reducing large arrays which must be processed by multiple thread
-- blocks.
--
mkFoldAllM1
    :: forall aenv e. Elt e
    => DeviceProperties                           -- ^ properties of the target GPU
    -> Gamma aenv                                 -- ^ array environment
    -> IRFun2 PTX aenv (e -> e -> e)              -- ^ combination function
    -> IRDelayed PTX aenv (Vector e)              -- ^ input data
    -> CodeGen (IROpenAcc PTX aenv (Scalar e))
mkFoldAllM1 dev aenv combine IRDelayed{..} =
  let
      (start, end, paramGang)   = gangParam
      (arrTmp, paramTmp)        = mutableArray ("tmp" :: Name (Vector e))
      paramEnv                  = envParam aenv
      --
      config                    = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||]
      smem n                    = warps * (1 + per_warp) * bytes
        where
          ws        = CUDA.warpSize dev
          warps     = n `P.quot` ws
          per_warp  = ws + ws `P.quot` 2
          bytes     = sizeOf (eltType (undefined :: e))
  in
  makeOpenAccWith config "foldAllM1" (paramGang ++ paramTmp ++ paramEnv) $ do

    -- Each thread block cooperatively reduces a stripe of the input and stores
    -- that value into a temporary array at a corresponding index. Since the
    -- order of operations remains fixed, this method supports non-commutative
    -- reductions.
    --
    tid   <- threadIdx
    bd    <- blockDim
    sz    <- i32 . indexHead =<< delayedExtent

    imapFromTo start end $ \seg -> do

      -- Wait for all threads to catch up before beginning the stripe
      __syncthreads

      -- Bounds of the input array we will reduce between
      from  <- A.mul numType seg  bd
      step  <- A.add numType from bd
      to    <- A.min scalarType sz step

      -- Threads cooperatively reduce this stripe
      reduceFromTo dev from to combine
        (app1 delayedLinearIndex <=< A.fromIntegral integralType numType)
        (when (A.eq scalarType tid (lift 0)) . writeArray arrTmp seg)

    return_


-- Reduction of an array to a single element, (recursive) step 2 of multi-block
-- reduction algorithm.
--
mkFoldAllM2
    :: forall aenv e. Elt e
    => DeviceProperties
    -> Gamma aenv
    -> IRFun2 PTX aenv (e -> e -> e)
    -> Maybe (IRExp PTX aenv e)
    -> CodeGen (IROpenAcc PTX aenv (Scalar e))
mkFoldAllM2 dev aenv combine mseed =
  let
      (start, end, paramGang)   = gangParam
      (arrTmp, paramTmp)        = mutableArray ("tmp" :: Name (Vector e))
      (arrOut, paramOut)        = mutableArray ("out" :: Name (Vector e))
      paramEnv                  = envParam aenv
      --
      config                    = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||]
      smem n                    = warps * (1 + per_warp) * bytes
        where
          ws        = CUDA.warpSize dev
          warps     = n `P.quot` ws
          per_warp  = ws + ws `P.quot` 2
          bytes     = sizeOf (eltType (undefined :: e))
  in
  makeOpenAccWith config "foldAllM2" (paramGang ++ paramTmp ++ paramOut ++ paramEnv) $ do

    -- Threads cooperatively reduce a stripe of the input (temporary) array
    -- output from the first phase, storing the results into another temporary.
    -- When only a single thread block remains, we have reached the final
    -- reduction step and add the initial element (for exclusive reductions).
    --
    tid   <- threadIdx
    bd    <- blockDim
    gd    <- gridDim
    sz    <- i32 . indexHead $ irArrayShape arrTmp

    imapFromTo start end $ \seg -> do

      -- Wait for all threads to catch up before beginning the stripe
      __syncthreads

      -- Bounds of the input we will reduce between
      from  <- A.mul numType seg  bd
      step  <- A.add numType from bd
      to    <- A.min scalarType sz step

      -- Threads cooperatively reduce this stripe
      reduceFromTo dev from to combine (readArray arrTmp) $ \r ->
        when (A.eq scalarType tid (lift 0)) $
          writeArray arrOut seg =<<
            case mseed of
              Nothing -> return r
              Just z  -> if A.eq scalarType gd (lift 1)
                           then flip (app2 combine) r =<< z   -- Note: initial element on the left
                           else return r

    return_
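
-- These kernels each describe a single pass only; choosing between the
-- single-block and multi-block variants, and repeatedly applying the second
-- phase, is the job of the Execute phase rather than this module.
-- Schematically (function names and the dispatch condition are illustrative
-- only, not the actual scheduler):
--
-- > foldAll input
-- >   | fitsInOneBlock input = foldAllS input         -- single-pass reduction
-- >   | otherwise            = go (foldAllM1 input)   -- one partial result per thread block
-- >   where
-- >     go partials
-- >       | size partials == 1 = partials             -- a single element remains: done
-- >       | otherwise          = go (foldAllM2 partials)
--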

-- Reduce an array of arbitrary rank along the innermost dimension only.
--
-- For simplicity, each element of the output (reduction along an
-- innermost-dimension index) is computed by a single thread block, meaning we
-- don't have to worry about inter-block synchronisation. A more balanced method
-- would be a segmented reduction (specialised, since the length of each segment
-- is known a priori).
--
mkFoldDim
    :: forall aenv sh e. (Shape sh, Elt e)
    => DeviceProperties                           -- ^ properties of the target GPU
    -> Gamma aenv                                 -- ^ array environment
    -> IRFun2 PTX aenv (e -> e -> e)              -- ^ combination function
    -> Maybe (IRExp PTX aenv e)                   -- ^ seed element, if this is an exclusive reduction
    -> IRDelayed PTX aenv (Array (sh :. Int) e)   -- ^ input data
    -> CodeGen (IROpenAcc PTX aenv (Array sh e))
mkFoldDim dev aenv combine mseed IRDelayed{..} =
  let
      (start, end, paramGang)   = gangParam
      (arrOut, paramOut)        = mutableArray ("out" :: Name (Array sh e))
      paramEnv                  = envParam aenv
      --
      config                    = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||]
      smem n                    = warps * (1 + per_warp) * bytes
        where
          ws        = CUDA.warpSize dev
          warps     = n `P.quot` ws
          per_warp  = ws + ws `P.quot` 2
          bytes     = sizeOf (eltType (undefined :: e))
  in
  makeOpenAccWith config "fold" (paramGang ++ paramOut ++ paramEnv) $ do

    -- If the innermost dimension is smaller than the number of threads in the
    -- block, those threads will never contribute to the output.
    tid <- threadIdx
    sz  <- i32 . indexHead =<< delayedExtent
    when (A.lt scalarType tid sz) $ do

      -- Thread blocks iterate over the outer dimensions; for each outer index
      -- the block cooperatively reduces the innermost dimension to a single
      -- value.
      --
      imapFromTo start end $ \seg -> do

        -- Wait for threads to catch up before starting this segment. We could
        -- also place this at the bottom of the loop, but here allows threads to
        -- exit quickly on the last iteration.
        __syncthreads

        -- Step 1: initialise local sums
        from  <- A.mul numType seg  sz    -- first linear index this block will reduce
        to    <- A.add numType from sz    -- last linear index this block will reduce (exclusive)

        i0    <- A.add numType from tid
        x0    <- app1 delayedLinearIndex =<< A.fromIntegral integralType numType i0
        bd    <- blockDim
        r0    <- if A.gte scalarType sz bd
                   then reduceBlockSMem dev combine Nothing   x0
                   else reduceBlockSMem dev combine (Just sz) x0

        -- Step 2: keep walking over the input
        next  <- A.add numType from bd
        r     <- iterFromStepTo next bd to r0 $ \offset r -> do

          -- Wait for all threads to catch up before starting the next stripe
          __syncthreads

          -- Threads cooperatively reduce this stripe of the input
          i   <- A.add numType offset tid
          v'  <- A.sub numType to offset
          r'  <- if A.gte scalarType v' bd
                   -- All threads of the block are valid, so we can avoid
                   -- bounds checks.
                   then do
                     x <- app1 delayedLinearIndex =<< A.fromIntegral integralType numType i
                     y <- reduceBlockSMem dev combine Nothing x
                     return y

                   -- Otherwise, we require bounds checks when reading the input
                   -- and during the reduction. Note that even though only the
                   -- valid threads will contribute useful work in the
                   -- reduction, we must still have all threads enter the
                   -- reduction procedure to avoid synchronisation divergence.
                   else do
                     x <- if A.lt scalarType i to
                            then app1 delayedLinearIndex =<< A.fromIntegral integralType numType i
                            else return r
                     y <- reduceBlockSMem dev combine (Just v') x
                     return y

          if A.eq scalarType tid (lift 0)
            then app2 combine r r'
            else return r'

        -- Step 3: Thread 0 writes the aggregate reduction of this dimension to
        -- memory. If this is an exclusive fold, combine with the initial element.
        --
        when (A.eq scalarType tid (lift 0)) $
          writeArray arrOut seg =<<
            case mseed of
              Nothing -> return r
              Just z  -> flip (app2 combine) r =<< z   -- Note: initial element on the left

    return_
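
-- As a concrete (illustrative) example of the loop structure above: reducing
-- the innermost dimension of a 1000 x 4096 matrix with 256-thread blocks gives
-- sz = 4096 elements per segment and 1000 segments in total. Each block seeds
-- its local value from the first 256 elements of its row (Step 1) and then
-- walks the rest of the row in stripes of 256 (Step 2):
--
-- > 4096 `quot` 256 = 16 stripes   -- 1 initial stripe + 15 iterations of iterFromStepTo
--
-- before thread 0 writes the single reduced value for that row (Step 3).
--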

-- Exclusive reductions over empty arrays (of any dimension) fill the lower
-- dimensions with the initial element.
--
mkFoldFill
    :: (Shape sh, Elt e)
    => PTX
    -> Gamma aenv
    -> IRExp PTX aenv e
    -> CodeGen (IROpenAcc PTX aenv (Array sh e))
mkFoldFill ptx aenv seed =
  mkGenerate ptx aenv (IRFun1 (const seed))


-- Efficient thread-block-wide reduction using the specified operator. The
-- aggregate reduction value is stored in thread zero. Supports non-commutative
-- operators.
--
-- Requires dynamically allocated memory: (#warps * (1 + 1.5 * warp size)).
--
-- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/block/specializations/block_reduce_warp_reductions.cuh
--
reduceBlockSMem
    :: forall aenv e. Elt e
    => DeviceProperties                           -- ^ properties of the target device
    -> IRFun2 PTX aenv (e -> e -> e)              -- ^ combination function
    -> Maybe (IR Int32)                           -- ^ number of valid elements (may be less than block size)
    -> IR e                                       -- ^ calling thread's input element
    -> CodeGen (IR e)                             -- ^ thread-block-wide reduction using the specified operator (lane 0 only)
reduceBlockSMem dev combine size = warpReduce >=> warpAggregate
  where
    int32 :: Integral a => a -> IR Int32
    int32 = lift . P.fromIntegral

    -- Temporary storage required for each warp
    bytes           = sizeOf (eltType (undefined::e))
    warp_smem_elems = CUDA.warpSize dev + (CUDA.warpSize dev `P.quot` 2)

    -- Step 1: Reduction in every warp
    --
    warpReduce :: IR e -> CodeGen (IR e)
    warpReduce input = do
      -- Allocate (1.5 * warpSize) elements of shared memory for each warp
      wid   <- warpId
      skip  <- A.mul numType wid (int32 (warp_smem_elems * bytes))
      smem  <- dynamicSharedMem (int32 warp_smem_elems) skip

      -- Are we doing bounds checking for this warp?
      --
      case size of
        -- The entire thread block is valid, so skip bounds checks.
        Nothing ->
          reduceWarpSMem dev combine smem Nothing input

        -- Otherwise check how many elements are valid for this warp. If it is
        -- full then we can still skip bounds checks for it.
        Just n -> do
          offset <- A.mul numType wid (int32 (CUDA.warpSize dev))
          valid  <- A.sub numType n offset
          if A.gte scalarType valid (int32 (CUDA.warpSize dev))
            then reduceWarpSMem dev combine smem Nothing      input
            else reduceWarpSMem dev combine smem (Just valid) input

    -- Step 2: Aggregate per-warp reductions
    --
    warpAggregate :: IR e -> CodeGen (IR e)
    warpAggregate input = do
      -- Allocate #warps elements of shared memory
      bd    <- blockDim
      warps <- A.quot integralType bd (int32 (CUDA.warpSize dev))
      skip  <- A.mul numType warps (int32 (warp_smem_elems * bytes))
      smem  <- dynamicSharedMem warps skip

      -- Share the per-warp aggregates
      wid   <- warpId
      lane  <- laneId
      when (A.eq scalarType lane (lift 0)) $ do
        writeArray smem wid input

      -- Wait for each warp to finish its local reduction
      __syncthreads

      -- Update the total aggregate. Thread 0 just does this sequentially (as is
      -- done in CUB), but we could also do this cooperatively (better for
      -- larger thread blocks?)
      tid   <- threadIdx
      if A.eq scalarType tid (lift 0)
        then do
          steps <- case size of
                     Nothing -> return warps
                     Just n  -> do
                       a <- A.add numType n (int32 (CUDA.warpSize dev - 1))
                       b <- A.quot integralType a (int32 (CUDA.warpSize dev))
                       return b
          iterFromStepTo (lift 1) (lift 1) steps input $ \step x ->
            app2 combine x =<< readArray smem step
        else
          return input
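
-- A worked example of the bounds calculation in the aggregation step above
-- (warp size 32): a block reducing only 200 valid elements has
--
-- > (200 + 31) `quot` 32 = 7
--
-- warps holding valid data (six full warps plus one with 8 elements), so the
-- sequential aggregation in thread 0 only needs to consider those seven
-- per-warp partial results rather than all eight warps of a 256-thread block.
--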

-- Efficient warp-wide reduction using shared memory. The aggregate reduction
-- value for the warp is stored in thread lane zero.
--
-- Each warp requires 48 (1.5 x warp size) elements of shared memory. The
-- routine assumes that it is allocated individually per-warp (i.e. can be
-- indexed in the range [0, warp size)).
--
-- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/warp/specializations/warp_reduce_smem.cuh#L128
--
reduceWarpSMem
    :: forall aenv e. Elt e
    => DeviceProperties                           -- ^ properties of the target device
    -> IRFun2 PTX aenv (e -> e -> e)              -- ^ combination function
    -> IRArray (Vector e)                         -- ^ temporary storage array in shared memory (1.5 warp size elements)
    -> Maybe (IR Int32)                           -- ^ number of items that will be reduced by this warp, otherwise all lanes are valid
    -> IR e                                       -- ^ calling thread's input element
    -> CodeGen (IR e)                             -- ^ warp-wide reduction using the specified operator (lane 0 only)
reduceWarpSMem dev combine smem size = reduce 0
  where
    log2 :: Double -> Double
    log2 = P.logBase 2

    -- Number of steps required to reduce the warp
    steps = P.floor . log2 . P.fromIntegral . CUDA.warpSize $ dev

    -- Return whether the index is valid. Assume that constant branches are
    -- optimised away.
    valid i =
      case size of
        Nothing -> return (lift True)
        Just n  -> A.lt scalarType i n

    -- Unfold the reduction as a recursive code generation function.
    reduce :: Int -> IR e -> CodeGen (IR e)
    reduce step x
      | step >= steps               = return x
      | offset <- 1 `P.shiftL` step = do
          -- share input through buffer
          lane <- laneId
          writeArray smem lane x

          -- update input if in range
          i  <- A.add numType lane (lift offset)
          x' <- if valid i
                  then app2 combine x =<< readArray smem i
                  else return x

          reduce (step+1) x'


-- Efficient warp reduction using the __shfl_up instruction (compute >= 3.0)
--
-- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/warp/specializations/warp_reduce_shfl.cuh#L310
--
-- reduceWarpShfl
--     :: IRFun2 PTX aenv (e -> e -> e)          -- ^ combination function
--     -> IR e                                   -- ^ this thread's input value
--     -> CodeGen (IR e)                         -- ^ final result
-- reduceWarpShfl combine input =
--   error "TODO: PTX.reduceWarpShfl"


-- Reduction loops
-- ---------------

reduceFromTo
    :: Elt a
    => DeviceProperties
    -> IR Int32                                   -- ^ starting index
    -> IR Int32                                   -- ^ final index (exclusive)
    -> (IRFun2 PTX aenv (a -> a -> a))            -- ^ combination function
    -> (IR Int32 -> CodeGen (IR a))               -- ^ function to retrieve element at index
    -> (IR a -> CodeGen ())                       -- ^ what to do with the value
    -> CodeGen ()
reduceFromTo dev from to combine get set = do
  tid   <- threadIdx
  bd    <- blockDim

  valid <- A.sub numType to from
  i     <- A.add numType from tid

  _     <- if A.gte scalarType valid bd
             then do
               -- All threads in the block will participate in the reduction, so
               -- we can avoid bounds checks
               x <- get i
               r <- reduceBlockSMem dev combine Nothing x
               set r

               return (IR OP_Unit :: IR ())   -- unsightly, but free
             else do
               -- Only in-bounds threads can read their input and participate in
               -- the reduction
               when (A.lt scalarType i to) $ do
                 x <- get i
                 r <- reduceBlockSMem dev combine (Just valid) x
                 set r

               return (IR OP_Unit :: IR ())

  return ()


-- Utilities
-- ---------

i32 :: IR Int -> CodeGen (IR Int32)
i32 = A.fromIntegral integralType numType


imapFromTo :: IR Int32 -> IR Int32 -> (IR Int32 -> CodeGen ()) -> CodeGen ()
imapFromTo start end body = do
  bid <- blockIdx
  gd  <- gridDim
  i0  <- A.add numType start bid
  imapFromStepTo i0 gd end body
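
-- 'imapFromTo' distributes segments over thread blocks in a block-stride
-- (round-robin) fashion: block 'bid' starts at segment 'start + bid' and then
-- advances by 'gridDim'. For example, with gridDim = 4 and segments [0,10):
--
-- >   block 0: segments 0, 4, 8
-- >   block 1: segments 1, 5, 9
-- >   block 2: segments 2, 6
-- >   block 3: segments 3, 7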