{-# LANGUAGE GADTs               #-}
{-# LANGUAGE OverloadedStrings   #-}
{-# LANGUAGE RebindableSyntax    #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
{-# LANGUAGE TypeApplications    #-}
{-# LANGUAGE TypeOperators       #-}
{-# LANGUAGE ViewPatterns        #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.PTX.CodeGen.FoldSeg
-- Copyright   : [2016..2020] The Accelerate Team
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.PTX.CodeGen.FoldSeg
  where

import Data.Array.Accelerate.Representation.Array
import Data.Array.Accelerate.Representation.Elt
import Data.Array.Accelerate.Representation.Shape
import Data.Array.Accelerate.Representation.Type

import Data.Array.Accelerate.LLVM.CodeGen.Arithmetic          as A
import Data.Array.Accelerate.LLVM.CodeGen.Array
import Data.Array.Accelerate.LLVM.CodeGen.Base
import Data.Array.Accelerate.LLVM.CodeGen.Constant
import Data.Array.Accelerate.LLVM.CodeGen.Environment
import Data.Array.Accelerate.LLVM.CodeGen.Exp
import Data.Array.Accelerate.LLVM.CodeGen.IR
import Data.Array.Accelerate.LLVM.CodeGen.Loop                as Loop
import Data.Array.Accelerate.LLVM.CodeGen.Monad
import Data.Array.Accelerate.LLVM.CodeGen.Sugar
import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch
import Data.Array.Accelerate.LLVM.PTX.CodeGen.Base
import Data.Array.Accelerate.LLVM.PTX.CodeGen.Fold            ( reduceBlockSMem, reduceWarpSMem, imapFromTo )
import Data.Array.Accelerate.LLVM.PTX.Target

import LLVM.AST.Type.Representation

import qualified Foreign.CUDA.Analysis                        as CUDA

import Control.Monad                                          ( void )
import Control.Monad.State                                    ( gets )
import Data.String                                            ( fromString )
import Prelude                                                as P


-- Segmented reduction along the innermost dimension of an array. Performs one
-- reduction per segment of the source array.
--
mkFoldSeg
    :: forall aenv sh i e.
       Gamma aenv
    -> ArrayR (Array (sh, Int) e)
    -> IntegralType i
    -> IRFun2     PTX aenv (e -> e -> e)
    -> MIRExp     PTX aenv e
    -> MIRDelayed PTX aenv (Array (sh, Int) e)
    -> MIRDelayed PTX aenv (Segments i)
    -> CodeGen    PTX      (IROpenAcc PTX aenv (Array (sh, Int) e))
mkFoldSeg aenv repr intTp combine seed arr seg =
  (+++) <$> mkFoldSegP_block aenv repr intTp combine seed arr seg
        <*> mkFoldSegP_warp  aenv repr intTp combine seed arr seg


-- This implementation assumes that the segments array represents the offset
-- indices to the source array, rather than the lengths of each segment. The
-- segment-offset approach is required for parallel implementations.
--
-- Each segment is computed by a single thread block, meaning we don't have to
-- worry about inter-block synchronisation.
--
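-- As an illustration of the segment-offset convention (a sketch for intuition
-- only; the offsets themselves are produced elsewhere, by a 'scanl (+) 0' of
-- the segment lengths, as noted in the comments below):
--
-- > P.scanl (+) 0 [3,2,4]  ==  [0,3,5,9]
--
-- so segment k of the source array spans the half-open index range
-- [offsets !! k, offsets !! (k+1)); e.g. segment 1 covers source indices 3 and 4.
--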
mkFoldSegP_block
    :: forall aenv sh i e.
       Gamma aenv
    -> ArrayR (Array (sh, Int) e)
    -> IntegralType i
    -> IRFun2     PTX aenv (e -> e -> e)
    -> MIRExp     PTX aenv e
    -> MIRDelayed PTX aenv (Array (sh, Int) e)
    -> MIRDelayed PTX aenv (Segments i)
    -> CodeGen    PTX      (IROpenAcc PTX aenv (Array (sh, Int) e))
mkFoldSegP_block aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do
  dev <- liftCodeGen $ gets ptxDeviceProperties
  --
  let
      (arrOut, paramOut) = mutableArray repr "out"
      (arrIn,  paramIn)  = delayedArray "in"  marr
      (arrSeg, paramSeg) = delayedArray "seg" mseg
      paramEnv           = envParam aenv
      --
      config  = launchConfig dev (CUDA.decWarp dev) dsmem const [|| const ||]
      dsmem n = warps * (1 + per_warp) * bytes
        where
          ws       = CUDA.warpSize dev
          warps    = n `P.quot` ws
          per_warp = ws + ws `P.quot` 2
          bytes    = bytesElt tp
  --
  makeOpenAccWith config "foldSeg_block" (paramOut ++ paramIn ++ paramSeg ++ paramEnv) $ do

    -- A dynamically scheduled work queue could be used to distribute the
    -- uneven workload, due to the variable length of each segment, evenly over
    -- the available thread blocks. That approach is left commented out below;
    -- instead, segments are assigned to thread blocks by a block-strided loop
    -- ('imapFromTo').
    --
    -- queue <- globalWorkQueue

    -- All threads in the block need to know what the start and end indices of
    -- this segment are in order to participate in the reduction. We use
    -- variables in __shared__ memory to communicate these values between
    -- threads in the block. Furthermore, by using a 2-element array, we can
    -- have the first two threads of the block read the start and end indices
    -- as a single coalesced read, since they will be sequential in the
    -- segment-offset array.
    --
    smem <- staticSharedMem (TupRsingle scalarTypeInt) 2

    -- Compute the number of segments and size of the innermost dimension.
    -- These are required if we are reducing a rank-2 or higher array, to
    -- properly compute the start and end indices of the portion of the array
    -- this thread block reduces. Note that this is a segment-offset array
    -- computed by 'scanl (+) 0' of the segment length array, so its size has
    -- increased by one.
    --
    sz <- indexHead <$> delayedExtent arrIn
    ss <- do n <- indexHead <$> delayedExtent arrSeg
             A.sub numType n (liftInt 1)

    -- Each thread block cooperatively reduces a segment.
    --
    -- s0 <- dequeue queue (lift 1)
    -- for s0 (\s -> A.lt singleType s end) (\_ -> dequeue queue (lift 1)) $ \s -> do
    start <- return (liftInt 0)
    end   <- shapeSize shr (irArrayShape arrOut)

    imapFromTo start end $ \s -> do

      -- The first two threads of the block determine the indices of the
      -- segments array that we will reduce between and distribute those values
      -- to the other threads in the block.
      tid <- threadIdx
      when (A.lt singleType tid (liftInt32 2)) $ do
        i <- case shr of
               ShapeRsnoc ShapeRz -> return s
               _                  -> A.rem integralType s ss
        j <- A.add numType i =<< int tid
        v <- app1 (delayedLinearIndex arrSeg) j
        writeArray TypeInt32 smem tid =<< A.fromIntegral intTp numType v

      -- Once all threads have caught up, begin work on the new segment.
      __syncthreads

      u <- readArray TypeInt32 smem (liftInt32 0)
      v <- readArray TypeInt32 smem (liftInt32 1)

      -- Determine the index range of the input array we will reduce over.
      -- Necessary for multidimensional segmented reduction.
      (inf,sup) <- A.unpair <$> case shr of
                     ShapeRsnoc ShapeRz -> return (A.pair u v)
                     _                  -> do q <- A.quot integralType s ss
                                              a <- A.mul numType q sz
                                              A.pair <$> A.add numType u a <*> A.add numType v a
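
      -- A worked example of the index calculation above (numbers illustrative
      -- only): with innermost extent sz = 10, ss = 2 segments along each
      -- innermost row, and segment-offset array [0,4,10], output index s = 3
      -- reads u = 4 and v = 10, then q = 3 `quot` 2 = 1 and a = q * sz = 10,
      -- so this thread block reduces source indices [inf,sup) = [14,20).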
      void $ if (TupRunit, A.eq singleType inf sup)

        -- This segment is empty. If this is an exclusive reduction the first
        -- thread writes out the initial element for this segment.
        then do
          case mseed of
            Nothing -> return (lift TupRunit ())
            Just z  -> do
              when (A.eq singleType tid (liftInt32 0)) $ writeArray TypeInt arrOut s =<< z
              return (lift TupRunit ())

        -- This is a non-empty segment.
        else do
          -- Step 1: initialise local sums
          --
          -- NOTE: We require all threads to enter this branch and execute the
          -- first step, even if they do not have a valid element and must
          -- return 'undef'. If we attempt to skip this entire section for
          -- non-participating threads (i.e. 'when (i0 < sup)'), it seems that
          -- those threads die and will not participate in the computation of
          -- _any_ further segment. I'm not sure if this is a CUDA oddity
          -- (e.g. we must have all threads convergent on __syncthreads) or
          -- a bug in NVPTX / ptxas.
          --
          i0 <- A.add numType inf =<< int tid
          x0 <- if (tp, A.lt singleType i0 sup)
                  then app1 (delayedLinearIndex arrIn) i0
                  else let go :: TypeR a -> Operands a
                           go TupRunit       = OP_Unit
                           go (TupRpair a b) = OP_Pair (go a) (go b)
                           go (TupRsingle t) = ir t (undef t)
                       in
                       return $ go tp

          bd  <- int =<< blockDim
          v0  <- A.sub numType sup inf
          v0' <- i32 v0
          r0  <- if (tp, A.gte singleType v0 bd)
                   then reduceBlockSMem dev tp combine Nothing    x0
                   else reduceBlockSMem dev tp combine (Just v0') x0

          -- Step 2: keep walking over the input
          nxt <- A.add numType inf bd
          r   <- iterFromStepTo tp nxt bd sup r0 $ \offset r -> do

                  -- Wait for threads to catch up before starting the next stripe
                  __syncthreads

                  i' <- A.add numType offset =<< int tid
                  v' <- A.sub numType sup offset
                  r' <- if (tp, A.gte singleType v' bd)
                          -- All threads in the block are in bounds, so we
                          -- can avoid bounds checks.
                          then do
                            x <- app1 (delayedLinearIndex arrIn) i'
                            y <- reduceBlockSMem dev tp combine Nothing x
                            return y

                          -- Not all threads are valid. Note that we still
                          -- have all threads enter the reduction procedure
                          -- to avoid thread divergence on synchronisation
                          -- points, similar to the above NOTE.
                          else do
                            x <- if (tp, A.lt singleType i' sup)
                                   then app1 (delayedLinearIndex arrIn) i'
                                   else let go :: TypeR a -> Operands a
                                            go TupRunit       = OP_Unit
                                            go (TupRpair a b) = OP_Pair (go a) (go b)
                                            go (TupRsingle t) = ir t (undef t)
                                        in
                                        return $ go tp
                            z <- i32 v'
                            y <- reduceBlockSMem dev tp combine (Just z) x
                            return y

                  -- first thread incorporates the result from the previous
                  -- iteration
                  if (tp, A.eq singleType tid (liftInt32 0))
                    then app2 combine r r'
                    else return r'

          -- Step 3: Thread zero writes the aggregate reduction for this
          -- segment to memory. If this is an exclusive fold, combine with the
          -- initial element as well.
          when (A.eq singleType tid (liftInt32 0)) $
            writeArray TypeInt arrOut s =<<
              case mseed of
                Nothing -> return r
                Just z  -> flip (app2 combine) r =<< z -- Note: initial element on the left

          return (lift TupRunit ())

    return_


-- This implementation assumes that the segments array represents the offset
-- indices to the source array, rather than the lengths of each segment. The
-- segment-offset approach is required for parallel implementations.
--
-- Each segment is computed by a single warp, meaning we don't have to worry
-- about inter- or intra-block synchronisation.
--
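-- As a sketch of the schedule (numbers illustrative only): with warpSize = 32
-- and 128 threads per block there are 4 warps per block, so a grid of 100
-- blocks reduces 400 segments concurrently; the warp with global index 'gwid'
-- (computed below) starts at segment gwid and then strides by the total
-- number of warps in the grid.
--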
mkFoldSegP_warp
    :: forall aenv sh i e.
       Gamma aenv
    -> ArrayR (Array (sh, Int) e)
    -> IntegralType i
    -> IRFun2     PTX aenv (e -> e -> e)
    -> MIRExp     PTX aenv e
    -> MIRDelayed PTX aenv (Array (sh, Int) e)
    -> MIRDelayed PTX aenv (Segments i)
    -> CodeGen    PTX      (IROpenAcc PTX aenv (Array (sh, Int) e))
mkFoldSegP_warp aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do
  dev <- liftCodeGen $ gets ptxDeviceProperties
  --
  let
      (arrOut, paramOut) = mutableArray repr "out"
      (arrIn,  paramIn)  = delayedArray "in"  marr
      (arrSeg, paramSeg) = delayedArray "seg" mseg
      paramEnv           = envParam aenv
      --
      config  = launchConfig dev (CUDA.decWarp dev) dsmem grid gridQ
      dsmem n = warps * per_warp_bytes
        where
          warps = (n + ws - 1) `P.quot` ws
      --
      grid n m = multipleOf n (m `P.quot` ws)
      gridQ    = [|| \n m -> $$multipleOfQ n (m `P.quot` ws) ||]
      --
      per_warp_bytes = (per_warp_elems * bytesElt tp) `P.max` (2 * bytesElt tp)
      per_warp_elems = ws + (ws `P.quot` 2)
      ws             = CUDA.warpSize dev

      int32 :: Integral a => a -> Operands Int32
      int32 = liftInt32 . P.fromIntegral
  --
  makeOpenAccWith config "foldSeg_warp" (paramOut ++ paramIn ++ paramSeg ++ paramEnv) $ do

    -- Each warp works independently.
    -- Determine the ID of this warp within the thread block.
    tid <- threadIdx
    wid <- A.quot integralType tid (int32 ws)

    -- Number of warps per thread block
    bd  <- blockDim
    wpb <- A.quot integralType bd (int32 ws)

    -- ID of this warp within the grid
    bid  <- blockIdx
    gwid <- do a <- A.mul numType bid wpb
               b <- A.add numType wid a
               return b

    -- All threads in the warp need to know what the start and end indices of
    -- this segment are in order to participate in the reduction. We use
    -- variables in __shared__ memory to communicate these values between
    -- threads. Furthermore, by using a 2-element array, we can have the first
    -- two threads of the warp read the start and end indices as a single
    -- coalesced read, as these elements will be adjacent in the segment-offset
    -- array.
    --
    -- Note that this is aliased with the memory used to communicate reduction
    -- values within the warp.
    --
    lim <- do a <- A.mul numType wid (int32 per_warp_bytes)
              b <- dynamicSharedMem (TupRsingle scalarTypeInt) TypeInt32 (liftInt32 2) a
              return b

    -- Allocate (1.5 * warpSize) elements of shared memory for each warp to
    -- communicate reduction values.
    --
    -- Note that this is aliased with the memory used to communicate the start
    -- and end indices of this segment.
    --
    smem <- do a <- A.mul numType wid (int32 per_warp_bytes)
               b <- dynamicSharedMem tp TypeInt32 (int32 per_warp_elems) a
               return b
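
    -- A worked sizing example for the two aliased views above (numbers
    -- illustrative only): with ws = 32 and 8-byte elements, per_warp_elems =
    -- 48 and per_warp_bytes = (48 * 8) `P.max` (2 * 8) = 384, so warp w owns
    -- bytes [w*384, (w+1)*384) of the dynamic shared memory; 'lim' and 'smem'
    -- are two interpretations of that same region.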

    -- Compute the number of segments and size of the innermost dimension.
    -- These are required if we are reducing a rank-2 or higher array, to
    -- properly compute the start and end indices of the portion of the array
    -- this warp reduces. Note that this is a segment-offset array computed by
    -- 'scanl (+) 0' of the segment length array, so its size has increased by
    -- one.
    --
    sz <- indexHead <$> delayedExtent arrIn
    ss <- do a <- indexHead <$> delayedExtent arrSeg
             b <- A.sub numType a (liftInt 1)
             return b

    -- Each thread reduces a segment independently
    s0   <- int gwid
    gd   <- int =<< gridDim
    wpb' <- int wpb
    step <- A.mul numType wpb' gd
    end  <- shapeSize shr (irArrayShape arrOut)

    imapFromStepTo s0 step end $ \s -> do

      __syncwarp

      -- The first two threads of the warp determine the indices of the
      -- segments array that we will reduce between and distribute those values
      -- to the other threads in the warp
      lane <- laneId
      when (A.lt singleType lane (liftInt32 2)) $ do
        a <- case shr of
               ShapeRsnoc ShapeRz -> return s
               _                  -> A.rem integralType s ss
        b <- A.add numType a =<< int lane
        c <- app1 (delayedLinearIndex arrSeg) b
        writeArray TypeInt32 lim lane =<< A.fromIntegral intTp numType c

      __syncwarp

      -- Determine the index range of the input array we will reduce over.
      -- Necessary for multidimensional segmented reduction.
      (inf,sup) <- do
        u <- readArray TypeInt32 lim (liftInt32 0)
        v <- readArray TypeInt32 lim (liftInt32 1)
        A.unpair <$> case shr of
          ShapeRsnoc ShapeRz -> return (A.pair u v)
          _                  -> do q <- A.quot integralType s ss
                                   a <- A.mul numType q sz
                                   A.pair <$> A.add numType u a <*> A.add numType v a

      __syncwarp

      void $ if (TupRunit, A.eq singleType inf sup)

        -- This segment is empty. If this is an exclusive reduction the first
        -- lane writes out the initial element for this segment.
        then do
          case mseed of
            Nothing -> return (lift TupRunit ())
            Just z  -> do
              when (A.eq singleType lane (liftInt32 0)) $ writeArray TypeInt arrOut s =<< z
              return (lift TupRunit ())

        -- This is a non-empty segment.
        else do
          -- Step 1: initialise local sums
          --
          -- See the comment above for why we initialise the loop in this way
          --
          i0 <- A.add numType inf =<< int lane
          x0 <- if (tp, A.lt singleType i0 sup)
                  then app1 (delayedLinearIndex arrIn) i0
                  else let go :: TypeR a -> Operands a
                           go TupRunit       = OP_Unit
                           go (TupRpair a b) = OP_Pair (go a) (go b)
                           go (TupRsingle t) = ir t (undef t)
                       in
                       return $ go tp

          v0  <- A.sub numType sup inf
          v0' <- i32 v0
          r0  <- if (tp, A.gte singleType v0 (liftInt ws))
                   then reduceWarpSMem dev tp combine smem Nothing    x0
                   else reduceWarpSMem dev tp combine smem (Just v0') x0

          -- Step 2: Keep walking over the rest of the segment
          nx <- A.add numType inf (liftInt ws)
          r  <- iterFromStepTo tp nx (liftInt ws) sup r0 $ \offset r -> do

                  -- __syncwarp
                  __syncthreads -- TLM: why is this necessary?

                  i' <- A.add numType offset =<< int lane
                  v' <- A.sub numType sup offset
                  r' <- if (tp, A.gte singleType v' (liftInt ws))
                          -- All lanes are in bounds, so avoid bounds checks
                          then do
                            x <- app1 (delayedLinearIndex arrIn) i'
                            y <- reduceWarpSMem dev tp combine smem Nothing x
                            return y

                          else do
                            x <- if (tp, A.lt singleType i' sup)
                                   then app1 (delayedLinearIndex arrIn) i'
                                   else let go :: TypeR a -> Operands a
                                            go TupRunit       = OP_Unit
                                            go (TupRpair a b) = OP_Pair (go a) (go b)
                                            go (TupRsingle t) = ir t (undef t)
                                        in
                                        return $ go tp
                            z <- i32 v'
                            y <- reduceWarpSMem dev tp combine smem (Just z) x
                            return y

                  -- The first lane incorporates the result from the previous
                  -- iteration
                  if (tp, A.eq singleType lane (liftInt32 0))
                    then app2 combine r r'
                    else return r'
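
          -- For intuition, the two steps above with illustrative numbers: with
          -- ws = 32 and segment [inf,sup) = [0,100), step 1 reduces indices
          -- [0..31] as a full warp, and the loop then visits offsets 32, 64,
          -- and 96; in the final stripe only v' = 100 - 96 = 4 lanes hold
          -- valid elements, so the reduction combines just those.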
          -- Step 3: Lane zero writes the aggregate reduction for this segment
          -- to memory. If this is an exclusive reduction, also combine with
          -- the initial element.
          when (A.eq singleType lane (liftInt32 0)) $
            writeArray TypeInt arrOut s =<<
              case mseed of
                Nothing -> return r
                Just z  -> flip (app2 combine) r =<< z -- Note: initial element on the left

          return (lift TupRunit ())

    return_


i32 :: IsIntegral i => Operands i -> CodeGen PTX (Operands Int32)
i32 = A.fromIntegral integralType numType

int :: IsIntegral i => Operands i -> CodeGen PTX (Operands Int)
int = A.fromIntegral integralType numType
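
-- For intuition, a reference model of the segmented reduction implemented by
-- the kernels above, written over plain lists in segment-offset form. This is
-- a sketch only; 'segFoldRef' is a hypothetical name and not part of this
-- module, and it handles non-empty segments only (cf. 'mseed' above):
--
-- > segFoldRef :: (e -> e -> e) -> [e] -> [Int] -> [e]
-- > segFoldRef f xs offs =
-- >   [ foldr1 f (take (v - u) (drop u xs)) | (u, v) <- zip offs (tail offs) ]
--
-- > segFoldRef (+) [1..9] [0,3,5,9]  ==  [6,9,30]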