{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE FlexibleContexts #-}
-- | Our compilation strategy for 'SegGenRed' is based around avoiding
-- bin conflicts.  We do this by splitting the input into chunks, and
-- for each chunk computing a single subhistogram.  Then we combine
-- the subhistograms using an ordinary segmented reduction ('SegRed').
--
-- There are some branches around to efficiently handle the case where
-- we use only a single subhistogram (because it's large), so that we
-- respect the asymptotics, and do not copy the destination array.
module Futhark.CodeGen.ImpGen.Kernels.SegGenRed
  ( compileSegGenRed )
  where

import Control.Monad.Except
import Data.Maybe
import Data.List

import Prelude hiding (quot, rem)

import Futhark.MonadFreshNames
import Futhark.Representation.ExplicitMemory
import qualified Futhark.Representation.ExplicitMemory.IndexFunction as IxFun
import Futhark.Pass.ExplicitAllocations()
import qualified Futhark.CodeGen.ImpCode.Kernels as Imp
import qualified Futhark.CodeGen.ImpGen as ImpGen
import Futhark.CodeGen.ImpGen ((<--),
                               sFor, sComment, sIf, sWhen,
                               sArray,
                               dPrim_, dPrimV)
import Futhark.CodeGen.ImpGen.Kernels.SegRed (compileSegRed')
import Futhark.CodeGen.ImpGen.Kernels.Base
import Futhark.Util.IntegralExp (quotRoundingUp, quot, rem)
import Futhark.Util (chunks, mapAccumLM, splitFromEnd, takeLast)
import Futhark.Construct (fullSliceNum)

prepareAtomicUpdate :: Maybe Locking -> [VName] -> Lambda InKernel
                    -> CallKernelGen (Maybe Locking,
                                      [Imp.Exp] -> ImpGen.ImpM InKernel Imp.KernelOp ())
prepareAtomicUpdate l dests lam =
  -- We need a separate lock array if the operators are not all of a
  -- particularly simple form that permits pure atomic operations.
  case (l, atomicUpdateLocking lam) of
    (_, Left f) -> return (l, f dests)
    (Just l', Right f) -> return (l, f l' dests)
    (Nothing, Right f) -> do
      -- The number of locks used here is too low, but since we are
      -- currently forced to inline a huge list, I'm keeping it down
      -- for now.  Some quick experiments suggested that it has little
      -- impact anyway (maybe the locking case is just too slow).
      --
      -- A fun solution would also be to use a simple hashing
      -- algorithm to ensure good distribution of locks.
      let num_locks = 10000
      locks <- ImpGen.sStaticArray "genred_locks" (Space "device") int32 $
               Imp.ArrayZeros num_locks
      let l' = Locking locks 0 1 0 ((`rem` fromIntegral num_locks) . sum)
      return (Just l', f l' dests)

prepareIntermediateArrays :: [SubExp] -> Imp.Exp -> [GenReduceOp InKernel]
                          -> CallKernelGen
                             [(VName, [VName],
                               [Imp.Exp] -> ImpGen.ImpM InKernel Imp.KernelOp ())]
prepareIntermediateArrays segment_dims num_threads =
  fmap snd . mapAccumLM onOp Nothing
  where onOp l op = do
          -- Determining the degree of cooperation (heuristic):
          -- coop_lvl   := size of histogram (Cooperation level)
          -- num_histos := (threads / coop_lvl) (Number of histograms)
          -- threads    := min(physical_threads, segment_size)
          --
          -- Careful to avoid division by zero when genReduceWidth==0.
          num_histos <- dPrimV "num_histos" $
                        num_threads `quotRoundingUp`
                        BinOpExp (SMax Int32) 1
                        (ImpGen.compileSubExpOfType int32 (genReduceWidth op))

          ImpGen.emit $ Imp.DebugPrint "num_histograms" int32 $ Imp.var num_histos int32

          -- Initialise sub-histograms.
          --
          -- If num_histos is 1, then we just reuse the original
          -- destination.  The idea is to avoid a copy if we are writing a
          -- small number of values into a very large prior histogram.
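          -- A worked example of the heuristic above, with purely
          -- illustrative numbers: with num_threads = 65536 and a
          -- histogram width of 1024, num_histos becomes
          -- 65536 `quotRoundingUp` 1024 = 64, so 64 subhistograms of
          -- 1024 bins each are allocated below.  If the width instead
          -- exceeds num_threads (say 100000), num_histos is 1 and the
          -- unitHistoCase below reuses the destination array directly.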
          dests <- forM (zip (genReduceDest op) (genReduceNeutral op)) $ \(dest, ne) -> do
            dest_t <- lookupType dest
            dest_mem <- ImpGen.entryArrayLocation <$> ImpGen.lookupArray dest
            let num_elems = foldl' (*) (Imp.var num_histos int32) $
                            map (ImpGen.compileSubExpOfType int32) $ arrayDims dest_t
            let size = Imp.elements num_elems `Imp.withElemType` int32

            (sub_mem, size') <-
              ImpGen.sDeclareMem "subhistogram_mem" size $ Space "device"

            let num_segments = length segment_dims
                sub_shape = Shape (segment_dims++[Var num_histos]) <>
                            stripDims num_segments (arrayShape dest_t)
                sub_membind = ArrayIn sub_mem $ IxFun.iota $
                              map (primExpFromSubExp int32) $ shapeDims sub_shape
            subhisto <- sArray "genred_dest" (elemType dest_t) sub_shape sub_membind

            let unitHistoCase =
                  ImpGen.emit $
                  Imp.SetMem sub_mem (ImpGen.memLocationName dest_mem) $
                  Space "device"

                multiHistoCase = do
                  ImpGen.sAlloc_ sub_mem size' $ Space "device"
                  sReplicate subhisto
                    (Shape $ segment_dims ++ [Var num_histos, genReduceWidth op]) ne

                  subhisto_t <- lookupType subhisto
                  let slice = fullSliceNum
                              (map (ImpGen.compileSubExpOfType int32) $ arrayDims subhisto_t) $
                              map (unitSlice 0 . ImpGen.compileSubExpOfType int32) segment_dims ++
                              [DimFix 0]
                  ImpGen.sUpdate subhisto slice $ Var dest

            sIf (Imp.var num_histos int32 .==. 1) unitHistoCase multiHistoCase

            return subhisto

          (l', do_op) <- prepareAtomicUpdate l dests $ genReduceOp op

          return (l', (num_histos, dests, do_op))

genRedKernel :: [PatElem ExplicitMemory]
             -> KernelSpace
             -> [GenReduceOp InKernel]
             -> Body InKernel
             -> CallKernelGen [(VName, [VName])]
genRedKernel map_pes space ops body = do
  (base_constants, init_constants) <- kernelInitialisationSetSpace space $ return ()
  let constants = base_constants { kernelThreadActive = true }
      (space_is, space_sizes) = unzip $ spaceDimensions space
      i32_to_i64 = ConvOpExp (SExt Int32 Int64)
      space_sizes_64 = map (i32_to_i64 . ImpGen.compileSubExpOfType int32) space_sizes
      total_w_64 = product space_sizes_64

  histograms <- prepareIntermediateArrays (init space_sizes) (kernelNumThreads constants) ops

  elems_per_thread_64 <- dPrimV "elems_per_thread_64" $
                         total_w_64 `quotRoundingUp`
                         ConvOpExp (SExt Int32 Int64) (kernelNumThreads constants)

  sKernel constants "seggenred" $ allThreads constants $ do
    init_constants

    i <- newVName "i"

    -- Compute subhistogram index for each thread, per histogram.
    subhisto_inds <- forM histograms $ \(num_histograms, _, _) ->
      dPrimV "subhisto_ind" $
      kernelGlobalThreadId constants `quot`
      (kernelNumThreads constants `quotRoundingUp` Imp.var num_histograms int32)

    sFor i Int64 (Imp.var elems_per_thread_64 int64) $ do
      -- Compute the offset into the input and output.  To this a
      -- thread can add its local ID to figure out which element it is
      -- responsible for.  The calculation is done with 64-bit
      -- integers to avoid overflow, but the final segment indexes are
      -- 32 bit.
      offset <- dPrimV "offset" $
                (i32_to_i64 (kernelGroupId constants) *
                 (Imp.var elems_per_thread_64 int64 *
                  i32_to_i64 (kernelGroupSize constants)))
                + (Imp.var i int64 * i32_to_i64 (kernelGroupSize constants))

      j <- dPrimV "j" $ Imp.var offset int64 + i32_to_i64 (kernelLocalThreadId constants)

      -- Construct segment indices.
      let setIndex v e = do dPrim_ v int32
                            v <-- e
      zipWithM_ setIndex space_is $
        map (ConvOpExp (SExt Int64 Int32)) . unflattenIndex space_sizes_64 $
        Imp.var j int64

      -- We execute the bucket function once and update each histogram
      -- serially.  We apply the bucket function if j=offset+ltid is
      -- less than num_elements.  This also involves writing to the
      -- mapout arrays.
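      -- Illustration with hypothetical sizes: with a group size of 256,
      -- elems_per_thread_64 = 4, group ID 2 and local thread ID 10, the
      -- offset in iteration i=1 is 2*(4*256) + 1*256 = 2304, so
      -- j = 2304 + 10 = 2314.  Consecutive local threads thus touch
      -- consecutive input elements in every iteration.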
      let input_in_bounds = Imp.var j int64 .<. total_w_64

      sWhen input_in_bounds $ ImpGen.compileStms mempty (stmsToList $ bodyStms body) $ do
        let (red_res, map_res) = splitFromEnd (length map_pes) $ bodyResult body

        sComment "save map-out results" $
          forM_ (zip map_pes map_res) $ \(pe, se) ->
          ImpGen.copyDWIM (patElemName pe)
          (map ((`Imp.var` int32) . fst) $ kernelDimensions constants) se []

        let (buckets, vs) = splitAt (length ops) red_res
            perOp = chunks $ map (length . genReduceDest) ops

        sComment "perform atomic updates" $
          forM_ (zip5 ops histograms buckets (perOp vs) subhisto_inds) $
          \(GenReduceOp dest_w _ _ shape lam,
            (_, _, do_op), bucket, vs', subhisto_ind) -> do

            let bucket' = ImpGen.compileSubExpOfType int32 bucket
                dest_w' = ImpGen.compileSubExpOfType int32 dest_w
                bucket_in_bounds = 0 .<=. bucket' .&&. bucket' .<. dest_w'
                bucket_is = map (`Imp.var` int32) (init space_is) ++
                            [Imp.var subhisto_ind int32, bucket']
                vs_params = takeLast (length vs') $ lambdaParams lam

            sWhen bucket_in_bounds $ do
              ImpGen.dLParams vs_params
              vectorLoops [] (shapeDims shape) $ \is -> do
                forM_ (zip vs_params vs') $ \(p, v) ->
                  ImpGen.copyDWIM (paramName p) [] v is
                do_op (bucket_is ++ is)

  let histogramInfo (num_histos, dests, _) = (num_histos, dests)
  return $ map histogramInfo histograms

vectorLoops :: [Imp.Exp] -> [SubExp]
            -> ([Imp.Exp] -> ImpGen.ImpM lore op ())
            -> ImpGen.ImpM lore op ()
vectorLoops is [] f = f $ reverse is
vectorLoops is (d:ds) f = do
  i <- newVName "i"
  d' <- ImpGen.compileSubExp d
  ImpGen.sFor i Int32 d' $ vectorLoops (Imp.var i int32:is) ds f

compileSegGenRed :: Pattern ExplicitMemory
                 -> KernelSpace
                 -> [GenReduceOp InKernel]
                 -> Body InKernel
                 -> CallKernelGen ()
compileSegGenRed (Pattern _ pes) genred_space ops body = do
  let num_red_res = length ops + sum (map (length . genReduceNeutral) ops)
      (all_red_pes, map_pes) = splitAt num_red_res pes

  infos <- genRedKernel map_pes genred_space ops body
  let pes_per_op = chunks (map (length . genReduceDest) ops) all_red_pes

  forM_ (zip3 infos pes_per_op ops) $ \((num_histos, subhistos), red_pes, op) -> do
    let unitHistoCase =
          -- This is OK because the memory blocks are at least as
          -- large as the ones we are supposed to use for the result.
          forM_ (zip red_pes subhistos) $ \(pe, subhisto) -> do
            pe_mem <- ImpGen.memLocationName . ImpGen.entryArrayLocation <$>
                      ImpGen.lookupArray (patElemName pe)
            subhisto_mem <- ImpGen.memLocationName . ImpGen.entryArrayLocation <$>
                            ImpGen.lookupArray subhisto
            ImpGen.emit $ Imp.SetMem pe_mem subhisto_mem $ Space "device"

    sIf (Imp.var num_histos int32 .==. 1) unitHistoCase $ do
      -- For the segmented reduction, we keep the segment dimensions
      -- unchanged.  To this, we add two dimensions: one over the number
      -- of buckets, and one over the number of subhistograms.  This
      -- inner dimension is the one that is collapsed in the reduction.
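      -- For instance (illustrative shapes only): with no outer segment
      -- dimensions, a scalar operator (empty genReduceShape), a
      -- histogram width of 1024 and 64 subhistograms, the space below
      -- becomes FlatThreadSpace [(bucket_id, 1024), (subhistogram_id, 64)],
      -- and the reduction collapses the innermost (subhistogram)
      -- dimension to yield one value per bucket.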
      let segment_dims = init $ spaceDimensions genred_space
          num_buckets = genReduceWidth op

      bucket_id <- newVName "bucket_id"
      subhistogram_id <- newVName "subhistogram_id"
      vector_ids <- mapM (const $ newVName "vector_id") $
                    shapeDims $ genReduceShape op
      gtid <- newVName $ baseString $ spaceGlobalId genred_space
      let lam = genReduceOp op
          segred_space =
            genred_space { spaceStructure =
                             FlatThreadSpace $
                             segment_dims ++
                             [(bucket_id, num_buckets)] ++
                             zip vector_ids (shapeDims $ genReduceShape op) ++
                             [(subhistogram_id, Var num_histos)]
                         , spaceGlobalId = gtid
                         }

      compileSegRed' (Pattern [] red_pes) segred_space
        Commutative lam (genReduceNeutral op) $ \red_dests _ ->
        forM_ (zip red_dests subhistos) $ \((d, is), subhisto) ->
          ImpGen.copyDWIM d is (Var subhisto) $ map (`Imp.var` int32) $
          map fst segment_dims ++ [subhistogram_id, bucket_id] ++ vector_ids
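      -- The copy above reads each result element from its subhistogram
      -- at [segment indices, subhistogram index, bucket index, vector
      -- indices], matching the sub_shape layout constructed in
      -- prepareIntermediateArrays.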