module Futhark.CodeGen.ImpGen.Multicore.Base
  ( extractAllocations,
    compileThreadResult,
    Locks (..),
    HostEnv (..),
    AtomicBinOp,
    MulticoreGen,
    decideScheduling,
    decideScheduling',
    groupResultArrays,
    renameSegBinOp,
    freeParams,
    renameHistOpLambda,
    atomicUpdateLocking,
    AtomicUpdate (..),
    DoAtomicUpdate,
    Locking (..),
    getSpace,
    getLoopBounds,
    getIterationDomain,
    getReturnParams,
    segOpString,
    ChunkLoopVectorization (..),
    generateChunkLoop,
    generateUniformizeLoop,
    extractVectorLane,
    inISPC,
    toParam,
    sLoopNestVectorized,
  )
where

import Control.Monad
import Data.Bifunctor
import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.Multicore qualified as Imp
import Futhark.CodeGen.ImpGen
import Futhark.Error
import Futhark.IR.MCMem
import Futhark.MonadFreshNames
import Futhark.Transform.Rename
import Prelude hiding (quot, rem)

-- | Is there an atomic t'BinOp' corresponding to this t'BinOp'?
type AtomicBinOp =
  BinOp ->
  Maybe (VName -> VName -> Imp.Count Imp.Elements (Imp.TExp Int32) -> Imp.Exp -> Imp.AtomicOp)

-- | Information about the locks available for accumulators.
data Locks = Locks
  { locksArray :: VName,
    locksCount :: Int
  }

data HostEnv = HostEnv
  { hostAtomics :: AtomicBinOp,
    hostLocks :: M.Map VName Locks
  }

type MulticoreGen = ImpM MCMem HostEnv Imp.Multicore

segOpString :: SegOp () MCMem -> MulticoreGen String
segOpString SegMap {} = pure "segmap"
segOpString SegRed {} = pure "segred"
segOpString SegScan {} = pure "segscan"
segOpString SegHist {} = pure "seghist"

arrParam :: VName -> MulticoreGen Imp.Param
arrParam arr = do
  name_entry <- lookupVar arr
  case name_entry of
    ArrayVar _ (ArrayEntry (MemLoc mem _ _) _) ->
      pure $ Imp.MemParam mem DefaultSpace
    _ -> error $ "arrParam: could not handle array " ++ show arr

toParam :: VName -> TypeBase shape u -> MulticoreGen [Imp.Param]
toParam name (Prim pt) = pure [Imp.ScalarParam name pt]
toParam name (Mem space) = pure [Imp.MemParam name space]
toParam name Array {} = pure <$> arrParam name
toParam _name Acc {} = pure [] -- FIXME? Are we sure this works?

getSpace :: SegOp () MCMem -> SegSpace
getSpace (SegHist _ space _ _ _) = space
getSpace (SegRed _ space _ _ _) = space
getSpace (SegScan _ space _ _ _) = space
getSpace (SegMap _ space _ _) = space

getLoopBounds :: MulticoreGen (Imp.TExp Int64, Imp.TExp Int64)
getLoopBounds = do
  start <- dPrim "start" int64
  end <- dPrim "end" int64
  emit $ Imp.Op $ Imp.GetLoopBounds (tvVar start) (tvVar end)
  pure (tvExp start, tvExp end)

getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (Imp.TExp Int64)
getIterationDomain SegMap {} space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map pe64 ns
  pure $ product ns_64
getIterationDomain _ space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map pe64 ns
  case unSegSpace space of
    [_] -> pure $ product ns_64
    -- A segmented SegOp is over the segments, so we drop the last
    -- dimension, which is executed sequentially.
    _ -> pure $ product $ init ns_64

-- When a SegRed returns a scalar, we pass it by value-result to the
-- segop function.
getReturnParams :: Pat LetDecMem -> SegOp () MCMem -> MulticoreGen [Imp.Param]
getReturnParams pat SegRed {} =
  -- It's a good idea to make sure any prim values are initialised, as
  -- we will load them (redundantly) in the task code, and
  -- uninitialised values are UB.
  fmap concat . forM (patElems pat) $ \pe -> do
    case patElemType pe of
      Prim pt -> patElemName pe <~~ ValueExp (blankPrimValue pt)
      _ -> pure ()
    toParam (patElemName pe) (patElemType pe)
getReturnParams _ _ = pure mempty
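-- For illustration, the helpers above compose roughly as follows when
-- setting up a SegOp.  This is a hypothetical sketch ('segOpSetup' does
-- not exist in this module); the real call sites are in the per-SegOp
-- compilers.
--
-- @
-- segOpSetup :: Pat LetDecMem -> SegOp () MCMem -> MulticoreGen (Imp.TExp Int64, [Imp.Param])
-- segOpSetup pat segop = do
--   iterations <- getIterationDomain segop $ getSpace segop
--   retvals <- getReturnParams pat segop
--   pure (iterations, retvals)
-- @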
renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp segbinops =
  forM segbinops $ \(SegBinOp comm lam ne shape) -> do
    lam' <- renameLambda lam
    pure $ SegBinOp comm lam' ne shape

compileThreadResult ::
  SegSpace ->
  PatElem LetDecMem ->
  KernelResult ->
  MulticoreGen ()
compileThreadResult space pe (Returns _ _ what) = do
  let is = map (Imp.le64 . fst) $ unSegSpace space
  copyDWIMFix (patElemName pe) is what []
compileThreadResult _ _ WriteReturns {} =
  compilerBugS "compileThreadResult: WriteReturns unhandled."
compileThreadResult _ _ TileReturns {} =
  compilerBugS "compileThreadResult: TileReturns unhandled."
compileThreadResult _ _ RegTileReturns {} =
  compilerBugS "compileThreadResult: RegTileReturns unhandled."

freeParams :: FreeIn a => a -> MulticoreGen [Imp.Param]
freeParams code = do
  let free = namesToList $ freeIn code
  ts <- mapM lookupType free
  concat <$> zipWithM toParam free ts

-- | Arrays for storing group results shared between threads.
groupResultArrays ::
  String ->
  SubExp ->
  [SegBinOp MCMem] ->
  MulticoreGen [[VName]]
groupResultArrays s num_threads reds =
  forM reds $ \(SegBinOp _ lam _ shape) ->
    forM (lambdaReturnType lam) $ \t -> do
      let full_shape = Shape [num_threads] <> shape <> arrayShape t
      sAllocArray s (elemType t) full_shape DefaultSpace

isLoadBalanced :: Imp.MCCode -> Bool
isLoadBalanced (a Imp.:>>: b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.For _ _ a) = isLoadBalanced a
isLoadBalanced (Imp.If _ a b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.Comment _ a) = isLoadBalanced a
isLoadBalanced Imp.While {} = False
isLoadBalanced (Imp.Op (Imp.ParLoop _ code _)) = isLoadBalanced code
isLoadBalanced (Imp.Op (Imp.ForEachActive _ a)) = isLoadBalanced a
isLoadBalanced (Imp.Op (Imp.ForEach _ _ _ a)) = isLoadBalanced a
isLoadBalanced (Imp.Op (Imp.ISPCKernel a _)) = isLoadBalanced a
isLoadBalanced _ = True

segBinOpComm' :: [SegBinOp rep] -> Commutativity
segBinOpComm' = mconcat . map segBinOpComm

decideScheduling' :: SegOp () rep -> Imp.MCCode -> Imp.Scheduling
decideScheduling' SegHist {} _ = Imp.Static
decideScheduling' SegScan {} _ = Imp.Static
decideScheduling' (SegRed _ _ reds _ _) code =
  case segBinOpComm' reds of
    Commutative -> decideScheduling code
    Noncommutative -> Imp.Static
decideScheduling' SegMap {} code = decideScheduling code

decideScheduling :: Imp.MCCode -> Imp.Scheduling
decideScheduling code =
  if isLoadBalanced code
    then Imp.Static
    else Imp.Dynamic

-- | Try to extract invariant allocations.  If we assume that the
-- given 'Imp.MCCode' is the body of a 'SegOp', then it is always safe
-- to move the immediate allocations to the prebody.
extractAllocations :: Imp.MCCode -> (Imp.MCCode, Imp.MCCode)
extractAllocations segop_code = f segop_code
  where
    declared = Imp.declaredIn segop_code
    f (Imp.DeclareMem name space) =
      -- Hoisting declarations out is always safe.
      (Imp.DeclareMem name space, mempty)
    f (Imp.Allocate name size space)
      | not $ freeIn size `namesIntersect` declared =
          (Imp.Allocate name size space, mempty)
    f (x Imp.:>>: y) = f x <> f y
    f (Imp.While cond body) =
      (mempty, Imp.While cond body)
    f (Imp.For i bound body) =
      (mempty, Imp.For i bound body)
    f (Imp.Comment s code) =
      second (Imp.Comment s) (f code)
    f Imp.Free {} =
      mempty
    f (Imp.If cond tcode fcode) =
      let (ta, tcode') = f tcode
          (fa, fcode') = f fcode
       in (ta <> fa, Imp.If cond tcode' fcode')
    f (Imp.Op (Imp.ParLoop s body free)) =
      let (body_allocs, body') = extractAllocations body
          (free_allocs, here_allocs) = f body_allocs
          free' =
            filter
              ( (`notNameIn` Imp.declaredIn body_allocs)
                  . Imp.paramName
              )
              free
       in ( free_allocs,
            here_allocs <> Imp.Op (Imp.ParLoop s body' free')
          )
    f code =
      (mempty, code)
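-- A minimal sketch of the typical use of 'extractAllocations': collect
-- the code for a loop body, split off the invariant allocations, and
-- emit them before the loop itself.  This mirrors what
-- 'generateChunkLoop' below actually does; the action @m@ here stands
-- for an arbitrary body-generating computation.
--
-- @
-- (body_allocs, body) <- extractAllocations <$> collect m
-- emit body_allocs
-- emit $ Imp.For i bound body
-- @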
-- | Indicates whether to vectorize a chunk loop or keep it sequential.
-- We use this to allow falling back to sequential chunk loops in cases
-- where we don't care about trying to vectorize.
data ChunkLoopVectorization = Vectorized | Scalar

-- | Emit code for the chunk loop, given an action that generates code
-- for a single iteration.
--
-- The action is called with the (symbolic) index of the current
-- iteration.
generateChunkLoop ::
  String ->
  ChunkLoopVectorization ->
  (Imp.TExp Int64 -> MulticoreGen ()) ->
  MulticoreGen ()
generateChunkLoop desc Scalar m = do
  (start, end) <- getLoopBounds
  n <- dPrimVE "n" $ end - start
  i <- newVName (desc <> "_i")
  (body_allocs, body) <- fmap extractAllocations $
    collect $ do
      addLoopVar i Int64
      m $ start + Imp.le64 i
  emit body_allocs
  -- Emit a normal sequential for loop.
  let bound = untyped n
  emit $ Imp.For i bound body
generateChunkLoop desc Vectorized m = do
  (start, end) <- getLoopBounds
  n <- dPrimVE "n" $ end - start
  i <- newVName (desc <> "_i")
  (body_allocs, body) <- fmap extractAllocations $
    collect $ do
      addLoopVar i Int64
      m $ Imp.le64 i
  emit body_allocs
  -- Emit a vectorized foreach loop.
  let from = untyped start
  let bound = untyped (start + n)
  emit $ Imp.Op $ Imp.ForEach i from bound body

-- | Emit code for a sequential loop over each vector lane, given an
-- action that generates code for a single iteration.  The action is
-- called with the symbolic index of the current iteration.
generateUniformizeLoop ::
  (Imp.TExp Int64 -> MulticoreGen ()) ->
  MulticoreGen ()
generateUniformizeLoop m = do
  i <- newVName "uni_i"
  body <- collect $ do
    addLoopVar i Int64
    m $ Imp.le64 i
  emit $ Imp.Op $ Imp.ForEachActive i body

-- | Given a piece of code, if that code performs an assignment, turn
-- that assignment into an extraction of an element from a vector on
-- the right-hand side, using a passed index for the extraction.  Other
-- code is left as-is.
extractVectorLane :: Imp.TExp Int64 -> MulticoreGen Imp.MCCode -> MulticoreGen ()
extractVectorLane j code = do
  let ut_exp = untyped j
  code' <- code
  case code' of
    Imp.SetScalar vname e -> do
      typ <- lookupType vname
      case typ of
        -- ISPC v1.17 does not yet support extract on f16, so we do
        -- this via a conversion to f32.
        Prim (FloatType Float16) -> do
          tv <- dPrim "hack_extract_f16" (FloatType Float32)
          emit $ Imp.SetScalar (tvVar tv) e
          emit $ Imp.Op $ Imp.ExtractLane vname (untyped $ tvExp tv) ut_exp
        _ ->
          emit $ Imp.Op $ Imp.ExtractLane vname e ut_exp
    _ ->
      emit code'
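-- For illustration, extracting a varying value lane by lane into
-- uniform code might look like the following.  A hypothetical sketch;
-- the real call sites live in the SegOp compilers, and 'stms' is
-- assumed to end in an assignment:
--
-- @
-- generateUniformizeLoop $ \lane ->
--   extractVectorLane lane $ collect $ compileStms mempty stms $ pure ()
-- @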
-- | Given an action that may generate some code, put that code into an
-- ISPC kernel.
inISPC :: MulticoreGen () -> MulticoreGen ()
inISPC code = do
  code' <- collect code
  free <- freeParams code'
  emit $ Imp.Op $ Imp.ISPCKernel code' free

-------------------------------
------- SegRed helpers -------
-------------------------------

sForVectorized' :: VName -> Imp.Exp -> MulticoreGen () -> MulticoreGen ()
sForVectorized' i bound body = do
  let it = case primExpType bound of
        IntType bound_t -> bound_t
        t -> error $ "sForVectorized': bound " ++ prettyString bound ++ " is of type " ++ prettyString t
  addLoopVar i it
  body' <- collect body
  emit $
    Imp.Op $
      Imp.ForEach i (Imp.ValueExp $ blankPrimValue $ Imp.IntType Imp.Int64) bound body'

sForVectorized :: String -> Imp.TExp t -> (Imp.TExp t -> MulticoreGen ()) -> MulticoreGen ()
sForVectorized i bound body = do
  i' <- newVName i
  sForVectorized' i' (untyped bound) $
    body $
      TPrimExp $
        Imp.var i' $
          primExpType $
            untyped bound

-- | Like sLoopNest, but puts a vectorized loop at the innermost layer.
sLoopNestVectorized ::
  Shape ->
  ([Imp.TExp Int64] -> MulticoreGen ()) ->
  MulticoreGen ()
sLoopNestVectorized = sLoopNest' [] . shapeDims
  where
    sLoopNest' is [] f = f $ reverse is
    sLoopNest' is [d] f =
      sForVectorized "nest_i" (pe64 d) $ \i -> sLoopNest' (i : is) [] f
    sLoopNest' is (d : ds) f =
      sFor "nest_i" (pe64 d) $ \i -> sLoopNest' (i : is) ds f

-------------------------------
------- SegHist helpers -------
-------------------------------

renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda hist_ops =
  forM hist_ops $ \(HistOp w rf dest neutral shape lam) -> do
    lam' <- renameLambda lam
    pure $ HistOp w rf dest neutral shape lam'

-- | Locking strategy used for an atomic update.
data Locking = Locking
  { -- | Array containing the lock.
    lockingArray :: VName,
    -- | Value for us to consider the lock free.
    lockingIsUnlocked :: Imp.TExp Int32,
    -- | What to write when we lock it.
    lockingToLock :: Imp.TExp Int32,
    -- | What to write when we unlock it.
    lockingToUnlock :: Imp.TExp Int32,
    -- | A transformation from the logical lock index to the physical
    -- position in the array.  This can also be used to make the lock
    -- array smaller.
    lockingMapping :: [Imp.TExp Int64] -> [Imp.TExp Int64]
  }

-- | A function for generating code for an atomic update.  Assumes
-- that the bucket is in-bounds.
type DoAtomicUpdate rep r =
  [VName] -> [Imp.TExp Int64] -> MulticoreGen ()

-- | The mechanism that will be used for performing the atomic update.
-- Approximates how efficient it will be.  Ordered from most to least
-- efficient.
data AtomicUpdate rep r
  = AtomicPrim (DoAtomicUpdate rep r)
  | -- | Can be done by efficient swaps.
    AtomicCAS (DoAtomicUpdate rep r)
  | -- | Requires explicit locking.
    AtomicLocking (Locking -> DoAtomicUpdate rep r)
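-- A sketch of how a caller might dispatch on an 'AtomicUpdate' (the
-- names 'atomics', 'lam', 'arrs', 'bucket' and 'locks' are
-- hypothetical; the real call sites live in the SegHist compiler):
--
-- @
-- case atomicUpdateLocking atomics lam of
--   AtomicPrim f -> f arrs bucket
--   AtomicCAS f -> f arrs bucket
--   AtomicLocking f -> f (Locking locks 0 1 0 id) arrs bucket
-- @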
atomicUpdateLocking :: AtomicBinOp -> Lambda MCMem -> AtomicUpdate MCMem ()
atomicUpdateLocking atomicBinOp lam
  | Just ops_and_ts <- lamIsBinOp lam,
    all (\(_, t, _, _) -> supportedPrims $ primBitSize t) ops_and_ts =
      primOrCas ops_and_ts $ \arrs bucket ->
        -- If the operator is a vectorised binary operator on 32-bit values,
        -- we can use a particularly efficient implementation.  If the
        -- operator has an atomic implementation we use that; otherwise it
        -- is still a binary operator, which can be implemented with atomic
        -- compare-and-swap if it is of a supported size.
        forM_ (zip arrs ops_and_ts) $ \(a, (op, t, x, y)) -> do
          -- Common variables.
          old <- dPrim "old" t
          (arr', _a_space, bucket_offset) <- fullyIndexArray a bucket
          case opHasAtomicSupport (tvVar old) arr' (sExt32 <$> bucket_offset) op of
            Just f -> sOp $ f $ Imp.var y t
            Nothing ->
              atomicUpdateCAS t a (tvVar old) bucket x $
                x <~~ Imp.BinOpExp op (Imp.var x t) (Imp.var y t)
  where
    opHasAtomicSupport old arr' bucket' bop = do
      let atomic f = Imp.Atomic . f old arr' bucket'
      atomic <$> atomicBinOp bop

    primOrCas ops
      | all isPrim ops = AtomicPrim
      | otherwise = AtomicCAS

    isPrim (op, _, _, _) = isJust $ atomicBinOp op
atomicUpdateLocking _ op
  | [Prim t] <- lambdaReturnType op,
    [xp, _] <- lambdaParams op,
    supportedPrims (primBitSize t) = AtomicCAS $ \[arr] bucket -> do
      old <- dPrim "old" t
      atomicUpdateCAS t arr (tvVar old) bucket (paramName xp) $
        compileBody' [xp] $
          lambdaBody op
atomicUpdateLocking _ op = AtomicLocking $ \locking arrs bucket -> do
  old <- dPrim "old" int32
  continue <- dPrimVol "continue" int32 (0 :: Imp.TExp Int32)

  -- Correctly index into locks.
  (locks', _locks_space, locks_offset) <-
    fullyIndexArray (lockingArray locking) $ lockingMapping locking bucket

  -- Critical section
  let try_acquire_lock = do
        old <-- (0 :: Imp.TExp Int32)
        sOp . Imp.Atomic $
          Imp.AtomicCmpXchg
            int32
            (tvVar old)
            locks'
            (sExt32 <$> locks_offset)
            (tvVar continue)
            (untyped (lockingToLock locking))
      lock_acquired = tvExp continue
      -- Even the releasing is done with an atomic rather than a
      -- simple write, for memory coherency reasons.
      release_lock = do
        old <-- lockingToLock locking
        sOp . Imp.Atomic $
          Imp.AtomicCmpXchg
            int32
            (tvVar old)
            locks'
            (sExt32 <$> locks_offset)
            (tvVar continue)
            (untyped (lockingToUnlock locking))

  -- Preparing parameters.  It is assumed that the caller has already
  -- filled the arr_params.  We copy the current value to the
  -- accumulator parameters.
  let (acc_params, _arr_params) = splitAt (length arrs) $ lambdaParams op
      bind_acc_params =
        everythingVolatile $
          sComment "bind lhs" $
            forM_ (zip acc_params arrs) $ \(acc_p, arr) ->
              copyDWIMFix (paramName acc_p) [] (Var arr) bucket

  let op_body =
        sComment "execute operation" $
          compileBody' acc_params $
            lambdaBody op

      do_hist =
        everythingVolatile $
          sComment "update global result" $
            zipWithM_ (writeArray bucket) arrs $
              map (Var . paramName) acc_params

  -- While-loop: Try to insert your value
  sWhile (tvExp continue .==. 0) $ do
    try_acquire_lock
    sUnless (lock_acquired .==. 0) $ do
      dLParams acc_params
      bind_acc_params
      op_body
      do_hist
      release_lock
  where
    writeArray bucket arr val = copyDWIMFix arr bucket val []

atomicUpdateCAS ::
  PrimType ->
  VName ->
  VName ->
  [Imp.TExp Int64] ->
  VName ->
  MulticoreGen () ->
  MulticoreGen ()
atomicUpdateCAS t arr old bucket x do_op = do
  run_loop <- dPrimV "run_loop" (0 :: Imp.TExp Int32)
  (arr', _a_space, bucket_offset) <- fullyIndexArray arr bucket
  bytes <- toIntegral $ primBitSize t

  let (toBits, fromBits) =
        case t of
          FloatType Float16 ->
            ( \v -> Imp.FunExp "to_bits16" [v] int16,
              \v -> Imp.FunExp "from_bits16" [v] t
            )
          FloatType Float32 ->
            ( \v -> Imp.FunExp "to_bits32" [v] int32,
              \v -> Imp.FunExp "from_bits32" [v] t
            )
          FloatType Float64 ->
            ( \v -> Imp.FunExp "to_bits64" [v] int64,
              \v -> Imp.FunExp "from_bits64" [v] t
            )
          _ -> (id, id)

      int
        | primBitSize t == 16 = int16
        | primBitSize t == 32 = int32
        | otherwise = int64

  everythingVolatile $ copyDWIMFix old [] (Var arr) bucket

  old_bits_v <- tvVar <$> dPrim "old_bits" int
  old_bits_v <~~ toBits (Imp.var old t)
  let old_bits = Imp.var old_bits_v int

  -- While-loop: Try to insert your value
  sWhile (tvExp run_loop .==. 0) $ do
    x <~~ Imp.var old t
    do_op -- Writes result into x
    sOp . Imp.Atomic $
      Imp.AtomicCmpXchg
        bytes
        old_bits_v
        arr'
        (sExt32 <$> bucket_offset)
        (tvVar run_loop)
        (toBits (Imp.var x t))
    old <~~ fromBits old_bits
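-- The code emitted by 'atomicUpdateCAS' follows the usual CAS retry
-- loop.  Roughly, as a pseudocode sketch of the generated code (the
-- exact operand order of the compare-and-swap is as in
-- 'Imp.AtomicCmpXchg' above):
--
-- @
-- old = arr[bucket]
-- old_bits = to_bits(old)
-- while (!run_loop) {
--   x = old
--   do_op()   // writes op(x, ...) into x
--   run_loop = cmpxchg(&arr[bucket], &old_bits, to_bits(x))
--   old = from_bits(old_bits)
-- }
-- @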
supportedPrims :: Int -> Bool
supportedPrims 8 = True
supportedPrims 16 = True
supportedPrims 32 = True
supportedPrims 64 = True
supportedPrims _ = False

-- Bit widths for which GCC (and Clang) provide atomic builtins.
toIntegral :: Int -> MulticoreGen PrimType
toIntegral 8 = pure int8
toIntegral 16 = pure int16
toIntegral 32 = pure int32
toIntegral 64 = pure int64
toIntegral b = error $ "toIntegral: unsupported bit width for CAS: " ++ prettyString b