-- | Shared definitions for generating multicore ('Imp.Multicore')
-- imperative code: the code-generation monad, parameter handling,
-- scheduling decisions, chunk-loop emission, and atomic-update
-- strategies used by the SegOp compilers.
module Futhark.CodeGen.ImpGen.Multicore.Base
  ( extractAllocations,
    compileThreadResult,
    Locks (..),
    HostEnv (..),
    AtomicBinOp,
    MulticoreGen,
    decideScheduling,
    decideScheduling',
    renameSegBinOp,
    freeParams,
    renameHistOpLambda,
    atomicUpdateLocking,
    AtomicUpdate (..),
    DoAtomicUpdate,
    Locking (..),
    getSpace,
    getLoopBounds,
    getIterationDomain,
    getReturnParams,
    segOpString,
    ChunkLoopVectorization (..),
    generateChunkLoop,
    generateUniformizeLoop,
    extractVectorLane,
    inISPC,
    toParam,
    sLoopNestVectorized,
  )
where

import Control.Monad
import Data.Bifunctor
import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.Multicore qualified as Imp
import Futhark.CodeGen.ImpGen
import Futhark.Error
import Futhark.IR.MCMem
import Futhark.MonadFreshNames
import Futhark.Transform.Rename
import Prelude hiding (quot, rem)

-- | Is there an atomic t'BinOp' corresponding to this t'BinOp'?
-- Given the binop, produces (if supported) a function from
-- destination variable, array, element offset and operand expression
-- to the atomic operation.
type AtomicBinOp =
  BinOp ->
  Maybe (VName -> VName -> Imp.Count Imp.Elements (Imp.TExp Int32) -> Imp.Exp -> Imp.AtomicOp)

-- | Information about the locks available for accumulators.
data Locks = Locks
  { -- | The array containing the locks.
    locksArray :: VName,
    -- | How many locks the array contains.
    locksCount :: Int
  }

-- | Environment for host-level multicore code generation.
data HostEnv = HostEnv
  { -- | Which binary operators have atomic implementations.
    hostAtomics :: AtomicBinOp,
    -- | Locks available for accumulators, keyed by accumulator name.
    hostLocks :: M.Map VName Locks
  }

-- | The monad in which multicore code is generated.
type MulticoreGen = ImpM MCMem HostEnv Imp.Multicore

-- | A short descriptive string for the kind of 'SegOp', used when
-- constructing names in the generated code.
segOpString :: SegOp () MCMem -> MulticoreGen String
segOpString SegMap {} = pure "segmap"
segOpString SegRed {} = pure "segred"
segOpString SegScan {} = pure "segscan"
segOpString SegHist {} = pure "seghist"

-- | A memory parameter for the memory block underlying the given
-- array variable.  Calls 'error' if the name is not an array.
arrParam :: VName -> MulticoreGen Imp.Param
arrParam arr = do
  name_entry <- lookupVar arr
  case name_entry of
    ArrayVar _ (ArrayEntry (MemLoc mem _ _) _) ->
      pure $ Imp.MemParam mem DefaultSpace
    _ -> error $ "arrParam: could not handle array " ++ show arr

-- | Turn a variable of the given type into zero or more function
-- parameters: a scalar parameter for prims, a memory parameter for
-- memory blocks, the underlying memory block for arrays, and nothing
-- for accumulators.
toParam :: VName -> TypeBase shape u -> MulticoreGen [Imp.Param]
toParam name (Prim pt) = pure [Imp.ScalarParam name pt]
toParam name (Mem space) = pure [Imp.MemParam name space]
toParam name Array {} = pure <$> arrParam name
toParam _name Acc {} = pure [] -- FIXME?  Are we sure this works?

-- | The index space of the given 'SegOp'.
getSpace :: SegOp () MCMem -> SegSpace
getSpace (SegHist _ space _ _ _) = space
getSpace (SegRed _ space _ _ _) = space
getSpace (SegScan _ space _ _ _) = space
getSpace (SegMap _ space _ _) = space

-- | Retrieve the iteration bounds (start, end) for the current task
-- by emitting a 'Imp.GetLoopBounds' operation into fresh variables.
getLoopBounds :: MulticoreGen (Imp.TExp Int64, Imp.TExp Int64)
getLoopBounds = do
  start <- dPrim "start" int64
  end <- dPrim "end" int64
  emit $ Imp.Op $ Imp.GetLoopBounds (tvVar start) (tvVar end)
  pure (tvExp start, tvExp end)

-- | The number of parallel iterations a 'SegOp' is distributed over:
-- the full index space for a 'SegMap', and for other (possibly
-- segmented) SegOps the product of all but the innermost dimension.
getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (Imp.TExp Int64)
getIterationDomain SegMap {} space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map pe64 ns
  pure $ product ns_64
getIterationDomain _ space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map pe64 ns
  case unSegSpace space of
    [_] -> pure $ product ns_64
    -- A segmented SegOp is over the segments
    -- so we drop the last dimension, which is
    -- executed sequentially
    _ -> pure $ product $ init ns_64

-- When the SegRed's return value is a scalar
-- we perform a call by value-result in the segop function
getReturnParams ::
  Pat LetDecMem ->
  SegOp () MCMem ->
  MulticoreGen [Imp.Param]
getReturnParams pat SegRed {} =
  -- It's a good idea to make sure any prim values are initialised, as
  -- we will load them (redundantly) in the task code, and
  -- uninitialised values are UB.
  fmap concat . forM (patElems pat) $ \pe -> do
    case patElemType pe of
      Prim pt -> patElemName pe <~~ ValueExp (blankPrimValue pt)
      _ -> pure ()
    toParam (patElemName pe) (patElemType pe)
getReturnParams _ _ = pure mempty

-- | Rename the lambdas of the given reduction operators, producing
-- fresh names so the same operator can be compiled more than once.
renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp segbinops =
  forM segbinops $ \(SegBinOp comm lam ne shape) -> do
    lam' <- renameLambda lam
    pure $ SegBinOp comm lam' ne shape

-- | Write a 'KernelResult' produced by a thread to the result
-- pattern element.  Only 'Returns' results are supported here; the
-- other result forms are compiler bugs if they reach this backend.
compileThreadResult ::
  SegSpace ->
  PatElem LetDecMem ->
  KernelResult ->
  MulticoreGen ()
compileThreadResult space pe (Returns _ _ what) = do
  let is = map (Imp.le64 . fst) $ unSegSpace space
  copyDWIMFix (patElemName pe) is what []
compileThreadResult _ _ WriteReturns {} =
  compilerBugS "compileThreadResult: WriteReturns unhandled."
compileThreadResult _ _ TileReturns {} =
  compilerBugS "compileThreadResult: TileReturns unhandled."
compileThreadResult _ _ RegTileReturns {} =
  compilerBugS "compileThreadResult: RegTileReturns unhandled."
-- | Compute function parameters for all free variables of the given
-- code, by looking up their types and converting with 'toParam'.
freeParams :: FreeIn a => a -> MulticoreGen [Imp.Param]
freeParams code = do
  let free = namesToList $ freeIn code
  ts <- mapM lookupType free
  concat <$> zipWithM toParam free ts

-- | Conservative approximation of whether all iterations of the code
-- take the same amount of work.  Anything containing a 'Imp.While'
-- loop is considered unbalanced; everything else is inspected
-- recursively and defaults to balanced.
isLoadBalanced :: Imp.MCCode -> Bool
isLoadBalanced (a Imp.:>>: b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.For _ _ a) = isLoadBalanced a
isLoadBalanced (Imp.If _ a b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.Comment _ a) = isLoadBalanced a
isLoadBalanced Imp.While {} = False
isLoadBalanced (Imp.Op (Imp.ParLoop _ code _)) = isLoadBalanced code
isLoadBalanced (Imp.Op (Imp.ForEachActive _ a)) = isLoadBalanced a
isLoadBalanced (Imp.Op (Imp.ForEach _ _ _ a)) = isLoadBalanced a
isLoadBalanced (Imp.Op (Imp.ISPCKernel a _)) = isLoadBalanced a
isLoadBalanced _ = True

-- | Like 'decideScheduling', but SegHist/SegScan/SegRed are always
-- statically scheduled; only 'SegMap' bodies are inspected.
decideScheduling' :: SegOp () rep -> Imp.MCCode -> Imp.Scheduling
decideScheduling' SegHist {} _ = Imp.Static
decideScheduling' SegScan {} _ = Imp.Static
decideScheduling' SegRed {} _ = Imp.Static
decideScheduling' SegMap {} code = decideScheduling code

-- | Pick static scheduling for load-balanced code, dynamic otherwise.
decideScheduling :: Imp.MCCode -> Imp.Scheduling
decideScheduling code =
  if isLoadBalanced code
    then Imp.Static
    else Imp.Dynamic

-- | Try to extract invariant allocations.  If we assume that the
-- given 'Imp.MCCode' is the body of a 'SegOp', then it is always safe
-- to move the immediate allocations to the prebody.
extractAllocations :: Imp.MCCode -> (Imp.MCCode, Imp.MCCode)
extractAllocations segop_code = f segop_code
  where
    -- Names declared anywhere in the body; an allocation whose size
    -- depends on one of these cannot be hoisted.
    declared = Imp.declaredIn segop_code

    f (Imp.DeclareMem name space) =
      -- Hoisting declarations out is always safe.
      (Imp.DeclareMem name space, mempty)
    f (Imp.Allocate name size space)
      -- Only hoistable if the size does not depend on a name declared
      -- inside the body.  If the guard fails we fall through to the
      -- catch-all case below, which keeps the allocation in place.
      | not $ freeIn size `namesIntersect` declared =
          (Imp.Allocate name size space, mempty)
    f (x Imp.:>>: y) = f x <> f y
    -- Nothing is hoisted out of loops; their bodies may run zero or
    -- many times.
    f (Imp.While cond body) =
      (mempty, Imp.While cond body)
    f (Imp.For i bound body) =
      (mempty, Imp.For i bound body)
    f (Imp.Comment s code) =
      second (Imp.Comment s) (f code)
    -- Frees of hoisted allocations are dropped entirely.
    f Imp.Free {} = mempty
    f (Imp.If cond tcode fcode) =
      let (ta, tcode') = f tcode
          (fa, fcode') = f fcode
       in (ta <> fa, Imp.If cond tcode' fcode')
    f (Imp.Op (Imp.ParLoop s body free)) =
      let (body_allocs, body') = extractAllocations body
          (free_allocs, here_allocs) = f body_allocs
          -- Hoisted allocations are no longer free in the loop body.
          free' =
            filter
              ( (`notNameIn` Imp.declaredIn body_allocs)
                  . Imp.paramName
              )
              free
       in ( free_allocs,
            here_allocs <> Imp.Op (Imp.ParLoop s body' free')
          )
    f code =
      (mempty, code)

-- | Indicates whether to vectorize a chunk loop or keep it sequential.
-- We use this to allow falling back to sequential chunk loops in cases
-- we don't care about trying to vectorize.
data ChunkLoopVectorization = Vectorized | Scalar

-- | Emit code for the chunk loop, given an action that generates code
-- for a single iteration.
--
-- The action is called with the (symbolic) index of the current
-- iteration.
generateChunkLoop ::
  String ->
  ChunkLoopVectorization ->
  (Imp.TExp Int64 -> MulticoreGen ()) ->
  MulticoreGen ()
generateChunkLoop desc Scalar m = do
  (start, end) <- getLoopBounds
  n <- dPrimVE "n" $ end - start
  i <- newVName (desc <> "_i")
  (body_allocs, body) <- fmap extractAllocations $
    collect $ do
      addLoopVar i Int64
      -- Sequential variant: i runs over [0, n), so the absolute index
      -- is start + i.
      m $ start + Imp.le64 i
  -- Invariant allocations go before the loop itself.
  emit body_allocs
  -- Emit either foreach or normal for loop
  let bound = untyped n
  emit $ Imp.For i bound body
generateChunkLoop desc Vectorized m = do
  (start, end) <- getLoopBounds
  n <- dPrimVE "n" $ end - start
  i <- newVName (desc <> "_i")
  (body_allocs, body) <- fmap extractAllocations $
    collect $ do
      addLoopVar i Int64
      -- Vectorized variant: the loop variable itself runs from start
      -- to start+n, so the body receives the absolute index directly.
      m $ Imp.le64 i
  emit body_allocs
  -- Emit either foreach or normal for loop
  let from = untyped start
  let bound = untyped (start + n)
  emit $ Imp.Op $ Imp.ForEach i from bound body

-- | Emit code for a sequential loop over each vector lane, given
-- an action that generates code for a single iteration.  The action
-- is called with the symbolic index of the current iteration.
generateUniformizeLoop ::
  (Imp.TExp Int64 -> MulticoreGen ()) ->
  MulticoreGen ()
generateUniformizeLoop m = do
  i <- newVName "uni_i"
  body <- collect $ do
    addLoopVar i Int64
    m $ Imp.le64 i
  emit $ Imp.Op $ Imp.ForEachActive i body

-- | Given a piece of code, if that code performs an assignment, turn
-- that assignment into an extraction of element from a vector on the
-- right hand side, using a passed index for the extraction.  Other code
-- is left as is.
extractVectorLane :: Imp.TExp Int64 -> MulticoreGen Imp.MCCode -> MulticoreGen ()
extractVectorLane j code = do
  let ut_exp = untyped j
  code' <- code
  case code' of
    Imp.SetScalar vname e -> do
      typ <- lookupType vname
      case typ of
        -- ISPC v1.17 does not support extract on f16 yet..
        -- Thus we do this stupid conversion to f32
        Prim (FloatType Float16) -> do
          tv <- dPrim "hack_extract_f16" (FloatType Float32)
          emit $ Imp.SetScalar (tvVar tv) e
          emit $ Imp.Op $ Imp.ExtractLane vname (untyped $ tvExp tv) ut_exp
        _ ->
          emit $ Imp.Op $ Imp.ExtractLane vname e ut_exp
    _ ->
      emit code'

-- | Given an action that may generate some code, put that code
-- into an ISPC kernel.
inISPC :: MulticoreGen () -> MulticoreGen ()
inISPC code = do
  code' <- collect code
  -- The kernel receives every free variable of its body as an
  -- explicit parameter.
  free <- freeParams code'
  emit $ Imp.Op $ Imp.ISPCKernel code' free

-------------------------------
------- SegRed helpers -------
-------------------------------

-- Like sFor', but emits a vectorized ('Imp.ForEach') loop that starts
-- at zero.  The bound must have an integer type.
sForVectorized' :: VName -> Imp.Exp -> MulticoreGen () -> MulticoreGen ()
sForVectorized' i bound body = do
  let it = case primExpType bound of
        IntType bound_t -> bound_t
        t ->
          -- Fixed diagnostic: this function is sForVectorized', not
          -- sFor' (the message was copy-pasted).
          error $
            "sForVectorized': bound "
              ++ prettyString bound
              ++ " is of type "
              ++ prettyString t
  addLoopVar i it
  body' <- collect body
  -- The lower bound is a zero of integer type.
  emit $
    Imp.Op $
      Imp.ForEach i (Imp.ValueExp $ blankPrimValue $ Imp.IntType Imp.Int64) bound body'

-- Like sFor, but the loop is vectorized.
sForVectorized :: String -> Imp.TExp t -> (Imp.TExp t -> MulticoreGen ()) -> MulticoreGen ()
sForVectorized i bound body = do
  i' <- newVName i
  sForVectorized' i' (untyped bound) $
    body $
      TPrimExp $
        Imp.var i' $
          primExpType $
            untyped bound

-- | Like sLoopNest, but puts a vectorized loop at the innermost layer.
sLoopNestVectorized ::
  Shape ->
  ([Imp.TExp Int64] -> MulticoreGen ()) ->
  MulticoreGen ()
sLoopNestVectorized = sLoopNest' [] . shapeDims
  where
    sLoopNest' is [] f = f $ reverse is
    -- Innermost dimension: vectorized loop.
    sLoopNest' is [d] f =
      sForVectorized "nest_i" (pe64 d) $ \i -> sLoopNest' (i : is) [] f
    -- Outer dimensions: ordinary sequential loops.
    sLoopNest' is (d : ds) f =
      sFor "nest_i" (pe64 d) $ \i -> sLoopNest' (i : is) ds f

-------------------------------
------- SegHist helpers -------
-------------------------------

-- | Rename the lambdas of the given histogram operators, producing
-- fresh names so the same operator can be compiled more than once.
renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda hist_ops =
  forM hist_ops $ \(HistOp w rf dest neutral shape lam) -> do
    lam' <- renameLambda lam
    pure $ HistOp w rf dest neutral shape lam'

-- | Locking strategy used for an atomic update.
data Locking = Locking
  { -- | Array containing the lock.
    lockingArray :: VName,
    -- | Value for us to consider the lock free.
    lockingIsUnlocked :: Imp.TExp Int32,
    -- | What to write when we lock it.
    lockingToLock :: Imp.TExp Int32,
    -- | What to write when we unlock it.
    lockingToUnlock :: Imp.TExp Int32,
    -- | A transformation from the logical lock index to the
    -- physical position in the array.  This can also be used
    -- to make the lock array smaller.
    lockingMapping :: [Imp.TExp Int64] -> [Imp.TExp Int64]
  }

-- | A function for generating code for an atomic update.  Assumes
-- that the bucket is in-bounds.
type DoAtomicUpdate rep r =
  [VName] -> [Imp.TExp Int64] -> MulticoreGen ()

-- | The mechanism that will be used for performing the atomic update.
-- Approximates how efficient it will be.  Ordered from most to least
-- efficient.
data AtomicUpdate rep r
  = AtomicPrim (DoAtomicUpdate rep r)
  | -- | Can be done by efficient swaps.
    AtomicCAS (DoAtomicUpdate rep r)
  | -- | Requires explicit locking.
    AtomicLocking (Locking -> DoAtomicUpdate rep r)

-- | Pick an atomic-update strategy for the given operator: primitive
-- atomics if every constituent binop has one, compare-and-swap for a
-- single supported-width prim, and explicit locking otherwise.
atomicUpdateLocking :: AtomicBinOp -> Lambda MCMem -> AtomicUpdate MCMem ()
atomicUpdateLocking atomicBinOp lam
  | Just ops_and_ts <- lamIsBinOp lam,
    all (\(_, t, _, _) -> supportedPrims $ primBitSize t) ops_and_ts =
      primOrCas ops_and_ts $ \arrs bucket ->
        -- If the operator is a vectorised binary operator on 32-bit values,
        -- we can use a particularly efficient implementation. If the
        -- operator has an atomic implementation we use that, otherwise it
        -- is still a binary operator which can be implemented by atomic
        -- compare-and-swap if 32 bits.
        forM_ (zip arrs ops_and_ts) $ \(a, (op, t, x, y)) -> do
          -- Common variables.
          old <- dPrim "old" t
          (arr', _a_space, bucket_offset) <- fullyIndexArray a bucket
          case opHasAtomicSupport (tvVar old) arr' (sExt32 <$> bucket_offset) op of
            Just f -> sOp $ f $ Imp.var y t
            Nothing ->
              atomicUpdateCAS t a (tvVar old) bucket x $
                x <~~ Imp.BinOpExp op (Imp.var x t) (Imp.var y t)
  where
    opHasAtomicSupport old arr' bucket' bop = do
      let atomic f = Imp.Atomic . f old arr' bucket'
      atomic <$> atomicBinOp bop
    primOrCas ops
      | all isPrim ops = AtomicPrim
      | otherwise = AtomicCAS
    isPrim (op, _, _, _) = isJust $ atomicBinOp op
atomicUpdateLocking _ op
  | [Prim t] <- lambdaReturnType op,
    [xp, _] <- lambdaParams op,
    supportedPrims (primBitSize t) = AtomicCAS $ \[arr] bucket -> do
      old <- dPrim "old" t
      atomicUpdateCAS t arr (tvVar old) bucket (paramName xp) $
        compileBody' [xp] $
          lambdaBody op
atomicUpdateLocking _ op = AtomicLocking $ \locking arrs bucket -> do
  old <- dPrim "old" int32
  continue <- dPrimVol "continue" int32 (0 :: Imp.TExp Int32)

  -- Correctly index into locks.
  (locks', _locks_space, locks_offset) <-
    fullyIndexArray (lockingArray locking) $ lockingMapping locking bucket

  -- Critical section
  let try_acquire_lock = do
        -- The CAS expected value must be the strategy's "unlocked"
        -- value, not a hard-coded zero: any 'Locking' whose
        -- 'lockingIsUnlocked' is nonzero would otherwise never
        -- acquire the lock (and this field was previously unused).
        old <-- lockingIsUnlocked locking
        sOp . Imp.Atomic $
          Imp.AtomicCmpXchg
            int32
            (tvVar old)
            locks'
            (sExt32 <$> locks_offset)
            (tvVar continue)
            (untyped (lockingToLock locking))
      -- 'continue' holds the success flag of the last CAS.
      lock_acquired = tvExp continue
      -- Even the releasing is done with an atomic rather than a
      -- simple write, for memory coherency reasons.  We expect the
      -- lock to currently hold the "locked" value.
      release_lock = do
        old <-- lockingToLock locking
        sOp . Imp.Atomic $
          Imp.AtomicCmpXchg
            int32
            (tvVar old)
            locks'
            (sExt32 <$> locks_offset)
            (tvVar continue)
            (untyped (lockingToUnlock locking))

  -- Preparing parameters. It is assumed that the caller has already
  -- filled the arr_params. We copy the current value to the
  -- accumulator parameters.
  let (acc_params, _arr_params) = splitAt (length arrs) $ lambdaParams op
      bind_acc_params =
        everythingVolatile $
          sComment "bind lhs" $
            forM_ (zip acc_params arrs) $ \(acc_p, arr) ->
              copyDWIMFix (paramName acc_p) [] (Var arr) bucket

  let op_body =
        sComment "execute operation" $
          compileBody' acc_params $
            lambdaBody op

      do_hist =
        everythingVolatile $
          sComment "update global result" $
            zipWithM_ (writeArray bucket) arrs $
              map (Var . paramName) acc_params

  -- While-loop: Try to insert your value
  sWhile (tvExp continue .==. 0) $ do
    try_acquire_lock
    sUnless (lock_acquired .==. 0) $ do
      dLParams acc_params
      bind_acc_params
      op_body
      do_hist
      release_lock
  where
    writeArray bucket arr val = copyDWIMFix arr bucket val []

-- Update the element at 'bucket' in 'arr' via a compare-and-swap
-- retry loop: read the old value into 'old', run 'do_op' (which
-- writes its result into 'x'), then attempt to CAS the new value in;
-- repeat until the CAS succeeds.  Floats are round-tripped through
-- their bit representation, since CAS works on integers.
atomicUpdateCAS ::
  PrimType ->
  VName ->
  VName ->
  [Imp.TExp Int64] ->
  VName ->
  MulticoreGen () ->
  MulticoreGen ()
atomicUpdateCAS t arr old bucket x do_op = do
  run_loop <- dPrimV "run_loop" (0 :: Imp.TExp Int32)
  (arr', _a_space, bucket_offset) <- fullyIndexArray arr bucket
  bytes <- toIntegral $ primBitSize t
  let (toBits, fromBits) =
        case t of
          FloatType Float16 ->
            ( \v -> Imp.FunExp "to_bits16" [v] int16,
              \v -> Imp.FunExp "from_bits16" [v] t
            )
          FloatType Float32 ->
            ( \v -> Imp.FunExp "to_bits32" [v] int32,
              \v -> Imp.FunExp "from_bits32" [v] t
            )
          FloatType Float64 ->
            ( \v -> Imp.FunExp "to_bits64" [v] int64,
              \v -> Imp.FunExp "from_bits64" [v] t
            )
          _ -> (id, id)
      int
        | primBitSize t == 16 = int16
        | primBitSize t == 32 = int32
        | otherwise = int64

  everythingVolatile $ copyDWIMFix old [] (Var arr) bucket
  old_bits_v <- tvVar <$> dPrim "old_bits" int
  old_bits_v <~~ toBits (Imp.var old t)
  let old_bits = Imp.var old_bits_v int

  -- While-loop: Try to insert your value
  sWhile (tvExp run_loop .==. 0) $ do
    x <~~ Imp.var old t
    do_op -- Writes result into x
    sOp . Imp.Atomic $
      Imp.AtomicCmpXchg
        bytes
        old_bits_v
        arr'
        (sExt32 <$> bucket_offset)
        (tvVar run_loop)
        (toBits (Imp.var x t))
    old <~~ fromBits old_bits

-- | Is this bit width supported for atomic operations?
supportedPrims :: Int -> Bool
supportedPrims 8 = True
supportedPrims 16 = True
supportedPrims 32 = True
supportedPrims 64 = True
supportedPrims _ = False

-- Bit widths supported by GCC (and clang) atomic builtins, mapped to
-- the integer type of that width.  (These are bits, not bytes.)
toIntegral :: Int -> MulticoreGen PrimType
toIntegral 8 = pure int8
toIntegral 16 = pure int16
toIntegral 32 = pure int32
toIntegral 64 = pure int64
toIntegral b = error $ "number of bits is not supported for CAS - " ++ prettyString b