{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE UndecidableInstances #-}

module Futhark.IR.GPU.Op
  ( -- * Size operations
    SizeOp (..),

    -- * Host operations
    HostOp (..),
    traverseHostOpStms,
    typeCheckHostOp,

    -- * SegOp refinements
    SegLevel (..),
    segVirt,
    SegVirt (..),
    SegSeqDims (..),
    KernelGrid (..),

    -- * Reexports
    module Futhark.IR.GPU.Sizes,
    module Futhark.IR.SegOp,
  )
where

import Control.Monad
import Data.Sequence qualified as SQ
import Data.Text qualified as T
import Futhark.Analysis.Alias qualified as Alias
import Futhark.Analysis.Metrics
import Futhark.Analysis.SymbolTable qualified as ST
import Futhark.IR
import Futhark.IR.Aliases (Aliases, CanBeAliased (..))
import Futhark.IR.GPU.Sizes
import Futhark.IR.Prop.Aliases
import Futhark.IR.SegOp
import Futhark.IR.TypeCheck qualified as TC
import Futhark.Optimise.Simplify.Engine qualified as Engine
import Futhark.Optimise.Simplify.Rep
import Futhark.Transform.Rename
import Futhark.Transform.Substitute
import Futhark.Util.Pretty
  ( commasep,
    parens,
    ppTuple',
    pretty,
    (<+>),
  )
import Futhark.Util.Pretty qualified as PP

-- | These dimensions (indexed from 0, outermost) of the corresponding
-- 'SegSpace' should not be parallelised, but instead iterated
-- sequentially.  For example, with a 'SegSeqDims' of @[0]@ and a
-- 'SegSpace' with dimensions @[n][m]@, there will be an outer loop
-- with @n@ iterations, while the @m@ dimension will be parallelised.
--
-- Semantically, this has no effect, but it may allow reductions in
-- memory usage or other low-level optimisations.  Operationally, the
-- guarantee is that for a 'SegSeqDims' of e.g. @[i,j,k]@, threads
-- running at any given moment will always have the same indexes along
-- the dimensions specified by @[i,j,k]@.
--
-- At the moment, this is only supported for 'SegNoVirtFull'
-- intra-group parallelism in GPU code, as we have not yet found it
-- useful anywhere else.
newtype SegSeqDims = SegSeqDims {segSeqDims :: [Int]}
  deriving (Eq, Ord, Show)

-- | Do we need group-virtualisation when generating code for the
-- segmented operation?  In most cases, we do, but for some simple
-- kernels, we compute the full number of groups in advance, and then
-- virtualisation is an unnecessary (but generally very small)
-- overhead.  This only really matters for fairly trivial but very
-- wide @map@ kernels where each thread performs constant-time work on
-- scalars.
data SegVirt
  = SegVirt
  | SegNoVirt
  | -- | Not only do we not need virtualisation, but we _guarantee_
    -- that all physical threads participate in the work.  This can
    -- save some checks in code generation.
    SegNoVirtFull SegSeqDims
  deriving (Eq, Ord, Show)

-- | The actual, physical grid dimensions used for the GPU kernel
-- running this 'SegOp'.
data KernelGrid = KernelGrid
  { gridNumGroups :: Count NumGroups SubExp,
    gridGroupSize :: Count GroupSize SubExp
  }
  deriving (Eq, Ord, Show)

-- | At which level the *body* of a t'SegOp' executes.
data SegLevel
  = SegThread SegVirt (Maybe KernelGrid)
  | SegGroup SegVirt (Maybe KernelGrid)
  | SegThreadInGroup SegVirt
  deriving (Eq, Ord, Show)
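-- For illustration only (not part of the module's API): constructing a
-- thread-level 'SegLevel' with an explicit grid of 256 groups of 64
-- threads each.  The constant sizes are made up for the example.
--
-- > exampleLevel :: SegLevel
-- > exampleLevel =
-- >   SegThread SegNoVirt . Just $
-- >     KernelGrid
-- >       { gridNumGroups = Count (intConst Int64 256),
-- >         gridGroupSize = Count (intConst Int64 64)
-- >       }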
-- | The 'SegVirt' of the 'SegLevel'.
segVirt :: SegLevel -> SegVirt
segVirt (SegThread v _) = v
segVirt (SegGroup v _) = v
segVirt (SegThreadInGroup v) = v

instance PP.Pretty SegVirt where
  pretty SegNoVirt = mempty
  pretty (SegNoVirtFull dims) = "full" <+> pretty (segSeqDims dims)
  pretty SegVirt = "virtualise"

instance PP.Pretty KernelGrid where
  pretty (KernelGrid num_groups group_size) =
    "groups=" <> pretty num_groups <> PP.semi <+> "groupsize=" <> pretty group_size

instance PP.Pretty SegLevel where
  pretty (SegThread virt grid) =
    PP.parens ("thread" <> PP.semi <+> pretty virt <> PP.semi <+> pretty grid)
  pretty (SegGroup virt grid) =
    PP.parens ("group" <> PP.semi <+> pretty virt <> PP.semi <+> pretty grid)
  pretty (SegThreadInGroup virt) =
    PP.parens ("ingroup" <> PP.semi <+> pretty virt)

instance Engine.Simplifiable KernelGrid where
  simplify (KernelGrid num_groups group_size) =
    KernelGrid
      <$> traverse Engine.simplify num_groups
      <*> traverse Engine.simplify group_size

instance Engine.Simplifiable SegLevel where
  simplify (SegThread virt grid) =
    SegThread virt <$> Engine.simplify grid
  simplify (SegGroup virt grid) =
    SegGroup virt <$> Engine.simplify grid
  simplify (SegThreadInGroup virt) =
    pure $ SegThreadInGroup virt

instance Substitute KernelGrid where
  substituteNames substs (KernelGrid num_groups group_size) =
    KernelGrid
      (substituteNames substs num_groups)
      (substituteNames substs group_size)

instance Substitute SegLevel where
  substituteNames substs (SegThread virt grid) =
    SegThread virt (substituteNames substs grid)
  substituteNames substs (SegGroup virt grid) =
    SegGroup virt (substituteNames substs grid)
  substituteNames _ (SegThreadInGroup virt) =
    SegThreadInGroup virt

instance Rename SegLevel where
  rename = substituteRename

instance FreeIn KernelGrid where
  freeIn' (KernelGrid num_groups group_size) =
    freeIn' (num_groups, group_size)

instance FreeIn SegLevel where
  freeIn' (SegThread _virt grid) = freeIn' grid
  freeIn' (SegGroup _virt grid) = freeIn' grid
  freeIn' (SegThreadInGroup _virt) = mempty
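-- A hypothetical helper, for illustration of 'segVirt': deciding
-- whether code generation must emit a virtualisation loop for a level.
--
-- > needsVirtualisation :: SegLevel -> Bool
-- > needsVirtualisation lvl = segVirt lvl == SegVirt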
-- | A simple size-level query or computation.
data SizeOp
  = -- | Produce some runtime-configurable size.
    GetSize Name SizeClass
  | -- | The maximum size of some class.
    GetSizeMax SizeClass
  | -- | Compare size (likely a threshold) with some integer value.
    CmpSizeLe Name SizeClass SubExp
  | -- | @CalcNumGroups w max_num_groups group_size@ calculates the
    -- number of GPU workgroups to use for an input of the given size.
    -- The @Name@ is a size name.  Note that @w@ is an i64 to avoid
    -- overflow issues.
    CalcNumGroups SubExp Name SubExp
  deriving (Eq, Ord, Show)

instance Substitute SizeOp where
  substituteNames substs (CmpSizeLe name sclass x) =
    CmpSizeLe name sclass (substituteNames substs x)
  substituteNames substs (CalcNumGroups w max_num_groups group_size) =
    CalcNumGroups
      (substituteNames substs w)
      max_num_groups
      (substituteNames substs group_size)
  substituteNames _ op = op

instance Rename SizeOp where
  rename (CmpSizeLe name sclass x) =
    CmpSizeLe name sclass <$> rename x
  rename (CalcNumGroups w max_num_groups group_size) =
    CalcNumGroups <$> rename w <*> pure max_num_groups <*> rename group_size
  rename x = pure x

instance IsOp SizeOp where
  safeOp _ = True
  cheapOp _ = True

instance TypedOp SizeOp where
  opType (GetSize _ _) = pure [Prim int64]
  opType (GetSizeMax _) = pure [Prim int64]
  opType CmpSizeLe {} = pure [Prim Bool]
  opType CalcNumGroups {} = pure [Prim int64]

instance AliasedOp SizeOp where
  opAliases _ = [mempty]
  consumedInOp _ = mempty

instance FreeIn SizeOp where
  freeIn' (CmpSizeLe _ _ x) = freeIn' x
  freeIn' (CalcNumGroups w _ group_size) = freeIn' w <> freeIn' group_size
  freeIn' _ = mempty

instance PP.Pretty SizeOp where
  pretty (GetSize name size_class) =
    "get_size" <> parens (commasep [pretty name, pretty size_class])
  pretty (GetSizeMax size_class) =
    "get_size_max" <> parens (commasep [pretty size_class])
  pretty (CmpSizeLe name size_class x) =
    "cmp_size" <> parens (commasep [pretty name, pretty size_class])
      <+> "<="
      <+> pretty x
  pretty (CalcNumGroups w max_num_groups group_size) =
    "calc_num_groups"
      <> parens (commasep [pretty w, pretty max_num_groups, pretty group_size])

instance OpMetrics SizeOp where
  opMetrics GetSize {} = seen "GetSize"
  opMetrics GetSizeMax {} = seen "GetSizeMax"
  opMetrics CmpSizeLe {} = seen "CmpSizeLe"
  opMetrics CalcNumGroups {} = seen "CalcNumGroups"

typeCheckSizeOp :: TC.Checkable rep => SizeOp -> TC.TypeM rep ()
typeCheckSizeOp GetSize {} = pure ()
typeCheckSizeOp GetSizeMax {} = pure ()
typeCheckSizeOp (CmpSizeLe _ _ x) = TC.require [Prim int64] x
typeCheckSizeOp (CalcNumGroups w _ group_size) = do
  TC.require [Prim int64] w
  TC.require [Prim int64] group_size

-- | A host-level operation; parameterised by what else it can do.
data HostOp op rep
  = -- | A segmented operation.
    SegOp (SegOp SegLevel rep)
  | SizeOp SizeOp
  | OtherOp (op rep)
  | -- | Code to run sequentially on the GPU, in a single thread.
    GPUBody [Type] (Body rep)
  deriving (Eq, Ord, Show)
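-- For illustration only: wrapping a size query as a host-level
-- operation.  The size name is made up for the example; 'SizeTile' is
-- one of the size classes from "Futhark.IR.GPU.Sizes".
--
-- > exampleHostOp :: HostOp op rep
-- > exampleHostOp = SizeOp $ GetSize (nameFromString "tile_size") SizeTile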
-- | A helper for defining 'TraverseOpStms'.
traverseHostOpStms ::
  Monad m =>
  OpStmsTraverser m (op rep) rep ->
  OpStmsTraverser m (HostOp op rep) rep
traverseHostOpStms _ f (SegOp segop) = SegOp <$> traverseSegOpStms f segop
traverseHostOpStms _ _ (SizeOp sizeop) = pure $ SizeOp sizeop
traverseHostOpStms onOtherOp f (OtherOp other) = OtherOp <$> onOtherOp f other
traverseHostOpStms _ f (GPUBody ts body) = do
  stms <- f mempty $ bodyStms body
  pure $ GPUBody ts $ body {bodyStms = stms}

instance (ASTRep rep, Substitute (op rep)) => Substitute (HostOp op rep) where
  substituteNames substs (SegOp op) =
    SegOp $ substituteNames substs op
  substituteNames substs (OtherOp op) =
    OtherOp $ substituteNames substs op
  substituteNames substs (SizeOp op) =
    SizeOp $ substituteNames substs op
  substituteNames substs (GPUBody ts body) =
    GPUBody (substituteNames substs ts) (substituteNames substs body)

instance (ASTRep rep, Rename (op rep)) => Rename (HostOp op rep) where
  rename (SegOp op) = SegOp <$> rename op
  rename (OtherOp op) = OtherOp <$> rename op
  rename (SizeOp op) = SizeOp <$> rename op
  rename (GPUBody ts body) = GPUBody <$> rename ts <*> rename body

instance (ASTRep rep, IsOp (op rep)) => IsOp (HostOp op rep) where
  safeOp (SegOp op) = safeOp op
  safeOp (OtherOp op) = safeOp op
  safeOp (SizeOp op) = safeOp op
  safeOp (GPUBody _ body) = all (safeExp . stmExp) $ bodyStms body

  cheapOp (SegOp op) = cheapOp op
  cheapOp (OtherOp op) = cheapOp op
  cheapOp (SizeOp op) = cheapOp op
  cheapOp (GPUBody types body) =
    -- Current GPUBody usage only benefits from hoisting kernels that
    -- transfer scalars to device.
    SQ.null (bodyStms body) && all ((== 0) . arrayRank) types

instance TypedOp (op rep) => TypedOp (HostOp op rep) where
  opType (SegOp op) = opType op
  opType (OtherOp op) = opType op
  opType (SizeOp op) = opType op
  opType (GPUBody ts _) =
    pure $ staticShapes $ map (`arrayOfRow` intConst Int64 1) ts

instance (Aliased rep, AliasedOp (op rep)) => AliasedOp (HostOp op rep) where
  opAliases (SegOp op) = opAliases op
  opAliases (OtherOp op) = opAliases op
  opAliases (SizeOp op) = opAliases op
  opAliases (GPUBody ts _) = map (const mempty) ts

  consumedInOp (SegOp op) = consumedInOp op
  consumedInOp (OtherOp op) = consumedInOp op
  consumedInOp (SizeOp op) = consumedInOp op
  consumedInOp (GPUBody _ body) = consumedInBody body

instance (ASTRep rep, FreeIn (op rep)) => FreeIn (HostOp op rep) where
  freeIn' (SegOp op) = freeIn' op
  freeIn' (OtherOp op) = freeIn' op
  freeIn' (SizeOp op) = freeIn' op
  freeIn' (GPUBody ts body) = freeIn' ts <> freeIn' body

instance CanBeAliased op => CanBeAliased (HostOp op) where
  addOpAliases aliases (SegOp op) = SegOp $ addOpAliases aliases op
  addOpAliases aliases (GPUBody ts body) =
    GPUBody ts $ Alias.analyseBody aliases body
  addOpAliases aliases (OtherOp op) = OtherOp $ addOpAliases aliases op
  addOpAliases _ (SizeOp op) = SizeOp op

instance CanBeWise op => CanBeWise (HostOp op) where
  addOpWisdom (SegOp op) = SegOp $ addOpWisdom op
  addOpWisdom (OtherOp op) = OtherOp $ addOpWisdom op
  addOpWisdom (SizeOp op) = SizeOp op
  addOpWisdom (GPUBody ts body) = GPUBody ts $ informBody body

instance (ASTRep rep, ST.IndexOp (op rep)) => ST.IndexOp (HostOp op rep) where
  indexOp vtable k (SegOp op) is = ST.indexOp vtable k op is
  indexOp vtable k (OtherOp op) is = ST.indexOp vtable k op is
  indexOp _ _ _ _ = Nothing
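-- An illustrative (hypothetical) use of 'traverseHostOpStms': a no-op
-- traversal whose callback receives the scope and statements of any
-- nested body and returns them unchanged.  A real caller would rewrite
-- the statements; the first argument handles statements inside
-- 'OtherOp'.
--
-- > noopHostOp :: Monad m => HostOp op rep -> m (HostOp op rep)
-- > noopHostOp =
-- >   traverseHostOpStms
-- >     (\_f other -> pure other)
-- >     (\_scope stms -> pure stms)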
instance (PrettyRep rep, PP.Pretty (op rep)) => PP.Pretty (HostOp op rep) where
  pretty (SegOp op) = pretty op
  pretty (OtherOp op) = pretty op
  pretty (SizeOp op) = pretty op
  pretty (GPUBody ts body) =
    "gpu"
      <+> PP.colon
      <+> ppTuple' (map pretty ts)
      <+> PP.nestedBlock "{" "}" (pretty body)

instance (OpMetrics (Op rep), OpMetrics (op rep)) => OpMetrics (HostOp op rep) where
  opMetrics (SegOp op) = opMetrics op
  opMetrics (OtherOp op) = opMetrics op
  opMetrics (SizeOp op) = opMetrics op
  opMetrics (GPUBody _ body) = inside "GPUBody" $ bodyMetrics body

instance RephraseOp op => RephraseOp (HostOp op) where
  rephraseInOp r (SegOp op) = SegOp <$> rephraseInOp r op
  rephraseInOp r (OtherOp op) = OtherOp <$> rephraseInOp r op
  rephraseInOp _ (SizeOp op) = pure $ SizeOp op
  rephraseInOp r (GPUBody ts body) = GPUBody ts <$> rephraseBody r body

checkGrid :: TC.Checkable rep => KernelGrid -> TC.TypeM rep ()
checkGrid grid = do
  TC.require [Prim int64] $ unCount $ gridNumGroups grid
  TC.require [Prim int64] $ unCount $ gridGroupSize grid

checkSegLevel ::
  TC.Checkable rep =>
  Maybe SegLevel ->
  SegLevel ->
  TC.TypeM rep ()
checkSegLevel (Just SegGroup {}) (SegThreadInGroup _virt) =
  pure ()
checkSegLevel _ (SegThreadInGroup _virt) =
  TC.bad $ TC.TypeError "ingroup SegOp not in group SegOp."
checkSegLevel (Just SegThread {}) _ =
  TC.bad $ TC.TypeError "SegOps cannot occur when already at thread level."
checkSegLevel (Just SegThreadInGroup {}) _ =
  TC.bad $ TC.TypeError "SegOps cannot occur when already at ingroup level."
checkSegLevel _ (SegThread _virt Nothing) =
  pure ()
checkSegLevel (Just _) SegThread {} =
  TC.bad $ TC.TypeError "thread-level SegOp cannot be nested"
checkSegLevel Nothing (SegThread _virt grid) =
  mapM_ checkGrid grid
checkSegLevel (Just _) SegGroup {} =
  TC.bad $ TC.TypeError "group-level SegOp cannot be nested"
checkSegLevel Nothing (SegGroup _virt grid) =
  mapM_ checkGrid grid

typeCheckHostOp ::
  TC.Checkable rep =>
  (SegLevel -> Op (Aliases rep) -> TC.TypeM rep ()) ->
  Maybe SegLevel ->
  (op (Aliases rep) -> TC.TypeM rep ()) ->
  HostOp op (Aliases rep) ->
  TC.TypeM rep ()
typeCheckHostOp checker lvl _ (SegOp op) =
  TC.checkOpWith (checker $ segLevel op) $
    typeCheckSegOp (checkSegLevel lvl) op
typeCheckHostOp _ Just {} _ GPUBody {} =
  TC.bad $ TC.TypeError "GPUBody may not be nested in SegOps."
typeCheckHostOp _ _ f (OtherOp op) = f op
typeCheckHostOp _ _ _ (SizeOp op) = typeCheckSizeOp op
typeCheckHostOp _ Nothing _ (GPUBody ts body) = do
  mapM_ TC.checkType ts
  void $ TC.checkBody body
  body_ts <-
    extendedScope
      (traverse subExpResType (bodyResult body))
      (scopeOf (bodyStms body))
  unless (body_ts == ts) . TC.bad . TC.TypeError . T.unlines $
    [ "Expected type: " <> prettyTuple ts,
      "Got body type: " <> prettyTuple body_ts
    ]
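-- For intuition about 'GPUBody' typing (see the 'TypedOp' instance and
-- the check above): the type checker compares the body's result types
-- against the declared @ts@, while on the host each result is reported
-- as a one-element array, since results are written to single-element
-- device arrays.  A sketch, assuming a body whose single result is an
-- @i32@ scalar:
--
-- > -- gpu : {i32} {...}           -- body result type: i32
-- > -- host-visible result type:   [1]i32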