{-# LANGUAGE QuasiQuotes #-}
module Futhark.CodeGen.Backends.CCUDA
( compileProg,
GC.CParts (..),
GC.asLibrary,
GC.asExecutable,
GC.asServer,
)
where
import Control.Monad
import Data.List (unzip4)
import Data.Maybe (catMaybes)
import Data.Text qualified as T
import Futhark.CodeGen.Backends.CCUDA.Boilerplate
import Futhark.CodeGen.Backends.COpenCL.Boilerplate (commonOptions, sizeLoggingCode)
import Futhark.CodeGen.Backends.GenericC qualified as GC
import Futhark.CodeGen.Backends.GenericC.Options
import Futhark.CodeGen.Backends.SimpleRep (primStorageType, toStorage)
import Futhark.CodeGen.ImpCode.OpenCL
import Futhark.CodeGen.ImpGen.CUDA qualified as ImpGen
import Futhark.IR.GPUMem hiding
( CmpSizeLe,
GetSize,
GetSizeMax,
)
import Futhark.MonadFreshNames
import Language.C.Quote.OpenCL qualified as C
import Language.C.Syntax qualified as C
import NeatInterpolation (untrimming)
compileProg :: MonadFreshNames m => T.Text -> Prog GPUMem -> m (ImpGen.Warnings, GC.CParts)
compileProg :: forall (m :: * -> *).
MonadFreshNames m =>
Text -> Prog GPUMem -> m (Warnings, CParts)
compileProg Text
version Prog GPUMem
prog = do
(Warnings
ws, Program Text
cuda_code Text
cuda_prelude Map Name KernelSafety
kernels [PrimType]
_ ParamMap
params [FailureMsg]
failures Definitions OpenCL
prog') <-
Prog GPUMem -> m (Warnings, Program)
forall (m :: * -> *).
MonadFreshNames m =>
Prog GPUMem -> m (Warnings, Program)
ImpGen.compileProg Prog GPUMem
prog
let cost_centres :: [Name]
cost_centres =
[ Name
copyDevToDev,
Name
copyDevToHost,
Name
copyHostToDev,
Name
copyScalarToDev,
Name
copyScalarFromDev
]
extra :: CompilerM OpenCL () ()
extra =
Text
-> Text
-> [Name]
-> Map Name KernelSafety
-> [FailureMsg]
-> CompilerM OpenCL () ()
generateBoilerplate
Text
cuda_code
Text
cuda_prelude
[Name]
cost_centres
Map Name KernelSafety
kernels
[FailureMsg]
failures
(Warnings
ws,)
(CParts -> (Warnings, CParts)) -> m CParts -> m (Warnings, CParts)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Text
-> Text
-> ParamMap
-> Operations OpenCL ()
-> CompilerM OpenCL () ()
-> Text
-> (Space, [Space])
-> [Option]
-> Definitions OpenCL
-> m CParts
forall (m :: * -> *) op.
MonadFreshNames m =>
Text
-> Text
-> ParamMap
-> Operations op ()
-> CompilerM op () ()
-> Text
-> (Space, [Space])
-> [Option]
-> Definitions op
-> m CParts
GC.compileProg
Text
"cuda"
Text
version
ParamMap
params
Operations OpenCL ()
operations
CompilerM OpenCL () ()
extra
Text
cuda_includes
(String -> Space
Space String
"device", [String -> Space
Space String
"device", Space
DefaultSpace])
[Option]
cliOptions
Definitions OpenCL
prog'
where
operations :: GC.Operations OpenCL ()
operations :: Operations OpenCL ()
operations =
Operations OpenCL ()
forall op s. Operations op s
GC.defaultOperations
{ opsWriteScalar :: WriteScalar OpenCL ()
GC.opsWriteScalar = WriteScalar OpenCL ()
writeCUDAScalar,
opsReadScalar :: ReadScalar OpenCL ()
GC.opsReadScalar = ReadScalar OpenCL ()
readCUDAScalar,
opsAllocate :: Allocate OpenCL ()
GC.opsAllocate = Allocate OpenCL ()
allocateCUDABuffer,
opsDeallocate :: Allocate OpenCL ()
GC.opsDeallocate = Allocate OpenCL ()
deallocateCUDABuffer,
opsCopy :: Copy OpenCL ()
GC.opsCopy = Copy OpenCL ()
copyCUDAMemory,
opsMemoryType :: MemoryType OpenCL ()
GC.opsMemoryType = MemoryType OpenCL ()
cudaMemoryType,
opsCompiler :: OpCompiler OpenCL ()
GC.opsCompiler = OpCompiler OpenCL ()
callKernel,
opsFatMemory :: Bool
GC.opsFatMemory = Bool
True,
opsCritical :: ([BlockItem], [BlockItem])
GC.opsCritical =
( [C.citems|CUDA_SUCCEED_FATAL(cuCtxPushCurrent(ctx->cu_ctx));|],
[C.citems|CUDA_SUCCEED_FATAL(cuCtxPopCurrent(&ctx->cu_ctx));|]
)
}
cuda_includes :: Text
cuda_includes =
[untrimming|
#include <cuda.h>
#include <cuda_runtime.h>
#include <nvrtc.h>
|]
cliOptions :: [Option]
cliOptions :: [Option]
cliOptions =
[Option]
commonOptions
[Option] -> [Option] -> [Option]
forall a. [a] -> [a] -> [a]
++ [ Option
{ optionLongName :: String
optionLongName = String
"dump-cuda",
optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"FILE",
optionDescription :: String
optionDescription = String
"Dump the embedded CUDA kernels to the indicated file.",
optionAction :: Stm
optionAction =
[C.cstm|{futhark_context_config_dump_program_to(cfg, optarg);
entry_point = NULL;}|]
},
Option
{ optionLongName :: String
optionLongName = String
"load-cuda",
optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"FILE",
optionDescription :: String
optionDescription = String
"Instead of using the embedded CUDA kernels, load them from the indicated file.",
optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_load_program_from(cfg, optarg);|]
},
Option
{ optionLongName :: String
optionLongName = String
"dump-ptx",
optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"FILE",
optionDescription :: String
optionDescription = String
"Dump the PTX-compiled version of the embedded kernels to the indicated file.",
optionAction :: Stm
optionAction =
[C.cstm|{futhark_context_config_dump_ptx_to(cfg, optarg);
entry_point = NULL;}|]
},
Option
{ optionLongName :: String
optionLongName = String
"load-ptx",
optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"FILE",
optionDescription :: String
optionDescription = String
"Load PTX code from the indicated file.",
optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_load_ptx_from(cfg, optarg);|]
},
Option
{ optionLongName :: String
optionLongName = String
"nvrtc-option",
optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"OPT",
optionDescription :: String
optionDescription = String
"Add an additional build option to the string passed to NVRTC.",
optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_add_nvrtc_option(cfg, optarg);|]
},
Option
{ optionLongName :: String
optionLongName = String
"profile",
optionShortName :: Maybe Char
optionShortName = Char -> Maybe Char
forall a. a -> Maybe a
Just Char
'P',
optionArgument :: OptionArgument
optionArgument = OptionArgument
NoArgument,
optionDescription :: String
optionDescription = String
"Gather profiling data while executing and print out a summary at the end.",
optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_profiling(cfg, 1);|]
}
]
writeCUDAScalar :: GC.WriteScalar OpenCL ()
writeCUDAScalar :: WriteScalar OpenCL ()
writeCUDAScalar Exp
mem Exp
idx Type
t String
"device" Volatility
_ val :: Exp
val@C.Const {} = do
VName
val' <- String -> CompilerM OpenCL () VName
forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"write_static"
let ([BlockItem]
bef, [BlockItem]
aft) = Name -> ([BlockItem], [BlockItem])
profilingEnclosure Name
copyScalarToDev
BlockItem -> CompilerM OpenCL () ()
forall op s. BlockItem -> CompilerM op s ()
GC.item
[C.citem|{static $ty:t $id:val' = $exp:val;
$items:bef
CUDA_SUCCEED_OR_RETURN(
cuMemcpyHtoDAsync($exp:mem + $exp:idx * sizeof($ty:t),
&$id:val',
sizeof($ty:t),
ctx->stream));
$items:aft
}|]
writeCUDAScalar Exp
mem Exp
idx Type
t String
"device" Volatility
_ Exp
val = do
VName
val' <- String -> CompilerM OpenCL () VName
forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"write_tmp"
let ([BlockItem]
bef, [BlockItem]
aft) = Name -> ([BlockItem], [BlockItem])
profilingEnclosure Name
copyScalarToDev
BlockItem -> CompilerM OpenCL () ()
forall op s. BlockItem -> CompilerM op s ()
GC.item
[C.citem|{$ty:t $id:val' = $exp:val;
$items:bef
CUDA_SUCCEED_OR_RETURN(
cuMemcpyHtoD($exp:mem + $exp:idx * sizeof($ty:t),
&$id:val',
sizeof($ty:t)));
$items:aft
}|]
writeCUDAScalar Exp
_ Exp
_ Type
_ String
space Volatility
_ Exp
_ =
String -> CompilerM OpenCL () ()
forall a. HasCallStack => String -> a
error (String -> CompilerM OpenCL () ())
-> String -> CompilerM OpenCL () ()
forall a b. (a -> b) -> a -> b
$ String
"Cannot write to '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' memory space."
readCUDAScalar :: GC.ReadScalar OpenCL ()
readCUDAScalar :: ReadScalar OpenCL ()
readCUDAScalar Exp
mem Exp
idx Type
t String
"device" Volatility
_ = do
VName
val <- String -> CompilerM OpenCL () VName
forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"read_res"
let ([BlockItem]
bef, [BlockItem]
aft) = Name -> ([BlockItem], [BlockItem])
profilingEnclosure Name
copyScalarFromDev
(BlockItem -> CompilerM OpenCL () ())
-> [BlockItem] -> CompilerM OpenCL () ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_
BlockItem -> CompilerM OpenCL () ()
forall op s. BlockItem -> CompilerM op s ()
GC.item
[C.citems|
$ty:t $id:val;
{
$items:bef
CUDA_SUCCEED_OR_RETURN(
cuMemcpyDtoH(&$id:val,
$exp:mem + $exp:idx * sizeof($ty:t),
sizeof($ty:t)));
$items:aft
}
|]
Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
[C.cstm|if (ctx->failure_is_an_option && futhark_context_sync(ctx) != 0)
{ return 1; }|]
Exp -> CompilerM OpenCL () Exp
forall a. a -> CompilerM OpenCL () a
forall (f :: * -> *) a. Applicative f => a -> f a
pure [C.cexp|$id:val|]
readCUDAScalar Exp
_ Exp
_ Type
_ String
space Volatility
_ =
String -> CompilerM OpenCL () Exp
forall a. HasCallStack => String -> a
error (String -> CompilerM OpenCL () Exp)
-> String -> CompilerM OpenCL () Exp
forall a b. (a -> b) -> a -> b
$ String
"Cannot write to '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' memory space."
allocateCUDABuffer :: GC.Allocate OpenCL ()
allocateCUDABuffer :: Allocate OpenCL ()
allocateCUDABuffer Exp
mem Exp
size Exp
tag String
"device" =
Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
[C.cstm|ctx->error =
CUDA_SUCCEED_NONFATAL(cuda_alloc(ctx, ctx->log,
(size_t)$exp:size, $exp:tag,
&$exp:mem, (size_t*)&$exp:size));|]
allocateCUDABuffer Exp
_ Exp
_ Exp
_ String
space =
String -> CompilerM OpenCL () ()
forall a. HasCallStack => String -> a
error (String -> CompilerM OpenCL () ())
-> String -> CompilerM OpenCL () ()
forall a b. (a -> b) -> a -> b
$ String
"Cannot allocate in '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' memory space."
deallocateCUDABuffer :: GC.Deallocate OpenCL ()
deallocateCUDABuffer :: Allocate OpenCL ()
deallocateCUDABuffer Exp
mem Exp
size Exp
tag String
"device" =
Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|CUDA_SUCCEED_OR_RETURN(cuda_free(ctx, $exp:mem, $exp:size, $exp:tag));|]
deallocateCUDABuffer Exp
_ Exp
_ Exp
_ String
space =
String -> CompilerM OpenCL () ()
forall a. HasCallStack => String -> a
error (String -> CompilerM OpenCL () ())
-> String -> CompilerM OpenCL () ()
forall a b. (a -> b) -> a -> b
$ String
"Cannot deallocate in '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' memory space."
copyCUDAMemory :: GC.Copy OpenCL ()
copyCUDAMemory :: Copy OpenCL ()
copyCUDAMemory CopyBarrier
b Exp
dstmem Exp
dstidx Space
dstSpace Exp
srcmem Exp
srcidx Space
srcSpace Exp
nbytes = do
let (Exp
copy, Name
prof) = CopyBarrier -> Space -> Space -> (Exp, Name)
memcpyFun CopyBarrier
b Space
dstSpace Space
srcSpace
([BlockItem]
bef, [BlockItem]
aft) = Name -> ([BlockItem], [BlockItem])
profilingEnclosure Name
prof
BlockItem -> CompilerM OpenCL () ()
forall op s. BlockItem -> CompilerM op s ()
GC.item
[C.citem|{$items:bef CUDA_SUCCEED_OR_RETURN($exp:copy); $items:aft}|]
where
dst :: Exp
dst = [C.cexp|$exp:dstmem + $exp:dstidx|]
src :: Exp
src = [C.cexp|$exp:srcmem + $exp:srcidx|]
memcpyFun :: CopyBarrier -> Space -> Space -> (Exp, Name)
memcpyFun CopyBarrier
GC.CopyBarrier Space
DefaultSpace (Space String
"device") =
([C.cexp|cuMemcpyDtoH($exp:dst, $exp:src, $exp:nbytes)|], Name
copyDevToHost)
memcpyFun CopyBarrier
GC.CopyBarrier (Space String
"device") Space
DefaultSpace =
([C.cexp|cuMemcpyHtoD($exp:dst, $exp:src, $exp:nbytes)|], Name
copyHostToDev)
memcpyFun CopyBarrier
_ (Space String
"device") (Space String
"device") =
([C.cexp|cuMemcpy($exp:dst, $exp:src, $exp:nbytes)|], Name
copyDevToDev)
memcpyFun CopyBarrier
GC.CopyNoBarrier Space
DefaultSpace (Space String
"device") =
([C.cexp|cuMemcpyDtoHAsync($exp:dst, $exp:src, $exp:nbytes, ctx->stream)|], Name
copyDevToHost)
memcpyFun CopyBarrier
GC.CopyNoBarrier (Space String
"device") Space
DefaultSpace =
([C.cexp|cuMemcpyHtoDAsync($exp:dst, $exp:src, $exp:nbytes, ctx->stream)|], Name
copyHostToDev)
memcpyFun CopyBarrier
_ Space
_ Space
_ =
String -> (Exp, Name)
forall a. HasCallStack => String -> a
error (String -> (Exp, Name)) -> String -> (Exp, Name)
forall a b. (a -> b) -> a -> b
$
String
"Cannot copy to '"
String -> String -> String
forall a. [a] -> [a] -> [a]
++ Space -> String
forall a. Show a => a -> String
show Space
dstSpace
String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' from '"
String -> String -> String
forall a. [a] -> [a] -> [a]
++ Space -> String
forall a. Show a => a -> String
show Space
srcSpace
String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"'."
cudaMemoryType :: GC.MemoryType OpenCL ()
cudaMemoryType :: MemoryType OpenCL ()
cudaMemoryType String
"device" = Type -> CompilerM OpenCL () Type
forall a. a -> CompilerM OpenCL () a
forall (f :: * -> *) a. Applicative f => a -> f a
pure [C.cty|typename CUdeviceptr|]
cudaMemoryType String
space =
MemoryType OpenCL ()
forall a. HasCallStack => String -> a
error MemoryType OpenCL () -> MemoryType OpenCL ()
forall a b. (a -> b) -> a -> b
$ String
"CUDA backend does not support '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' memory space."
kernelConstToExp :: KernelConst -> C.Exp
kernelConstToExp :: KernelConst -> Exp
kernelConstToExp (SizeConst Name
key) =
[C.cexp|*ctx->tuning_params.$id:key|]
kernelConstToExp (SizeMaxConst SizeClass
size_class) =
[C.cexp|ctx->$id:field|]
where
field :: String
field = String
"max_" String -> String -> String
forall a. Semigroup a => a -> a -> a
<> SizeClass -> String
cudaSizeClass SizeClass
size_class
cudaSizeClass :: SizeClass -> String
cudaSizeClass SizeThreshold {} = String
"threshold"
cudaSizeClass SizeClass
SizeGroup = String
"block_size"
cudaSizeClass SizeClass
SizeNumGroups = String
"grid_size"
cudaSizeClass SizeClass
SizeTile = String
"tile_size"
cudaSizeClass SizeClass
SizeRegTile = String
"reg_tile_size"
cudaSizeClass SizeClass
SizeLocalMemory = String
"shared_memory"
cudaSizeClass (SizeBespoke Name
x Int64
_) = Name -> String
forall a. Pretty a => a -> String
prettyString Name
x
compileGroupDim :: GroupDim -> GC.CompilerM op s C.Exp
compileGroupDim :: forall op s. GroupDim -> CompilerM op s Exp
compileGroupDim (Left Exp
e) = Exp -> CompilerM op s Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
e
compileGroupDim (Right KernelConst
kc) = Exp -> CompilerM op s Exp
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Exp -> CompilerM op s Exp) -> Exp -> CompilerM op s Exp
forall a b. (a -> b) -> a -> b
$ KernelConst -> Exp
kernelConstToExp KernelConst
kc
callKernel :: GC.OpCompiler OpenCL ()
callKernel :: OpCompiler OpenCL ()
callKernel (GetSize VName
v Name
key) = do
let e :: Exp
e = KernelConst -> Exp
kernelConstToExp (KernelConst -> Exp) -> KernelConst -> Exp
forall a b. (a -> b) -> a -> b
$ Name -> KernelConst
SizeConst Name
key
Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:e;|]
callKernel (CmpSizeLe VName
v Name
key Exp
x) = do
let e :: Exp
e = KernelConst -> Exp
kernelConstToExp (KernelConst -> Exp) -> KernelConst -> Exp
forall a b. (a -> b) -> a -> b
$ Name -> KernelConst
SizeConst Name
key
Exp
x' <- Exp -> CompilerM OpenCL () Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
x
Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:e <= $exp:x';|]
VName -> Name -> Exp -> CompilerM OpenCL () ()
forall op. VName -> Name -> Exp -> CompilerM op () ()
sizeLoggingCode VName
v Name
key Exp
x'
callKernel (GetSizeMax VName
v SizeClass
size_class) = do
let e :: Exp
e = KernelConst -> Exp
kernelConstToExp (KernelConst -> Exp) -> KernelConst -> Exp
forall a b. (a -> b) -> a -> b
$ SizeClass -> KernelConst
SizeMaxConst SizeClass
size_class
Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:e;|]
callKernel (LaunchKernel KernelSafety
safety Name
kernel_name [KernelArg]
args [Exp]
num_blocks [GroupDim]
block_size) = do
([Param]
arg_params, [Initializer]
arg_params_inits, [Exp]
call_args, [Maybe (VName, VName)]
shared_vars) <-
[(Param, Initializer, Exp, Maybe (VName, VName))]
-> ([Param], [Initializer], [Exp], [Maybe (VName, VName)])
forall a b c d. [(a, b, c, d)] -> ([a], [b], [c], [d])
unzip4 ([(Param, Initializer, Exp, Maybe (VName, VName))]
-> ([Param], [Initializer], [Exp], [Maybe (VName, VName)]))
-> CompilerM
OpenCL () [(Param, Initializer, Exp, Maybe (VName, VName))]
-> CompilerM
OpenCL () ([Param], [Initializer], [Exp], [Maybe (VName, VName)])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Int
-> KernelArg
-> CompilerM
OpenCL () (Param, Initializer, Exp, Maybe (VName, VName)))
-> [Int]
-> [KernelArg]
-> CompilerM
OpenCL () [(Param, Initializer, Exp, Maybe (VName, VName))]
forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m [c]
zipWithM Int
-> KernelArg
-> CompilerM
OpenCL () (Param, Initializer, Exp, Maybe (VName, VName))
forall {a} {op} {s}.
Show a =>
a
-> KernelArg
-> CompilerM op s (Param, Initializer, Exp, Maybe (VName, VName))
mkArgs [(Int
0 :: Int) ..] [KernelArg]
args
let ([VName]
shared_sizes, [VName]
shared_offsets) = [(VName, VName)] -> ([VName], [VName])
forall a b. [(a, b)] -> ([a], [b])
unzip ([(VName, VName)] -> ([VName], [VName]))
-> [(VName, VName)] -> ([VName], [VName])
forall a b. (a -> b) -> a -> b
$ [Maybe (VName, VName)] -> [(VName, VName)]
forall a. [Maybe a] -> [a]
catMaybes [Maybe (VName, VName)]
shared_vars
shared_offsets_sc :: [Exp]
shared_offsets_sc = [VName] -> [Exp]
mkOffsets [VName]
shared_sizes
shared_args :: [(VName, Exp)]
shared_args = [VName] -> [Exp] -> [(VName, Exp)]
forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
shared_offsets [Exp]
shared_offsets_sc
shared_bytes :: Exp
shared_bytes = [Exp] -> Exp
forall a. HasCallStack => [a] -> a
last [Exp]
shared_offsets_sc
[(VName, Exp)]
-> ((VName, Exp) -> CompilerM OpenCL () ())
-> CompilerM OpenCL () ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [(VName, Exp)]
shared_args (((VName, Exp) -> CompilerM OpenCL () ())
-> CompilerM OpenCL () ())
-> ((VName, Exp) -> CompilerM OpenCL () ())
-> CompilerM OpenCL () ()
forall a b. (a -> b) -> a -> b
$ \(VName
arg, Exp
offset) ->
InitGroup -> CompilerM OpenCL () ()
forall op s. InitGroup -> CompilerM op s ()
GC.decl [C.cdecl|unsigned int $id:arg = $exp:offset;|]
(Exp
grid_x, Exp
grid_y, Exp
grid_z) <- [Exp] -> (Exp, Exp, Exp)
mkDims ([Exp] -> (Exp, Exp, Exp))
-> CompilerM OpenCL () [Exp] -> CompilerM OpenCL () (Exp, Exp, Exp)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Exp -> CompilerM OpenCL () Exp)
-> [Exp] -> CompilerM OpenCL () [Exp]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM Exp -> CompilerM OpenCL () Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp [Exp]
num_blocks
(Exp
block_x, Exp
block_y, Exp
block_z) <- [Exp] -> (Exp, Exp, Exp)
mkDims ([Exp] -> (Exp, Exp, Exp))
-> CompilerM OpenCL () [Exp] -> CompilerM OpenCL () (Exp, Exp, Exp)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (GroupDim -> CompilerM OpenCL () Exp)
-> [GroupDim] -> CompilerM OpenCL () [Exp]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM GroupDim -> CompilerM OpenCL () Exp
forall op s. GroupDim -> CompilerM op s Exp
compileGroupDim [GroupDim]
block_size
let need_perm :: Bool
need_perm = [Exp] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Exp]
num_blocks Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
3
Name
kernel_fname <- Name
-> KernelSafety
-> Bool
-> [Param]
-> [Initializer]
-> CompilerM OpenCL () Name
forall op s.
Name
-> KernelSafety
-> Bool
-> [Param]
-> [Initializer]
-> CompilerM op s Name
genKernelFunction Name
kernel_name KernelSafety
safety Bool
need_perm [Param]
arg_params [Initializer]
arg_params_inits
Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
[C.cstm|{
err = $id:kernel_fname(ctx,
$exp:grid_x,$exp:grid_y,$exp:grid_z,
$exp:block_x, $exp:block_y, $exp:block_z,
$exp:shared_bytes,
$args:call_args);
if (err != FUTHARK_SUCCESS) { goto cleanup; }
}|]
Bool -> CompilerM OpenCL () () -> CompilerM OpenCL () ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (KernelSafety
safety KernelSafety -> KernelSafety -> Bool
forall a. Ord a => a -> a -> Bool
>= KernelSafety
SafetyFull) (CompilerM OpenCL () () -> CompilerM OpenCL () ())
-> CompilerM OpenCL () () -> CompilerM OpenCL () ()
forall a b. (a -> b) -> a -> b
$
Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|ctx->failure_is_an_option = 1;|]
where
mkDims :: [Exp] -> (Exp, Exp, Exp)
mkDims [] = ([C.cexp|0|], [C.cexp|0|], [C.cexp|0|])
mkDims [Exp
x] = (Exp
x, [C.cexp|1|], [C.cexp|1|])
mkDims [Exp
x, Exp
y] = (Exp
x, Exp
y, [C.cexp|1|])
mkDims (Exp
x : Exp
y : Exp
z : [Exp]
_) = (Exp
x, Exp
y, Exp
z)
addExp :: a -> a -> Exp
addExp a
x a
y = [C.cexp|$exp:x + $exp:y|]
alignExp :: a -> Exp
alignExp a
e = [C.cexp|$exp:e + ((8 - ($exp:e % 8)) % 8)|]
mkOffsets :: [VName] -> [Exp]
mkOffsets = (Exp -> VName -> Exp) -> Exp -> [VName] -> [Exp]
forall b a. (b -> a -> b) -> b -> [a] -> [b]
scanl (\Exp
a VName
b -> Exp
a Exp -> Exp -> Exp
forall {a} {a}. (ToExp a, ToExp a) => a -> a -> Exp
`addExp` VName -> Exp
forall {a}. ToExp a => a -> Exp
alignExp VName
b) [C.cexp|0|]
mkArgs :: a
-> KernelArg
-> CompilerM op s (Param, Initializer, Exp, Maybe (VName, VName))
mkArgs a
i (ValueKArg Exp
e PrimType
t) = do
Exp
e' <- Exp -> CompilerM op s Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
e
(Param, Initializer, Exp, Maybe (VName, VName))
-> CompilerM op s (Param, Initializer, Exp, Maybe (VName, VName))
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
( [C.cparam|$ty:(primStorageType t) $id:("arg" <> show i)|],
[C.cinit|&$id:("arg" <> show i)|],
PrimType -> Exp -> Exp
toStorage PrimType
t Exp
e',
Maybe (VName, VName)
forall a. Maybe a
Nothing
)
mkArgs a
i (MemKArg VName
v) = do
Exp
v' <- VName -> CompilerM op s Exp
forall op s. VName -> CompilerM op s Exp
GC.rawMem VName
v
(Param, Initializer, Exp, Maybe (VName, VName))
-> CompilerM op s (Param, Initializer, Exp, Maybe (VName, VName))
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
( [C.cparam|typename CUdeviceptr $id:("arg" <> show i)|],
[C.cinit|&$id:("arg" <> show i)|],
Exp
v',
Maybe (VName, VName)
forall a. Maybe a
Nothing
)
mkArgs a
i (SharedMemoryKArg (Count Exp
c)) = do
Exp
num_bytes <- Exp -> CompilerM op s Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
c
VName
size <- String -> CompilerM op s VName
forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"shared_size"
VName
offset <- String -> CompilerM op s VName
forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"shared_offset"
InitGroup -> CompilerM op s ()
forall op s. InitGroup -> CompilerM op s ()
GC.decl [C.cdecl|unsigned int $id:size = $exp:num_bytes;|]
(Param, Initializer, Exp, Maybe (VName, VName))
-> CompilerM op s (Param, Initializer, Exp, Maybe (VName, VName))
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
( [C.cparam|unsigned int $id:("arg" <> show i)|],
[C.cinit|&$id:("arg" <> show i)|],
[C.cexp|$id:offset|],
(VName, VName) -> Maybe (VName, VName)
forall a. a -> Maybe a
Just (VName
size, VName
offset)
)
genKernelFunction ::
KernelName ->
KernelSafety ->
Bool ->
[C.Param] ->
[C.Initializer] ->
GC.CompilerM op s Name
genKernelFunction :: forall op s.
Name
-> KernelSafety
-> Bool
-> [Param]
-> [Initializer]
-> CompilerM op s Name
genKernelFunction Name
kernel_name KernelSafety
safety Bool
need_perm [Param]
arg_params [Initializer]
arg_params_inits = do
let kernel_fname :: Name
kernel_fname = Name
"gpu_kernel_" Name -> Name -> Name
forall a. Semigroup a => a -> a -> a
<> Name
kernel_name
([BlockItem]
bef, [BlockItem]
aft) = Name -> ([BlockItem], [BlockItem])
profilingEnclosure Name
kernel_name
perm_args :: [Initializer]
perm_args
| Bool
need_perm = [[C.cinit|&perm[0]|], [C.cinit|&perm[1]|], [C.cinit|&perm[2]|]]
| Bool
otherwise = []
failure_args :: [Initializer]
failure_args =
Int -> [Initializer] -> [Initializer]
forall a. Int -> [a] -> [a]
take
(KernelSafety -> Int
numFailureParams KernelSafety
safety)
[ [C.cinit|&ctx->global_failure|],
[C.cinit|&ctx->failure_is_an_option|],
[C.cinit|&ctx->global_failure_args|]
]
Definition -> CompilerM op s ()
forall op s. Definition -> CompilerM op s ()
GC.libDecl
[C.cedecl|static int $id:kernel_fname(
struct futhark_context* ctx, unsigned int grid_x, unsigned int grid_y, unsigned int grid_z,
unsigned int block_x, unsigned int block_y, unsigned int block_z,
unsigned int shared_bytes, $params:arg_params) {
if (grid_x * grid_y * grid_z * block_x * block_y * block_z != 0) {
int perm[3] = { 0, 1, 2 };
if (grid_y >= (1<<16)) {
perm[1] = perm[0];
perm[0] = 1;
}
if (grid_z >= (1<<16)) {
perm[2] = perm[0];
perm[0] = 2;
}
size_t grid[3];
grid[perm[0]] = grid_x;
grid[perm[1]] = grid_y;
grid[perm[2]] = grid_z;
void *all_args[] = { $inits:(perm_args ++ failure_args ++ arg_params_inits) };
typename int64_t time_start = 0, time_end = 0;
if (ctx->debugging) {
fprintf(ctx->log, "Launching %s with grid size [%d, %d, %d] and block size [%d, %d, %d]; shared memory: %d bytes.\n",
$string:(prettyString kernel_name),
grid_x, grid_y, grid_z,
block_x, block_y, block_z,
shared_bytes);
time_start = get_wall_time();
}
$items:bef
CUDA_SUCCEED_OR_RETURN(
cuLaunchKernel(ctx->program->$id:kernel_name,
grid[0], grid[1], grid[2],
block_x, block_y, block_z,
shared_bytes, ctx->stream,
all_args, NULL));
$items:aft
if (ctx->debugging) {
CUDA_SUCCEED_FATAL(cuCtxSynchronize());
time_end = get_wall_time();
fprintf(ctx->log, "Kernel %s runtime: %ldus\n",
$string:(prettyString kernel_name), time_end - time_start);
}
}
return FUTHARK_SUCCESS;
}|]
Name -> CompilerM op s Name
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Name
kernel_fname