{-# LANGUAGE QuasiQuotes #-}

-- | C code generation for GPU, in general.
--
-- This module generates codes that targets the tiny GPU API
-- abstraction layer we define in the runtime system.
module Futhark.CodeGen.Backends.GPU
  ( gpuOperations,
    gpuOptions,
    generateGPUBoilerplate,
  )
where

import Control.Monad
import Control.Monad.Identity
import Data.Bifunctor (bimap)
import Data.Map qualified as M
import Data.Text qualified as T
import Futhark.CodeGen.Backends.GenericC qualified as GC
import Futhark.CodeGen.Backends.GenericC.Options
import Futhark.CodeGen.Backends.GenericC.Pretty (expText, idText)
import Futhark.CodeGen.Backends.SimpleRep (primStorageType, toStorage)
import Futhark.CodeGen.ImpCode.OpenCL
import Futhark.CodeGen.RTS.C (gpuH, gpuPrototypesH)
import Futhark.MonadFreshNames
import Futhark.Util (chunk)
import Futhark.Util.Pretty (prettyTextOneLine)
import Language.C.Quote.OpenCL qualified as C
import Language.C.Syntax qualified as C

genKernelFunction ::
  KernelName ->
  KernelSafety ->
  [C.Param] ->
  [(C.Exp, C.Exp)] ->
  GC.CompilerM op s Name
genKernelFunction :: forall op s.
Name
-> KernelSafety -> [Param] -> [(Exp, Exp)] -> CompilerM op s Name
genKernelFunction Name
kernel_name KernelSafety
safety [Param]
arg_params [(Exp, Exp)]
arg_set = do
  let kernel_fname :: Name
kernel_fname = Name
"gpu_kernel_" forall a. Semigroup a => a -> a -> a
<> Name
kernel_name
  forall op s. Definition -> CompilerM op s ()
GC.libDecl
    [C.cedecl|static int $id:kernel_fname
               (struct futhark_context* ctx,
                unsigned int grid_x, unsigned int grid_y, unsigned int grid_z,
                unsigned int block_x, unsigned int block_y, unsigned int block_z,
                unsigned int shared_bytes, $params:arg_params) {
    if (grid_x * grid_y * grid_z * block_x * block_y * block_z != 0) {
      void* args[$int:num_args] = { $inits:(failure_inits<>args_inits) };
      size_t args_sizes[$int:num_args] = { $inits:(failure_sizes<>args_sizes) };
      return gpu_launch_kernel(ctx, ctx->program->$id:kernel_name,
                               $string:(prettyString kernel_name),
                               (const typename int32_t[]){grid_x, grid_y, grid_z},
                               (const typename int32_t[]){block_x, block_y, block_z},
                               shared_bytes,
                               $int:num_args, args, args_sizes);
    }
    return FUTHARK_SUCCESS;
  }|]

  forall (f :: * -> *) a. Applicative f => a -> f a
pure Name
kernel_fname
  where
    num_args :: Int
num_args = KernelSafety -> Int
numFailureParams KernelSafety
safety forall a. Num a => a -> a -> a
+ forall (t :: * -> *) a. Foldable t => t a -> Int
length [(Exp, Exp)]
arg_set
    expToInit :: a -> Initializer
expToInit a
e = [C.cinit|$exp:e|]
    ([Initializer]
args_sizes, [Initializer]
args_inits) = forall (p :: * -> * -> *) a b c d.
Bifunctor p =>
(a -> b) -> (c -> d) -> p a c -> p b d
bimap (forall a b. (a -> b) -> [a] -> [b]
map forall {a}. ToExp a => a -> Initializer
expToInit) (forall a b. (a -> b) -> [a] -> [b]
map forall {a}. ToExp a => a -> Initializer
expToInit) forall a b. (a -> b) -> a -> b
$ forall a b. [(a, b)] -> ([a], [b])
unzip [(Exp, Exp)]
arg_set
    ([Initializer]
failure_inits, [Initializer]
failure_sizes) =
      forall a b. [(a, b)] -> ([a], [b])
unzip forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. Int -> [a] -> [a]
take (KernelSafety -> Int
numFailureParams KernelSafety
safety) forall a b. (a -> b) -> a -> b
$
        [ ([C.cinit|&ctx->global_failure|], [C.cinit|sizeof(ctx->global_failure)|]),
          ([C.cinit|&ctx->failure_is_an_option|], [C.cinit|sizeof(ctx->failure_is_an_option)|]),
          ([C.cinit|&ctx->global_failure_args|], [C.cinit|sizeof(ctx->global_failure_args)|])
        ]

getParamByKey :: Name -> C.Exp
getParamByKey :: Name -> Exp
getParamByKey Name
key = [C.cexp|*ctx->tuning_params.$id:key|]

kernelConstToExp :: KernelConst -> C.Exp
kernelConstToExp :: KernelConst -> Exp
kernelConstToExp (SizeConst Name
key SizeClass
_) =
  Name -> Exp
getParamByKey Name
key
kernelConstToExp (SizeMaxConst SizeClass
size_class) =
  [C.cexp|ctx->$id:field|]
  where
    field :: String
field = String
"max_" forall a. Semigroup a => a -> a -> a
<> forall a. Pretty a => a -> String
prettyString SizeClass
size_class

compileGroupDim :: GroupDim -> GC.CompilerM op s C.Exp
compileGroupDim :: forall op s. GroupDim -> CompilerM op s Exp
compileGroupDim (Left Exp
e) = forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
e
compileGroupDim (Right KernelConst
kc) = forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ KernelConst -> Exp
kernelConstToExp KernelConst
kc

genLaunchKernel ::
  KernelSafety ->
  KernelName ->
  Count Bytes (TExp Int64) ->
  [KernelArg] ->
  [Exp] ->
  [GroupDim] ->
  GC.CompilerM op s ()
genLaunchKernel :: forall op s.
KernelSafety
-> Name
-> Count Bytes (TExp Int64)
-> [KernelArg]
-> [Exp]
-> [GroupDim]
-> CompilerM op s ()
genLaunchKernel KernelSafety
safety Name
kernel_name Count Bytes (TExp Int64)
local_memory [KernelArg]
args [Exp]
num_groups [GroupDim]
group_size = do
  ([Param]
arg_params, [(Exp, Exp)]
arg_params_inits, [Exp]
call_args) <-
    forall a b c. [(a, b, c)] -> ([a], [b], [c])
unzip3 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m [c]
zipWithM forall {p} {op} {s}.
Show p =>
p -> KernelArg -> CompilerM op s (Param, (Exp, Exp), Exp)
mkArgs [(Int
0 :: Int) ..] [KernelArg]
args

  (Exp
grid_x, Exp
grid_y, Exp
grid_z) <- [Exp] -> (Exp, Exp, Exp)
mkDims forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM forall op s. Exp -> CompilerM op s Exp
GC.compileExp [Exp]
num_groups
  (Exp
group_x, Exp
group_y, Exp
group_z) <- [Exp] -> (Exp, Exp, Exp)
mkDims forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM forall op s. GroupDim -> CompilerM op s Exp
compileGroupDim [GroupDim]
group_size

  Name
kernel_fname <- forall op s.
Name
-> KernelSafety -> [Param] -> [(Exp, Exp)] -> CompilerM op s Name
genKernelFunction Name
kernel_name KernelSafety
safety [Param]
arg_params [(Exp, Exp)]
arg_params_inits

  Exp
local_memory' <- forall op s. Exp -> CompilerM op s Exp
GC.compileExp forall a b. (a -> b) -> a -> b
$ forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped forall a b. (a -> b) -> a -> b
$ forall {k} (u :: k) e. Count u e -> e
unCount Count Bytes (TExp Int64)
local_memory

  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|{
           err = $id:kernel_fname(ctx,
                                  $exp:grid_x, $exp:grid_y, $exp:grid_z,
                                  $exp:group_x, $exp:group_y, $exp:group_z,
                                  $exp:local_memory',
                                  $args:call_args);
           if (err != FUTHARK_SUCCESS) { goto cleanup; }
           }|]

  forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (KernelSafety
safety forall a. Ord a => a -> a -> Bool
>= KernelSafety
SafetyFull) forall a b. (a -> b) -> a -> b
$
    forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|ctx->failure_is_an_option = 1;|]
  where
    mkDims :: [Exp] -> (Exp, Exp, Exp)
mkDims [] = ([C.cexp|0|], [C.cexp|0|], [C.cexp|0|])
    mkDims [Exp
x] = (Exp
x, [C.cexp|1|], [C.cexp|1|])
    mkDims [Exp
x, Exp
y] = (Exp
x, Exp
y, [C.cexp|1|])
    mkDims (Exp
x : Exp
y : Exp
z : [Exp]
_) = (Exp
x, Exp
y, Exp
z)

    mkArgs :: p -> KernelArg -> CompilerM op s (Param, (Exp, Exp), Exp)
mkArgs p
i (ValueKArg Exp
e PrimType
t) = do
      let arg :: String
arg = String
"arg" forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> String
show p
i
      Exp
e' <- forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
e
      forall (f :: * -> *) a. Applicative f => a -> f a
pure
        ( [C.cparam|$ty:(primStorageType t) $id:arg|],
          ([C.cexp|sizeof($id:arg)|], [C.cexp|&$id:arg|]),
          PrimType -> Exp -> Exp
toStorage PrimType
t Exp
e'
        )
    mkArgs p
i (MemKArg VName
v) = do
      let arg :: String
arg = String
"arg" forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> String
show p
i
      Exp
v' <- forall op s. VName -> CompilerM op s Exp
GC.rawMem VName
v
      forall (f :: * -> *) a. Applicative f => a -> f a
pure
        ( [C.cparam|typename gpu_mem $id:arg|],
          ([C.cexp|sizeof($id:arg)|], [C.cexp|&$id:arg|]),
          Exp
v'
        )

callKernel :: GC.OpCompiler OpenCL ()
callKernel :: OpCompiler OpenCL ()
callKernel (GetSize VName
v Name
key) =
  forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:(getParamByKey key);|]
callKernel (CmpSizeLe VName
v Name
key Exp
x) = do
  Exp
x' <- forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
x
  forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:(getParamByKey key) <= $exp:x';|]
  -- Output size information if logging is enabled.  The autotuner
  -- depends on the format of this output, so use caution if changing
  -- it.
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|if (ctx->logging) {
    fprintf(ctx->log, "Compared %s <= %ld: %s.\n", $string:(T.unpack (prettyTextOneLine key)), (long)$exp:x', $id:v ? "true" : "false");
    }|]
callKernel (GetSizeMax VName
v SizeClass
size_class) = do
  let e :: Exp
e = KernelConst -> Exp
kernelConstToExp forall a b. (a -> b) -> a -> b
$ SizeClass -> KernelConst
SizeMaxConst SizeClass
size_class
  forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:e;|]
callKernel (LaunchKernel KernelSafety
safety Name
kernel_name Count Bytes (TExp Int64)
local_memory [KernelArg]
args [Exp]
num_groups [GroupDim]
group_size) =
  forall op s.
KernelSafety
-> Name
-> Count Bytes (TExp Int64)
-> [KernelArg]
-> [Exp]
-> [GroupDim]
-> CompilerM op s ()
genLaunchKernel KernelSafety
safety Name
kernel_name Count Bytes (TExp Int64)
local_memory [KernelArg]
args [Exp]
num_groups [GroupDim]
group_size

copygpu2gpu :: GC.DoLMADCopy op s
copygpu2gpu :: forall op s. DoLMADCopy op s
copygpu2gpu CopyBarrier
_ PrimType
t [Count Elements Exp]
shape Exp
dst (Count Elements Exp
dstoffset, [Count Elements Exp]
dststride) Exp
src (Count Elements Exp
srcoffset, [Count Elements Exp]
srcstride) = do
  let fname :: String
fname = String
"lmad_copy_gpu2gpu_" forall a. Semigroup a => a -> a -> a
<> forall a. Show a => a -> String
show (forall a. Num a => PrimType -> a
primByteSize PrimType
t :: Int) forall a. Semigroup a => a -> a -> a
<> String
"b"
      r :: Int
r = forall (t :: * -> *) a. Foldable t => t a -> Int
length [Count Elements Exp]
shape
      dststride_inits :: [Initializer]
dststride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
dststride]
      srcstride_inits :: [Initializer]
srcstride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
srcstride]
      shape_inits :: [Initializer]
shape_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
shape]
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|
         if ((err =
                $id:fname(ctx, $int:r,
                          $exp:dst, $exp:(unCount dstoffset),
                          (typename int64_t[]){ $inits:dststride_inits },
                          $exp:src, $exp:(unCount srcoffset),
                          (typename int64_t[]){ $inits:srcstride_inits },
                          (typename int64_t[]){ $inits:shape_inits })) != 0) {
           goto cleanup;
         }
     |]

copyhost2gpu :: GC.DoLMADCopy op s
copyhost2gpu :: forall op s. DoLMADCopy op s
copyhost2gpu CopyBarrier
sync PrimType
t [Count Elements Exp]
shape Exp
dst (Count Elements Exp
dstoffset, [Count Elements Exp]
dststride) Exp
src (Count Elements Exp
srcoffset, [Count Elements Exp]
srcstride) = do
  let r :: Int
r = forall (t :: * -> *) a. Foldable t => t a -> Int
length [Count Elements Exp]
shape
      dststride_inits :: [Initializer]
dststride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
dststride]
      srcstride_inits :: [Initializer]
srcstride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
srcstride]
      shape_inits :: [Initializer]
shape_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
shape]
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|
         if ((err =
                lmad_copy_host2gpu
                         (ctx, $int:(primByteSize t::Int), $exp:sync', $int:r,
                          $exp:dst, $exp:(unCount dstoffset),
                          (typename int64_t[]){ $inits:dststride_inits },
                          $exp:src, $exp:(unCount srcoffset),
                          (typename int64_t[]){ $inits:srcstride_inits },
                          (typename int64_t[]){ $inits:shape_inits })) != 0) {
           goto cleanup;
         }
     |]
  where
    sync' :: Exp
sync' = case CopyBarrier
sync of
      CopyBarrier
GC.CopyBarrier -> [C.cexp|true|]
      CopyBarrier
GC.CopyNoBarrier -> [C.cexp|false|]

copygpu2host :: GC.DoLMADCopy op s
copygpu2host :: forall op s. DoLMADCopy op s
copygpu2host CopyBarrier
sync PrimType
t [Count Elements Exp]
shape Exp
dst (Count Elements Exp
dstoffset, [Count Elements Exp]
dststride) Exp
src (Count Elements Exp
srcoffset, [Count Elements Exp]
srcstride) = do
  let r :: Int
r = forall (t :: * -> *) a. Foldable t => t a -> Int
length [Count Elements Exp]
shape
      dststride_inits :: [Initializer]
dststride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
dststride]
      srcstride_inits :: [Initializer]
srcstride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
srcstride]
      shape_inits :: [Initializer]
shape_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
shape]
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|
         if ((err =
                lmad_copy_gpu2host
                         (ctx, $int:(primByteSize t::Int), $exp:sync', $int:r,
                          $exp:dst, $exp:(unCount dstoffset),
                          (typename int64_t[]){ $inits:dststride_inits },
                          $exp:src, $exp:(unCount srcoffset),
                          (typename int64_t[]){ $inits:srcstride_inits },
                          (typename int64_t[]){ $inits:shape_inits })) != 0) {
           goto cleanup;
         }
     |]
  where
    sync' :: Exp
sync' = case CopyBarrier
sync of
      CopyBarrier
GC.CopyBarrier -> [C.cexp|true|]
      CopyBarrier
GC.CopyNoBarrier -> [C.cexp|false|]

gpuCopies :: M.Map (Space, Space) (GC.DoLMADCopy op s)
gpuCopies :: forall op s. Map (Space, Space) (DoLMADCopy op s)
gpuCopies =
  forall k a. Ord k => [(k, a)] -> Map k a
M.fromList
    [ ((String -> Space
Space String
"device", String -> Space
Space String
"device"), forall op s. DoLMADCopy op s
copygpu2gpu),
      ((String -> Space
Space String
"device", Space
DefaultSpace), forall op s. DoLMADCopy op s
copyhost2gpu),
      ((Space
DefaultSpace, String -> Space
Space String
"device"), forall op s. DoLMADCopy op s
copygpu2host)
    ]

createKernels :: [KernelName] -> GC.CompilerM op s ()
createKernels :: forall op s. [Name] -> CompilerM op s ()
createKernels [Name]
kernels = forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [Name]
kernels forall a b. (a -> b) -> a -> b
$ \Name
name ->
  forall op s. Id -> Type -> Stm -> Stm -> CompilerM op s ()
GC.contextFieldDyn
    (forall a. ToIdent a => a -> SrcLoc -> Id
C.toIdent Name
name forall a. Monoid a => a
mempty)
    [C.cty|typename gpu_kernel|]
    [C.cstm|gpu_create_kernel(ctx, &ctx->program->$id:name, $string:(T.unpack (idText (C.toIdent name mempty))));|]
    [C.cstm|gpu_free_kernel(ctx, ctx->program->$id:name);|]

allocateGPU :: GC.Allocate op ()
allocateGPU :: forall op. Allocate op ()
allocateGPU Exp
mem Exp
size Exp
tag String
"device" =
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|(void)gpu_alloc(ctx, ctx->log,
                            (size_t)$exp:size, $exp:tag,
                            &$exp:mem, (size_t*)&$exp:size);|]
allocateGPU Exp
_ Exp
_ Exp
_ String
space =
  forall a. HasCallStack => String -> a
error forall a b. (a -> b) -> a -> b
$ String
"Cannot allocate in '" forall a. [a] -> [a] -> [a]
++ String
space forall a. [a] -> [a] -> [a]
++ String
"' memory space."

deallocateGPU :: GC.Deallocate op ()
deallocateGPU :: forall op. Allocate op ()
deallocateGPU Exp
mem Exp
size Exp
tag String
"device" =
  forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|(void)gpu_free(ctx, $exp:mem, $exp:size, $exp:tag);|]
deallocateGPU Exp
_ Exp
_ Exp
_ String
space =
  forall a. HasCallStack => String -> a
error forall a b. (a -> b) -> a -> b
$ String
"Cannot deallocate in '" forall a. [a] -> [a] -> [a]
++ String
space forall a. [a] -> [a] -> [a]
++ String
"' space"

-- It is often faster to do a blocking clEnqueueReadBuffer() than to
-- do an async clEnqueueReadBuffer() followed by a clFinish(), even
-- with an in-order command queue.  This is safe if and only if there
-- are no possible outstanding failures.
readScalarGPU :: GC.ReadScalar op ()
readScalarGPU :: forall op. ReadScalar op ()
readScalarGPU Exp
mem Exp
i Type
t String
"device" Volatility
_ = do
  VName
val <- forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"read_res"
  forall op s. InitGroup -> CompilerM op s ()
GC.decl [C.cdecl|$ty:t $id:val;|]
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|if ((err = gpu_scalar_from_device(ctx, &$id:val, $exp:mem, $exp:i * sizeof($ty:t), sizeof($ty:t))) != 0) { goto cleanup; }|]
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|if (ctx->failure_is_an_option && futhark_context_sync(ctx) != 0)
            { err = 1; goto cleanup; }|]
  forall (f :: * -> *) a. Applicative f => a -> f a
pure [C.cexp|$id:val|]
readScalarGPU Exp
_ Exp
_ Type
_ String
space Volatility
_ =
  forall a. HasCallStack => String -> a
error forall a b. (a -> b) -> a -> b
$ String
"Cannot read from '" forall a. [a] -> [a] -> [a]
++ String
space forall a. [a] -> [a] -> [a]
++ String
"' memory space."

-- TODO: Optimised special case when the scalar is a constant, in
-- which case we can do the write asynchronously.
writeScalarGPU :: GC.WriteScalar op ()
writeScalarGPU :: forall op. WriteScalar op ()
writeScalarGPU Exp
mem Exp
i Type
t String
"device" Volatility
_ Exp
val = do
  VName
val' <- forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"write_tmp"
  forall op s. BlockItem -> CompilerM op s ()
GC.item [C.citem|$ty:t $id:val' = $exp:val;|]
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|if ((err = gpu_scalar_to_device(ctx, $exp:mem, $exp:i * sizeof($ty:t), sizeof($ty:t), &$id:val')) != 0) { goto cleanup; }|]
writeScalarGPU Exp
_ Exp
_ Type
_ String
space Volatility
_ Exp
_ =
  forall a. HasCallStack => String -> a
error forall a b. (a -> b) -> a -> b
$ String
"Cannot write to '" forall a. [a] -> [a] -> [a]
++ String
space forall a. [a] -> [a] -> [a]
++ String
"' memory space."

syncArg :: GC.CopyBarrier -> C.Exp
syncArg :: CopyBarrier -> Exp
syncArg CopyBarrier
GC.CopyBarrier = [C.cexp|true|]
syncArg CopyBarrier
GC.CopyNoBarrier = [C.cexp|false|]

copyGPU :: GC.Copy OpenCL ()
copyGPU :: Copy OpenCL ()
copyGPU CopyBarrier
_ Exp
dstmem Exp
dstidx (Space String
"device") Exp
srcmem Exp
srcidx (Space String
"device") Exp
nbytes =
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|err = gpu_memcpy(ctx, $exp:dstmem, $exp:dstidx, $exp:srcmem, $exp:srcidx, $exp:nbytes);|]
copyGPU CopyBarrier
b Exp
dstmem Exp
dstidx Space
DefaultSpace Exp
srcmem Exp
srcidx (Space String
"device") Exp
nbytes =
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|err = memcpy_gpu2host(ctx, $exp:(syncArg b), $exp:dstmem, $exp:dstidx, $exp:srcmem, $exp:srcidx, $exp:nbytes);|]
copyGPU CopyBarrier
b Exp
dstmem Exp
dstidx (Space String
"device") Exp
srcmem Exp
srcidx Space
DefaultSpace Exp
nbytes =
  forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|err = memcpy_host2gpu(ctx, $exp:(syncArg b), $exp:dstmem, $exp:dstidx, $exp:srcmem, $exp:srcidx, $exp:nbytes);|]
copyGPU CopyBarrier
_ Exp
_ Exp
_ Space
destspace Exp
_ Exp
_ Space
srcspace Exp
_ =
  forall a. HasCallStack => String -> a
error forall a b. (a -> b) -> a -> b
$ String
"Cannot copy to " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> String
show Space
destspace forall a. [a] -> [a] -> [a]
++ String
" from " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> String
show Space
srcspace

gpuOperations :: GC.Operations OpenCL ()
gpuOperations :: Operations OpenCL ()
gpuOperations =
  forall op s. Operations op s
GC.defaultOperations
    { opsCompiler :: OpCompiler OpenCL ()
GC.opsCompiler = OpCompiler OpenCL ()
callKernel,
      opsWriteScalar :: WriteScalar OpenCL ()
GC.opsWriteScalar = forall op. WriteScalar op ()
writeScalarGPU,
      opsReadScalar :: ReadScalar OpenCL ()
GC.opsReadScalar = forall op. ReadScalar op ()
readScalarGPU,
      opsAllocate :: Allocate OpenCL ()
GC.opsAllocate = forall op. Allocate op ()
allocateGPU,
      opsDeallocate :: Allocate OpenCL ()
GC.opsDeallocate = forall op. Allocate op ()
deallocateGPU,
      opsCopy :: Copy OpenCL ()
GC.opsCopy = Copy OpenCL ()
copyGPU,
      opsCopies :: Map (Space, Space) (DoLMADCopy OpenCL ())
GC.opsCopies = forall op s. Map (Space, Space) (DoLMADCopy op s)
gpuCopies forall a. Semigroup a => a -> a -> a
<> forall op s.
Operations op s -> Map (Space, Space) (DoLMADCopy op s)
GC.opsCopies forall op s. Operations op s
GC.defaultOperations,
      opsFatMemory :: Bool
GC.opsFatMemory = Bool
True
    }

-- | Options that are common to multiple GPU-like backends.
gpuOptions :: [Option]
gpuOptions :: [Option]
gpuOptions =
  [ Option
      { optionLongName :: String
optionLongName = String
"device",
        optionShortName :: Maybe Char
optionShortName = forall a. a -> Maybe a
Just Char
'd',
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"NAME",
        optionDescription :: String
optionDescription = String
"Use the first device whose name contains the given string.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_device(cfg, optarg);|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-group-size",
        optionShortName :: Maybe Char
optionShortName = forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default size of workgroups that are launched.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_group_size(cfg, atoi(optarg));|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-num-groups",
        optionShortName :: Maybe Char
optionShortName = forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default number of workgroups that are launched.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_num_groups(cfg, atoi(optarg));|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-tile-size",
        optionShortName :: Maybe Char
optionShortName = forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default tile size used when performing two-dimensional tiling.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_tile_size(cfg, atoi(optarg));|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-reg-tile-size",
        optionShortName :: Maybe Char
optionShortName = forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default register tile size used when performing two-dimensional tiling.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_reg_tile_size(cfg, atoi(optarg));|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-threshold",
        optionShortName :: Maybe Char
optionShortName = forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default parallelism threshold.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_threshold(cfg, atoi(optarg));|]
      }
  ]

errorMsgNumArgs :: ErrorMsg a -> Int
errorMsgNumArgs :: forall a. ErrorMsg a -> Int
errorMsgNumArgs = forall (t :: * -> *) a. Foldable t => t a -> Int
length forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a. ErrorMsg a -> [PrimType]
errorMsgArgTypes

failureMsgFunction :: [FailureMsg] -> C.Definition
failureMsgFunction :: [FailureMsg] -> Definition
failureMsgFunction [FailureMsg]
failures =
  let printfEscape :: String -> String
printfEscape =
        let escapeChar :: Char -> String
escapeChar Char
'%' = String
"%%"
            escapeChar Char
c = [Char
c]
         in forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Char -> String
escapeChar
      onPart :: ErrorMsgPart a -> String
onPart (ErrorString Text
s) = String -> String
printfEscape forall a b. (a -> b) -> a -> b
$ Text -> String
T.unpack Text
s
      -- FIXME: bogus for non-ints.
      onPart ErrorVal {} = String
"%lld"
      onFailure :: a -> FailureMsg -> Stm
onFailure a
i (FailureMsg emsg :: ErrorMsg Exp
emsg@(ErrorMsg [ErrorMsgPart Exp]
parts) String
backtrace) =
        let msg :: String
msg = forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap forall {a}. ErrorMsgPart a -> String
onPart [ErrorMsgPart Exp]
parts forall a. [a] -> [a] -> [a]
++ String
"\n" forall a. [a] -> [a] -> [a]
++ String -> String
printfEscape String
backtrace
            msgargs :: [Exp]
msgargs = [[C.cexp|args[$int:j]|] | Int
j <- [Int
0 .. forall a. ErrorMsg a -> Int
errorMsgNumArgs ErrorMsg Exp
emsg forall a. Num a => a -> a -> a
- Int
1]]
         in [C.cstm|case $int:i: {return msgprintf($string:msg, $args:msgargs); break;}|]
      failure_cases :: [Stm]
failure_cases =
        forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith forall {a}. (Show a, Integral a) => a -> FailureMsg -> Stm
onFailure [(Int
0 :: Int) ..] [FailureMsg]
failures
   in [C.cedecl|static char* get_failure_msg(int failure_idx, typename int64_t args[]) {
                  (void)args;
                  switch (failure_idx) { $stms:failure_cases }
                  return strdup("Unknown error.  This is a compiler bug.");
                }|]

compileConstExp :: KernelConstExp -> C.Exp
compileConstExp :: KernelConstExp -> Exp
compileConstExp KernelConstExp
e = forall a. Identity a -> a
runIdentity forall a b. (a -> b) -> a -> b
$ forall (m :: * -> *) v.
Monad m =>
(v -> m Exp) -> PrimExp v -> m Exp
GC.compilePrimExp (forall (f :: * -> *) a. Applicative f => a -> f a
pure forall b c a. (b -> c) -> (a -> b) -> a -> c
. KernelConst -> Exp
kernelConstToExp) KernelConstExp
e

-- | Called after most code has been generated to generate the bulk of
-- the boilerplate.
generateGPUBoilerplate ::
  T.Text ->
  [(Name, KernelConstExp)] ->
  T.Text ->
  [Name] ->
  [PrimType] ->
  [FailureMsg] ->
  GC.CompilerM OpenCL () ()
generateGPUBoilerplate :: Text
-> [(Name, KernelConstExp)]
-> Text
-> [Name]
-> [PrimType]
-> [FailureMsg]
-> CompilerM OpenCL () ()
generateGPUBoilerplate Text
gpu_program [(Name, KernelConstExp)]
macros Text
backendH [Name]
kernels [PrimType]
types [FailureMsg]
failures = do
  forall op s. [Name] -> CompilerM op s ()
createKernels [Name]
kernels
  let gpu_program_fragments :: [Initializer]
gpu_program_fragments =
        -- Some C compilers limit the size of literal strings, so
        -- chunk the entire program into small bits here, and
        -- concatenate it again at runtime.
        [[C.cinit|$string:s|] | String
s <- forall a. Int -> [a] -> [[a]]
chunk Int
2000 forall a b. (a -> b) -> a -> b
$ Text -> String
T.unpack Text
gpu_program]
      program_fragments :: [Initializer]
program_fragments = [Initializer]
gpu_program_fragments forall a. [a] -> [a] -> [a]
++ [[C.cinit|NULL|]]
      f64_required :: Exp
f64_required
        | FloatType -> PrimType
FloatType FloatType
Float64 forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [PrimType]
types = [C.cexp|1|]
        | Bool
otherwise = [C.cexp|0|]
      max_failure_args :: Int
max_failure_args = forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl forall a. Ord a => a -> a -> a
max Int
0 forall a b. (a -> b) -> a -> b
$ forall a b. (a -> b) -> [a] -> [b]
map (forall a. ErrorMsg a -> Int
errorMsgNumArgs forall b c a. (b -> c) -> (a -> b) -> a -> c
. FailureMsg -> ErrorMsg Exp
failureError) [FailureMsg]
failures

      setMacro :: a -> (Name, KernelConstExp) -> Stm
setMacro a
i (Name
name, KernelConstExp
e) =
        [C.cstm|{names[$int:i] = $string:(nameToString name);
                 values[$int:i] = $esc:e';}|]
        where
          e' :: String
e' = Text -> String
T.unpack forall a b. (a -> b) -> a -> b
$ Exp -> Text
expText forall a b. (a -> b) -> a -> b
$ KernelConstExp -> Exp
compileConstExp KernelConstExp
e

  forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_
    forall op s. Definition -> CompilerM op s ()
GC.earlyDecl
    [C.cunit|static const int max_failure_args = $int:max_failure_args;
             static const int f64_required = $exp:f64_required;
             static const char *gpu_program[] = {$inits:program_fragments};
             $esc:(T.unpack gpuPrototypesH)
             $esc:(T.unpack backendH)
             $esc:(T.unpack gpuH)
             static int gpu_macros(struct futhark_context *ctx, char*** names_out, typename int64_t** values_out) {
               int num_macros = $int:(length macros);
               char** names = malloc(num_macros * sizeof(char*));
               typename int64_t* values = malloc(num_macros * sizeof(int64_t));

               $stms:(zipWith setMacro [(0::Int)..] macros)

               *names_out = names;
               *values_out = values;
               return num_macros;
             }
            |]
  forall op s. Definition -> CompilerM op s ()
GC.earlyDecl forall a b. (a -> b) -> a -> b
$ [FailureMsg] -> Definition
failureMsgFunction [FailureMsg]
failures

  forall op s. CompilerM op s ()
GC.generateProgramStruct

  forall op s. BlockItem -> CompilerM op s ()
GC.onClear [C.citem|if (ctx->error == NULL) { gpu_free_all(ctx); }|]