{-# LANGUAGE QuasiQuotes #-}

-- | C code generation for GPU, in general.
--
-- This module generates codes that targets the tiny GPU API
-- abstraction layer we define in the runtime system.
module Futhark.CodeGen.Backends.GPU
  ( gpuOperations,
    gpuOptions,
    generateGPUBoilerplate,
  )
where

import Control.Monad
import Data.Bifunctor (bimap)
import Data.Map qualified as M
import Data.Text qualified as T
import Futhark.CodeGen.Backends.GenericC qualified as GC
import Futhark.CodeGen.Backends.GenericC.Options
import Futhark.CodeGen.Backends.GenericC.Pretty (idText)
import Futhark.CodeGen.Backends.SimpleRep (primStorageType, toStorage)
import Futhark.CodeGen.ImpCode.OpenCL
import Futhark.CodeGen.RTS.C (gpuH, gpuPrototypesH)
import Futhark.MonadFreshNames
import Futhark.Util (chunk)
import Futhark.Util.Pretty (prettyTextOneLine)
import Language.C.Quote.OpenCL qualified as C
import Language.C.Syntax qualified as C

genKernelFunction ::
  KernelName ->
  KernelSafety ->
  [C.Param] ->
  [(C.Exp, C.Exp)] ->
  GC.CompilerM op s Name
genKernelFunction :: forall op s.
Name
-> KernelSafety -> [Param] -> [(Exp, Exp)] -> CompilerM op s Name
genKernelFunction Name
kernel_name KernelSafety
safety [Param]
arg_params [(Exp, Exp)]
arg_set = do
  let kernel_fname :: Name
kernel_fname = Name
"gpu_kernel_" Name -> Name -> Name
forall a. Semigroup a => a -> a -> a
<> Name
kernel_name
  Definition -> CompilerM op s ()
forall op s. Definition -> CompilerM op s ()
GC.libDecl
    [C.cedecl|static int $id:kernel_fname
               (struct futhark_context* ctx,
                unsigned int grid_x, unsigned int grid_y, unsigned int grid_z,
                unsigned int block_x, unsigned int block_y, unsigned int block_z,
                unsigned int shared_bytes, $params:arg_params) {
    if (grid_x * grid_y * grid_z * block_x * block_y * block_z != 0) {
      void* args[$int:num_args] = { $inits:(failure_inits<>args_inits) };
      size_t args_sizes[$int:num_args] = { $inits:(failure_sizes<>args_sizes) };
      return gpu_launch_kernel(ctx, ctx->program->$id:kernel_name,
                               $string:(prettyString kernel_name),
                               (const typename int32_t[]){grid_x, grid_y, grid_z},
                               (const typename int32_t[]){block_x, block_y, block_z},
                               shared_bytes,
                               $int:num_args, args, args_sizes);
    }
    return FUTHARK_SUCCESS;
  }|]

  Name -> CompilerM op s Name
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure Name
kernel_fname
  where
    num_args :: Int
num_args = KernelSafety -> Int
numFailureParams KernelSafety
safety Int -> Int -> Int
forall a. Num a => a -> a -> a
+ [(Exp, Exp)] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [(Exp, Exp)]
arg_set
    expToInit :: a -> Initializer
expToInit a
e = [C.cinit|$exp:e|]
    ([Initializer]
args_sizes, [Initializer]
args_inits) = ([Exp] -> [Initializer])
-> ([Exp] -> [Initializer])
-> ([Exp], [Exp])
-> ([Initializer], [Initializer])
forall a b c d. (a -> b) -> (c -> d) -> (a, c) -> (b, d)
forall (p :: * -> * -> *) a b c d.
Bifunctor p =>
(a -> b) -> (c -> d) -> p a c -> p b d
bimap ((Exp -> Initializer) -> [Exp] -> [Initializer]
forall a b. (a -> b) -> [a] -> [b]
map Exp -> Initializer
forall {a}. ToExp a => a -> Initializer
expToInit) ((Exp -> Initializer) -> [Exp] -> [Initializer]
forall a b. (a -> b) -> [a] -> [b]
map Exp -> Initializer
forall {a}. ToExp a => a -> Initializer
expToInit) (([Exp], [Exp]) -> ([Initializer], [Initializer]))
-> ([Exp], [Exp]) -> ([Initializer], [Initializer])
forall a b. (a -> b) -> a -> b
$ [(Exp, Exp)] -> ([Exp], [Exp])
forall a b. [(a, b)] -> ([a], [b])
unzip [(Exp, Exp)]
arg_set
    ([Initializer]
failure_inits, [Initializer]
failure_sizes) =
      [(Initializer, Initializer)] -> ([Initializer], [Initializer])
forall a b. [(a, b)] -> ([a], [b])
unzip ([(Initializer, Initializer)] -> ([Initializer], [Initializer]))
-> ([(Initializer, Initializer)] -> [(Initializer, Initializer)])
-> [(Initializer, Initializer)]
-> ([Initializer], [Initializer])
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [(Initializer, Initializer)] -> [(Initializer, Initializer)]
forall a. Int -> [a] -> [a]
take (KernelSafety -> Int
numFailureParams KernelSafety
safety) ([(Initializer, Initializer)] -> ([Initializer], [Initializer]))
-> [(Initializer, Initializer)] -> ([Initializer], [Initializer])
forall a b. (a -> b) -> a -> b
$
        [ ([C.cinit|&ctx->global_failure|], [C.cinit|sizeof(ctx->global_failure)|]),
          ([C.cinit|&ctx->failure_is_an_option|], [C.cinit|sizeof(ctx->failure_is_an_option)|]),
          ([C.cinit|&ctx->global_failure_args|], [C.cinit|sizeof(ctx->global_failure_args)|])
        ]

kernelConstToExp :: KernelConst -> C.Exp
kernelConstToExp :: KernelConst -> Exp
kernelConstToExp (SizeConst Name
key) =
  [C.cexp|*ctx->tuning_params.$id:key|]
kernelConstToExp (SizeMaxConst SizeClass
size_class) =
  [C.cexp|ctx->$id:field|]
  where
    field :: String
field = String
"max_" String -> String -> String
forall a. Semigroup a => a -> a -> a
<> SizeClass -> String
forall a. Pretty a => a -> String
prettyString SizeClass
size_class

compileGroupDim :: GroupDim -> GC.CompilerM op s C.Exp
compileGroupDim :: forall op s. GroupDim -> CompilerM op s Exp
compileGroupDim (Left Exp
e) = Exp -> CompilerM op s Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
e
compileGroupDim (Right KernelConst
kc) = Exp -> CompilerM op s Exp
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Exp -> CompilerM op s Exp) -> Exp -> CompilerM op s Exp
forall a b. (a -> b) -> a -> b
$ KernelConst -> Exp
kernelConstToExp KernelConst
kc

genLaunchKernel ::
  KernelSafety ->
  KernelName ->
  Count Bytes (TExp Int64) ->
  [KernelArg] ->
  [Exp] ->
  [GroupDim] ->
  GC.CompilerM op s ()
genLaunchKernel :: forall op s.
KernelSafety
-> Name
-> Count Bytes (TExp Int64)
-> [KernelArg]
-> [Exp]
-> [GroupDim]
-> CompilerM op s ()
genLaunchKernel KernelSafety
safety Name
kernel_name Count Bytes (TExp Int64)
local_memory [KernelArg]
args [Exp]
num_groups [GroupDim]
group_size = do
  ([Param]
arg_params, [(Exp, Exp)]
arg_params_inits, [Exp]
call_args) <-
    [(Param, (Exp, Exp), Exp)] -> ([Param], [(Exp, Exp)], [Exp])
forall a b c. [(a, b, c)] -> ([a], [b], [c])
unzip3 ([(Param, (Exp, Exp), Exp)] -> ([Param], [(Exp, Exp)], [Exp]))
-> CompilerM op s [(Param, (Exp, Exp), Exp)]
-> CompilerM op s ([Param], [(Exp, Exp)], [Exp])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Int -> KernelArg -> CompilerM op s (Param, (Exp, Exp), Exp))
-> [Int]
-> [KernelArg]
-> CompilerM op s [(Param, (Exp, Exp), Exp)]
forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m [c]
zipWithM Int -> KernelArg -> CompilerM op s (Param, (Exp, Exp), Exp)
forall {p} {op} {s}.
Show p =>
p -> KernelArg -> CompilerM op s (Param, (Exp, Exp), Exp)
mkArgs [(Int
0 :: Int) ..] [KernelArg]
args

  (Exp
grid_x, Exp
grid_y, Exp
grid_z) <- [Exp] -> (Exp, Exp, Exp)
mkDims ([Exp] -> (Exp, Exp, Exp))
-> CompilerM op s [Exp] -> CompilerM op s (Exp, Exp, Exp)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Exp -> CompilerM op s Exp) -> [Exp] -> CompilerM op s [Exp]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM Exp -> CompilerM op s Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp [Exp]
num_groups
  (Exp
group_x, Exp
group_y, Exp
group_z) <- [Exp] -> (Exp, Exp, Exp)
mkDims ([Exp] -> (Exp, Exp, Exp))
-> CompilerM op s [Exp] -> CompilerM op s (Exp, Exp, Exp)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (GroupDim -> CompilerM op s Exp)
-> [GroupDim] -> CompilerM op s [Exp]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> [a] -> m [b]
mapM GroupDim -> CompilerM op s Exp
forall op s. GroupDim -> CompilerM op s Exp
compileGroupDim [GroupDim]
group_size

  Name
kernel_fname <- Name
-> KernelSafety -> [Param] -> [(Exp, Exp)] -> CompilerM op s Name
forall op s.
Name
-> KernelSafety -> [Param] -> [(Exp, Exp)] -> CompilerM op s Name
genKernelFunction Name
kernel_name KernelSafety
safety [Param]
arg_params [(Exp, Exp)]
arg_params_inits

  Exp
local_memory' <- Exp -> CompilerM op s Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp (Exp -> CompilerM op s Exp) -> Exp -> CompilerM op s Exp
forall a b. (a -> b) -> a -> b
$ TExp Int64 -> Exp
forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped (TExp Int64 -> Exp) -> TExp Int64 -> Exp
forall a b. (a -> b) -> a -> b
$ Count Bytes (TExp Int64) -> TExp Int64
forall {k} (u :: k) e. Count u e -> e
unCount Count Bytes (TExp Int64)
local_memory

  Stm -> CompilerM op s ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|{
           err = $id:kernel_fname(ctx,
                                  $exp:grid_x, $exp:grid_y, $exp:grid_z,
                                  $exp:group_x, $exp:group_y, $exp:group_z,
                                  $exp:local_memory',
                                  $args:call_args);
           if (err != FUTHARK_SUCCESS) { goto cleanup; }
           }|]

  Bool -> CompilerM op s () -> CompilerM op s ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when (KernelSafety
safety KernelSafety -> KernelSafety -> Bool
forall a. Ord a => a -> a -> Bool
>= KernelSafety
SafetyFull) (CompilerM op s () -> CompilerM op s ())
-> CompilerM op s () -> CompilerM op s ()
forall a b. (a -> b) -> a -> b
$
    Stm -> CompilerM op s ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|ctx->failure_is_an_option = 1;|]
  where
    mkDims :: [Exp] -> (Exp, Exp, Exp)
mkDims [] = ([C.cexp|0|], [C.cexp|0|], [C.cexp|0|])
    mkDims [Exp
x] = (Exp
x, [C.cexp|1|], [C.cexp|1|])
    mkDims [Exp
x, Exp
y] = (Exp
x, Exp
y, [C.cexp|1|])
    mkDims (Exp
x : Exp
y : Exp
z : [Exp]
_) = (Exp
x, Exp
y, Exp
z)

    mkArgs :: p -> KernelArg -> CompilerM op s (Param, (Exp, Exp), Exp)
mkArgs p
i (ValueKArg Exp
e PrimType
t) = do
      let arg :: String
arg = String
"arg" String -> String -> String
forall a. Semigroup a => a -> a -> a
<> p -> String
forall a. Show a => a -> String
show p
i
      Exp
e' <- Exp -> CompilerM op s Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
e
      (Param, (Exp, Exp), Exp) -> CompilerM op s (Param, (Exp, Exp), Exp)
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
        ( [C.cparam|$ty:(primStorageType t) $id:arg|],
          ([C.cexp|sizeof($id:arg)|], [C.cexp|&$id:arg|]),
          PrimType -> Exp -> Exp
toStorage PrimType
t Exp
e'
        )
    mkArgs p
i (MemKArg VName
v) = do
      let arg :: String
arg = String
"arg" String -> String -> String
forall a. Semigroup a => a -> a -> a
<> p -> String
forall a. Show a => a -> String
show p
i
      Exp
v' <- VName -> CompilerM op s Exp
forall op s. VName -> CompilerM op s Exp
GC.rawMem VName
v
      (Param, (Exp, Exp), Exp) -> CompilerM op s (Param, (Exp, Exp), Exp)
forall a. a -> CompilerM op s a
forall (f :: * -> *) a. Applicative f => a -> f a
pure
        ( [C.cparam|typename gpu_mem $id:arg|],
          ([C.cexp|sizeof($id:arg)|], [C.cexp|&$id:arg|]),
          Exp
v'
        )

callKernel :: GC.OpCompiler OpenCL ()
callKernel :: OpCompiler OpenCL ()
callKernel (GetSize VName
v Name
key) = do
  let e :: Exp
e = KernelConst -> Exp
kernelConstToExp (KernelConst -> Exp) -> KernelConst -> Exp
forall a b. (a -> b) -> a -> b
$ Name -> KernelConst
SizeConst Name
key
  Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:e;|]
callKernel (CmpSizeLe VName
v Name
key Exp
x) = do
  let e :: Exp
e = KernelConst -> Exp
kernelConstToExp (KernelConst -> Exp) -> KernelConst -> Exp
forall a b. (a -> b) -> a -> b
$ Name -> KernelConst
SizeConst Name
key
  Exp
x' <- Exp -> CompilerM OpenCL () Exp
forall op s. Exp -> CompilerM op s Exp
GC.compileExp Exp
x
  Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:e <= $exp:x';|]
  -- Output size information if logging is enabled.  The autotuner
  -- depends on the format of this output, so use caution if changing
  -- it.
  Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|if (ctx->logging) {
    fprintf(ctx->log, "Compared %s <= %ld: %s.\n", $string:(T.unpack (prettyTextOneLine key)), (long)$exp:x', $id:v ? "true" : "false");
    }|]
callKernel (GetSizeMax VName
v SizeClass
size_class) = do
  let e :: Exp
e = KernelConst -> Exp
kernelConstToExp (KernelConst -> Exp) -> KernelConst -> Exp
forall a b. (a -> b) -> a -> b
$ SizeClass -> KernelConst
SizeMaxConst SizeClass
size_class
  Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|$id:v = $exp:e;|]
callKernel (LaunchKernel KernelSafety
safety Name
kernel_name Count Bytes (TExp Int64)
local_memory [KernelArg]
args [Exp]
num_groups [GroupDim]
group_size) =
  KernelSafety
-> Name
-> Count Bytes (TExp Int64)
-> [KernelArg]
-> [Exp]
-> [GroupDim]
-> CompilerM OpenCL () ()
forall op s.
KernelSafety
-> Name
-> Count Bytes (TExp Int64)
-> [KernelArg]
-> [Exp]
-> [GroupDim]
-> CompilerM op s ()
genLaunchKernel KernelSafety
safety Name
kernel_name Count Bytes (TExp Int64)
local_memory [KernelArg]
args [Exp]
num_groups [GroupDim]
group_size

copygpu2gpu :: GC.DoLMADCopy op s
copygpu2gpu :: forall op s. DoLMADCopy op s
copygpu2gpu CopyBarrier
_ PrimType
t [Count Elements Exp]
shape Exp
dst (Count Elements Exp
dstoffset, [Count Elements Exp]
dststride) Exp
src (Count Elements Exp
srcoffset, [Count Elements Exp]
srcstride) = do
  let fname :: String
fname = String
"lmad_copy_gpu2gpu_" String -> String -> String
forall a. Semigroup a => a -> a -> a
<> Int -> String
forall a. Show a => a -> String
show (PrimType -> Int
forall a. Num a => PrimType -> a
primByteSize PrimType
t :: Int) String -> String -> String
forall a. Semigroup a => a -> a -> a
<> String
"b"
      r :: Int
r = [Count Elements Exp] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Count Elements Exp]
shape
      dststride_inits :: [Initializer]
dststride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
dststride]
      srcstride_inits :: [Initializer]
srcstride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
srcstride]
      shape_inits :: [Initializer]
shape_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
shape]
  Stm -> CompilerM op s ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|
         if ((err =
                $id:fname(ctx, $int:r,
                          $exp:dst, $exp:(unCount dstoffset),
                          (typename int64_t[]){ $inits:dststride_inits },
                          $exp:src, $exp:(unCount srcoffset),
                          (typename int64_t[]){ $inits:srcstride_inits },
                          (typename int64_t[]){ $inits:shape_inits })) != 0) {
           goto cleanup;
         }
     |]

copyhost2gpu :: GC.DoLMADCopy op s
copyhost2gpu :: forall op s. DoLMADCopy op s
copyhost2gpu CopyBarrier
sync PrimType
t [Count Elements Exp]
shape Exp
dst (Count Elements Exp
dstoffset, [Count Elements Exp]
dststride) Exp
src (Count Elements Exp
srcoffset, [Count Elements Exp]
srcstride) = do
  let r :: Int
r = [Count Elements Exp] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Count Elements Exp]
shape
      dststride_inits :: [Initializer]
dststride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
dststride]
      srcstride_inits :: [Initializer]
srcstride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
srcstride]
      shape_inits :: [Initializer]
shape_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
shape]
  Stm -> CompilerM op s ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|
         if ((err =
                lmad_copy_host2gpu
                         (ctx, $int:(primByteSize t::Int), $exp:sync', $int:r,
                          $exp:dst, $exp:(unCount dstoffset),
                          (typename int64_t[]){ $inits:dststride_inits },
                          $exp:src, $exp:(unCount srcoffset),
                          (typename int64_t[]){ $inits:srcstride_inits },
                          (typename int64_t[]){ $inits:shape_inits })) != 0) {
           goto cleanup;
         }
     |]
  where
    sync' :: Exp
sync' = case CopyBarrier
sync of
      CopyBarrier
GC.CopyBarrier -> [C.cexp|true|]
      CopyBarrier
GC.CopyNoBarrier -> [C.cexp|false|]

copygpu2host :: GC.DoLMADCopy op s
copygpu2host :: forall op s. DoLMADCopy op s
copygpu2host CopyBarrier
sync PrimType
t [Count Elements Exp]
shape Exp
dst (Count Elements Exp
dstoffset, [Count Elements Exp]
dststride) Exp
src (Count Elements Exp
srcoffset, [Count Elements Exp]
srcstride) = do
  let r :: Int
r = [Count Elements Exp] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Count Elements Exp]
shape
      dststride_inits :: [Initializer]
dststride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
dststride]
      srcstride_inits :: [Initializer]
srcstride_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
srcstride]
      shape_inits :: [Initializer]
shape_inits = [[C.cinit|$exp:e|] | Count Exp
e <- [Count Elements Exp]
shape]
  Stm -> CompilerM op s ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|
         if ((err =
                lmad_copy_gpu2host
                         (ctx, $int:(primByteSize t::Int), $exp:sync', $int:r,
                          $exp:dst, $exp:(unCount dstoffset),
                          (typename int64_t[]){ $inits:dststride_inits },
                          $exp:src, $exp:(unCount srcoffset),
                          (typename int64_t[]){ $inits:srcstride_inits },
                          (typename int64_t[]){ $inits:shape_inits })) != 0) {
           goto cleanup;
         }
     |]
  where
    sync' :: Exp
sync' = case CopyBarrier
sync of
      CopyBarrier
GC.CopyBarrier -> [C.cexp|true|]
      CopyBarrier
GC.CopyNoBarrier -> [C.cexp|false|]

gpuCopies :: M.Map (Space, Space) (GC.DoLMADCopy op s)
gpuCopies :: forall op s. Map (Space, Space) (DoLMADCopy op s)
gpuCopies =
  [((Space, Space), DoLMADCopy op s)]
-> Map (Space, Space) (DoLMADCopy op s)
forall k a. Ord k => [(k, a)] -> Map k a
M.fromList
    [ ((String -> Space
Space String
"device", String -> Space
Space String
"device"), DoLMADCopy op s
forall op s. DoLMADCopy op s
copygpu2gpu),
      ((String -> Space
Space String
"device", Space
DefaultSpace), DoLMADCopy op s
forall op s. DoLMADCopy op s
copyhost2gpu),
      ((Space
DefaultSpace, String -> Space
Space String
"device"), DoLMADCopy op s
forall op s. DoLMADCopy op s
copygpu2host)
    ]

createKernels :: [KernelName] -> GC.CompilerM op s ()
createKernels :: forall op s. [Name] -> CompilerM op s ()
createKernels [Name]
kernels = [Name] -> (Name -> CompilerM op s ()) -> CompilerM op s ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ [Name]
kernels ((Name -> CompilerM op s ()) -> CompilerM op s ())
-> (Name -> CompilerM op s ()) -> CompilerM op s ()
forall a b. (a -> b) -> a -> b
$ \Name
name ->
  Id -> Type -> Stm -> Stm -> CompilerM op s ()
forall op s. Id -> Type -> Stm -> Stm -> CompilerM op s ()
GC.contextFieldDyn
    (Name -> SrcLoc -> Id
forall a. ToIdent a => a -> SrcLoc -> Id
C.toIdent Name
name SrcLoc
forall a. Monoid a => a
mempty)
    [C.cty|typename gpu_kernel|]
    [C.cstm|gpu_create_kernel(ctx, &ctx->program->$id:name, $string:(T.unpack (idText (C.toIdent name mempty))));|]
    [C.cstm|gpu_free_kernel(ctx, ctx->program->$id:name);|]

allocateGPU :: GC.Allocate op ()
allocateGPU :: forall op. Allocate op ()
allocateGPU Exp
mem Exp
size Exp
tag String
"device" =
  Stm -> CompilerM op () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|(void)gpu_alloc(ctx, ctx->log,
                            (size_t)$exp:size, $exp:tag,
                            &$exp:mem, (size_t*)&$exp:size);|]
allocateGPU Exp
_ Exp
_ Exp
_ String
space =
  String -> CompilerM op () ()
forall a. HasCallStack => String -> a
error (String -> CompilerM op () ()) -> String -> CompilerM op () ()
forall a b. (a -> b) -> a -> b
$ String
"Cannot allocate in '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' memory space."

deallocateGPU :: GC.Deallocate op ()
deallocateGPU :: forall op. Allocate op ()
deallocateGPU Exp
mem Exp
size Exp
tag String
"device" =
  Stm -> CompilerM op () ()
forall op s. Stm -> CompilerM op s ()
GC.stm [C.cstm|(void)gpu_free(ctx, $exp:mem, $exp:size, $exp:tag);|]
deallocateGPU Exp
_ Exp
_ Exp
_ String
space =
  String -> CompilerM op () ()
forall a. HasCallStack => String -> a
error (String -> CompilerM op () ()) -> String -> CompilerM op () ()
forall a b. (a -> b) -> a -> b
$ String
"Cannot deallocate in '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' space"

-- It is often faster to do a blocking clEnqueueReadBuffer() than to
-- do an async clEnqueueReadBuffer() followed by a clFinish(), even
-- with an in-order command queue.  This is safe if and only if there
-- are no possible outstanding failures.
readScalarGPU :: GC.ReadScalar op ()
readScalarGPU :: forall op. ReadScalar op ()
readScalarGPU Exp
mem Exp
i Type
t String
"device" Volatility
_ = do
  VName
val <- String -> CompilerM op () VName
forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"read_res"
  InitGroup -> CompilerM op () ()
forall op s. InitGroup -> CompilerM op s ()
GC.decl [C.cdecl|$ty:t $id:val;|]
  Stm -> CompilerM op () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|if ((err = gpu_scalar_from_device(ctx, &$id:val, $exp:mem, $exp:i * sizeof($ty:t), sizeof($ty:t))) != 0) { goto cleanup; }|]
  Stm -> CompilerM op () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|if (ctx->failure_is_an_option && futhark_context_sync(ctx) != 0)
            { err = 1; goto cleanup; }|]
  Exp -> CompilerM op () Exp
forall a. a -> CompilerM op () a
forall (f :: * -> *) a. Applicative f => a -> f a
pure [C.cexp|$id:val|]
readScalarGPU Exp
_ Exp
_ Type
_ String
space Volatility
_ =
  String -> CompilerM op () Exp
forall a. HasCallStack => String -> a
error (String -> CompilerM op () Exp) -> String -> CompilerM op () Exp
forall a b. (a -> b) -> a -> b
$ String
"Cannot read from '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' memory space."

-- TODO: Optimised special case when the scalar is a constant, in
-- which case we can do the write asynchronously.
writeScalarGPU :: GC.WriteScalar op ()
writeScalarGPU :: forall op. WriteScalar op ()
writeScalarGPU Exp
mem Exp
i Type
t String
"device" Volatility
_ Exp
val = do
  VName
val' <- String -> CompilerM op () VName
forall (m :: * -> *). MonadFreshNames m => String -> m VName
newVName String
"write_tmp"
  BlockItem -> CompilerM op () ()
forall op s. BlockItem -> CompilerM op s ()
GC.item [C.citem|$ty:t $id:val' = $exp:val;|]
  Stm -> CompilerM op () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|if ((err = gpu_scalar_to_device(ctx, $exp:mem, $exp:i * sizeof($ty:t), sizeof($ty:t), &$id:val')) != 0) { goto cleanup; }|]
writeScalarGPU Exp
_ Exp
_ Type
_ String
space Volatility
_ Exp
_ =
  String -> CompilerM op () ()
forall a. HasCallStack => String -> a
error (String -> CompilerM op () ()) -> String -> CompilerM op () ()
forall a b. (a -> b) -> a -> b
$ String
"Cannot write to '" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
space String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"' memory space."

syncArg :: GC.CopyBarrier -> C.Exp
syncArg :: CopyBarrier -> Exp
syncArg CopyBarrier
GC.CopyBarrier = [C.cexp|true|]
syncArg CopyBarrier
GC.CopyNoBarrier = [C.cexp|false|]

copyGPU :: GC.Copy OpenCL ()
copyGPU :: Copy OpenCL ()
copyGPU CopyBarrier
_ Exp
dstmem Exp
dstidx (Space String
"device") Exp
srcmem Exp
srcidx (Space String
"device") Exp
nbytes =
  Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|err = gpu_memcpy(ctx, $exp:dstmem, $exp:dstidx, $exp:srcmem, $exp:srcidx, $exp:nbytes);|]
copyGPU CopyBarrier
b Exp
dstmem Exp
dstidx Space
DefaultSpace Exp
srcmem Exp
srcidx (Space String
"device") Exp
nbytes =
  Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|err = memcpy_gpu2host(ctx, $exp:(syncArg b), $exp:dstmem, $exp:dstidx, $exp:srcmem, $exp:srcidx, $exp:nbytes);|]
copyGPU CopyBarrier
b Exp
dstmem Exp
dstidx (Space String
"device") Exp
srcmem Exp
srcidx Space
DefaultSpace Exp
nbytes =
  Stm -> CompilerM OpenCL () ()
forall op s. Stm -> CompilerM op s ()
GC.stm
    [C.cstm|err = memcpy_host2gpu(ctx, $exp:(syncArg b), $exp:dstmem, $exp:dstidx, $exp:srcmem, $exp:srcidx, $exp:nbytes);|]
copyGPU CopyBarrier
_ Exp
_ Exp
_ Space
destspace Exp
_ Exp
_ Space
srcspace Exp
_ =
  String -> CompilerM OpenCL () ()
forall a. HasCallStack => String -> a
error (String -> CompilerM OpenCL () ())
-> String -> CompilerM OpenCL () ()
forall a b. (a -> b) -> a -> b
$ String
"Cannot copy to " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Space -> String
forall a. Show a => a -> String
show Space
destspace String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
" from " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Space -> String
forall a. Show a => a -> String
show Space
srcspace

gpuOperations :: GC.Operations OpenCL ()
gpuOperations :: Operations OpenCL ()
gpuOperations =
  Operations OpenCL ()
forall op s. Operations op s
GC.defaultOperations
    { opsCompiler :: OpCompiler OpenCL ()
GC.opsCompiler = OpCompiler OpenCL ()
callKernel,
      opsWriteScalar :: WriteScalar OpenCL ()
GC.opsWriteScalar = WriteScalar OpenCL ()
forall op. WriteScalar op ()
writeScalarGPU,
      opsReadScalar :: ReadScalar OpenCL ()
GC.opsReadScalar = ReadScalar OpenCL ()
forall op. ReadScalar op ()
readScalarGPU,
      opsAllocate :: Allocate OpenCL ()
GC.opsAllocate = Allocate OpenCL ()
forall op. Allocate op ()
allocateGPU,
      opsDeallocate :: Allocate OpenCL ()
GC.opsDeallocate = Allocate OpenCL ()
forall op. Allocate op ()
deallocateGPU,
      opsCopy :: Copy OpenCL ()
GC.opsCopy = Copy OpenCL ()
copyGPU,
      opsCopies :: Map (Space, Space) (DoLMADCopy OpenCL ())
GC.opsCopies = Map (Space, Space) (DoLMADCopy OpenCL ())
forall op s. Map (Space, Space) (DoLMADCopy op s)
gpuCopies Map (Space, Space) (DoLMADCopy OpenCL ())
-> Map (Space, Space) (DoLMADCopy OpenCL ())
-> Map (Space, Space) (DoLMADCopy OpenCL ())
forall a. Semigroup a => a -> a -> a
<> Operations OpenCL () -> Map (Space, Space) (DoLMADCopy OpenCL ())
forall op s.
Operations op s -> Map (Space, Space) (DoLMADCopy op s)
GC.opsCopies Operations OpenCL ()
forall op s. Operations op s
GC.defaultOperations,
      opsFatMemory :: Bool
GC.opsFatMemory = Bool
True
    }

-- | Options that are common to multiple GPU-like backends.
gpuOptions :: [Option]
gpuOptions :: [Option]
gpuOptions =
  [ Option
      { optionLongName :: String
optionLongName = String
"device",
        optionShortName :: Maybe Char
optionShortName = Char -> Maybe Char
forall a. a -> Maybe a
Just Char
'd',
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"NAME",
        optionDescription :: String
optionDescription = String
"Use the first device whose name contains the given string.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_device(cfg, optarg);|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-group-size",
        optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default size of workgroups that are launched.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_group_size(cfg, atoi(optarg));|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-num-groups",
        optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default number of workgroups that are launched.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_num_groups(cfg, atoi(optarg));|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-tile-size",
        optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default tile size used when performing two-dimensional tiling.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_tile_size(cfg, atoi(optarg));|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-reg-tile-size",
        optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default register tile size used when performing two-dimensional tiling.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_reg_tile_size(cfg, atoi(optarg));|]
      },
    Option
      { optionLongName :: String
optionLongName = String
"default-threshold",
        optionShortName :: Maybe Char
optionShortName = Maybe Char
forall a. Maybe a
Nothing,
        optionArgument :: OptionArgument
optionArgument = String -> OptionArgument
RequiredArgument String
"INT",
        optionDescription :: String
optionDescription = String
"The default parallelism threshold.",
        optionAction :: Stm
optionAction = [C.cstm|futhark_context_config_set_default_threshold(cfg, atoi(optarg));|]
      }
  ]

errorMsgNumArgs :: ErrorMsg a -> Int
errorMsgNumArgs :: forall a. ErrorMsg a -> Int
errorMsgNumArgs = [PrimType] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length ([PrimType] -> Int)
-> (ErrorMsg a -> [PrimType]) -> ErrorMsg a -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ErrorMsg a -> [PrimType]
forall a. ErrorMsg a -> [PrimType]
errorMsgArgTypes

failureMsgFunction :: [FailureMsg] -> C.Definition
failureMsgFunction :: [FailureMsg] -> Definition
failureMsgFunction [FailureMsg]
failures =
  let printfEscape :: String -> String
printfEscape =
        let escapeChar :: Char -> String
escapeChar Char
'%' = String
"%%"
            escapeChar Char
c = [Char
c]
         in (Char -> String) -> String -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Char -> String
escapeChar
      onPart :: ErrorMsgPart a -> String
onPart (ErrorString Text
s) = String -> String
printfEscape (String -> String) -> String -> String
forall a b. (a -> b) -> a -> b
$ Text -> String
T.unpack Text
s
      -- FIXME: bogus for non-ints.
      onPart ErrorVal {} = String
"%lld"
      onFailure :: a -> FailureMsg -> Stm
onFailure a
i (FailureMsg emsg :: ErrorMsg Exp
emsg@(ErrorMsg [ErrorMsgPart Exp]
parts) String
backtrace) =
        let msg :: String
msg = (ErrorMsgPart Exp -> String) -> [ErrorMsgPart Exp] -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ErrorMsgPart Exp -> String
forall {a}. ErrorMsgPart a -> String
onPart [ErrorMsgPart Exp]
parts String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
"\n" String -> String -> String
forall a. [a] -> [a] -> [a]
++ String -> String
printfEscape String
backtrace
            msgargs :: [Exp]
msgargs = [[C.cexp|args[$int:j]|] | Int
j <- [Int
0 .. ErrorMsg Exp -> Int
forall a. ErrorMsg a -> Int
errorMsgNumArgs ErrorMsg Exp
emsg Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1]]
         in [C.cstm|case $int:i: {return msgprintf($string:msg, $args:msgargs); break;}|]
      failure_cases :: [Stm]
failure_cases =
        (Int -> FailureMsg -> Stm) -> [Int] -> [FailureMsg] -> [Stm]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Int -> FailureMsg -> Stm
forall {a}. (Show a, Integral a) => a -> FailureMsg -> Stm
onFailure [(Int
0 :: Int) ..] [FailureMsg]
failures
   in [C.cedecl|static char* get_failure_msg(int failure_idx, typename int64_t args[]) {
                  (void)args;
                  switch (failure_idx) { $stms:failure_cases }
                  return strdup("Unknown error.  This is a compiler bug.");
                }|]

-- | Called after most code has been generated to generate the bulk of
-- the boilerplate.
generateGPUBoilerplate ::
  T.Text ->
  T.Text ->
  [Name] ->
  [PrimType] ->
  [FailureMsg] ->
  GC.CompilerM OpenCL () ()
generateGPUBoilerplate :: Text
-> Text
-> [Name]
-> [PrimType]
-> [FailureMsg]
-> CompilerM OpenCL () ()
generateGPUBoilerplate Text
gpu_program Text
backendH [Name]
kernels [PrimType]
types [FailureMsg]
failures = do
  [Name] -> CompilerM OpenCL () ()
forall op s. [Name] -> CompilerM op s ()
createKernels [Name]
kernels
  let gpu_program_fragments :: [Initializer]
gpu_program_fragments =
        -- Some C compilers limit the size of literal strings, so
        -- chunk the entire program into small bits here, and
        -- concatenate it again at runtime.
        [[C.cinit|$string:s|] | String
s <- Int -> String -> [String]
forall a. Int -> [a] -> [[a]]
chunk Int
2000 (String -> [String]) -> String -> [String]
forall a b. (a -> b) -> a -> b
$ Text -> String
T.unpack Text
gpu_program]
      program_fragments :: [Initializer]
program_fragments = [Initializer]
gpu_program_fragments [Initializer] -> [Initializer] -> [Initializer]
forall a. [a] -> [a] -> [a]
++ [[C.cinit|NULL|]]
      f64_required :: Exp
f64_required
        | FloatType -> PrimType
FloatType FloatType
Float64 PrimType -> [PrimType] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [PrimType]
types = [C.cexp|1|]
        | Bool
otherwise = [C.cexp|0|]
      max_failure_args :: Int
max_failure_args = (Int -> Int -> Int) -> Int -> [Int] -> Int
forall b a. (b -> a -> b) -> b -> [a] -> b
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl Int -> Int -> Int
forall a. Ord a => a -> a -> a
max Int
0 ([Int] -> Int) -> [Int] -> Int
forall a b. (a -> b) -> a -> b
$ (FailureMsg -> Int) -> [FailureMsg] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (ErrorMsg Exp -> Int
forall a. ErrorMsg a -> Int
errorMsgNumArgs (ErrorMsg Exp -> Int)
-> (FailureMsg -> ErrorMsg Exp) -> FailureMsg -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. FailureMsg -> ErrorMsg Exp
failureError) [FailureMsg]
failures
  (Definition -> CompilerM OpenCL () ())
-> [Definition] -> CompilerM OpenCL () ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_
    Definition -> CompilerM OpenCL () ()
forall op s. Definition -> CompilerM op s ()
GC.earlyDecl
    [C.cunit|static const int max_failure_args = $int:max_failure_args;
             static const int f64_required = $exp:f64_required;
             static const char *gpu_program[] = {$inits:program_fragments};
             $esc:(T.unpack gpuPrototypesH)
             $esc:(T.unpack backendH)
             $esc:(T.unpack gpuH)
            |]
  Definition -> CompilerM OpenCL () ()
forall op s. Definition -> CompilerM op s ()
GC.earlyDecl (Definition -> CompilerM OpenCL () ())
-> Definition -> CompilerM OpenCL () ()
forall a b. (a -> b) -> a -> b
$ [FailureMsg] -> Definition
failureMsgFunction [FailureMsg]
failures

  CompilerM OpenCL () ()
forall op s. CompilerM op s ()
GC.generateProgramStruct

  BlockItem -> CompilerM OpenCL () ()
forall op s. BlockItem -> CompilerM op s ()
GC.onClear [C.citem|if (ctx->error == NULL) { gpu_free_all(ctx); }|]