{-# LANGUAGE BangPatterns        #-}
{-# LANGUAGE CPP                 #-}
{-# LANGUAGE DataKinds           #-}
{-# LANGUAGE GADTs               #-}
{-# LANGUAGE MagicHash           #-}
{-# LANGUAGE RecordWildCards     #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeOperators       #-}
{-# LANGUAGE UnboxedTuples       #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.PTX.Array.Prim
-- Copyright   : [2014..2020] The Accelerate Team
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell <trevor.mcdonell@gmail.com>
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.PTX.Array.Prim (

  mallocArray,
  useArrayAsync,
  indexArrayAsync,
  peekArrayAsync,
  pokeArrayAsync,
  copyArrayAsync,
  -- copyArrayPeerAsync,
  memsetArrayAsync,
  withDevicePtr,

) where

import Data.Array.Accelerate.Array.Data
import Data.Array.Accelerate.Array.Unique
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Lifetime                           hiding ( withLifetime )
import Data.Array.Accelerate.Representation.Elt
import Data.Array.Accelerate.Representation.Type
import Data.Array.Accelerate.Type

import Data.Array.Accelerate.LLVM.State

import Data.Array.Accelerate.LLVM.PTX.Target
import Data.Array.Accelerate.LLVM.PTX.Execute.Async
import Data.Array.Accelerate.LLVM.PTX.Execute.Event
import Data.Array.Accelerate.LLVM.PTX.Execute.Stream
import Data.Array.Accelerate.LLVM.PTX.Array.Remote              as Remote
import qualified Data.Array.Accelerate.LLVM.PTX.Debug           as Debug

import qualified Foreign.CUDA.Driver                            as CUDA
import qualified Foreign.CUDA.Driver.Stream                     as CUDA

import Control.Monad
import Control.Monad.Reader
import Data.IORef
import GHC.Base
import Text.Printf
import Prelude


-- | Allocate a device-side array associated with the given host array. If the
-- allocation fails due to a memory error, we attempt some last-ditch memory
-- cleanup before trying again. If it still fails; error.
--
{-# INLINEABLE mallocArray #-}
mallocArray
    :: HasCallStack
    => SingleType e
    -> Int
    -> ArrayData e
    -> LLVM PTX ()
mallocArray :: SingleType e -> Int -> ArrayData e -> LLVM PTX ()
mallocArray !SingleType e
t !Int
n !ArrayData e
ad = do
  String -> LLVM PTX ()
forall (m :: * -> *). MonadIO m => String -> m ()
message (String
"mallocArray: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
showBytes (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
* TypeR e -> Int
forall e. TypeR e -> Int
bytesElt (ScalarType e -> TypeR e
forall (s :: * -> *) a. s a -> TupR s a
TupRsingle (SingleType e -> ScalarType e
forall a. SingleType a -> ScalarType a
SingleScalarType SingleType e
t))))
  LLVM PTX Bool -> LLVM PTX ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (LLVM PTX Bool -> LLVM PTX ()) -> LLVM PTX Bool -> LLVM PTX ()
forall a b. (a -> b) -> a -> b
$ SingleType e -> ArrayData e -> Int -> Bool -> LLVM PTX Bool
forall e.
SingleType e -> ArrayData e -> Int -> Bool -> LLVM PTX Bool
Remote.malloc SingleType e
t ArrayData e
ad Int
n Bool
False


-- | A combination of 'mallocArray' and 'pokeArray', that allocates remotes
-- memory and uploads an existing array. This is specialised because we tell the
-- allocator that the host-side array is frozen, and thus it is safe to evict
-- the remote memory and re-upload the data at any time.
--
{-# INLINEABLE useArrayAsync #-}
useArrayAsync
    :: HasCallStack
    => SingleType e
    -> Int
    -> ArrayData e
    -> Par PTX (Future (ArrayData e))
useArrayAsync :: SingleType e
-> Int -> ArrayData e -> Par PTX (Future (ArrayData e))
useArrayAsync !SingleType e
t !Int
n !ArrayData e
ad = do
  String -> Par PTX ()
forall (m :: * -> *). MonadIO m => String -> m ()
message (String
"useArrayAsync: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ Int -> String
showBytes (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
* TypeR e -> Int
forall e. TypeR e -> Int
bytesElt (ScalarType e -> TypeR e
forall (s :: * -> *) a. s a -> TupR s a
TupRsingle (SingleType e -> ScalarType e
forall a. SingleType a -> ScalarType a
SingleScalarType SingleType e
t))))
  Bool
alloc <- LLVM PTX Bool -> Par PTX Bool
forall arch a.
(Async arch, HasCallStack) =>
LLVM arch a -> Par arch a
liftPar (LLVM PTX Bool -> Par PTX Bool) -> LLVM PTX Bool -> Par PTX Bool
forall a b. (a -> b) -> a -> b
$ SingleType e -> ArrayData e -> Int -> Bool -> LLVM PTX Bool
forall e.
SingleType e -> ArrayData e -> Int -> Bool -> LLVM PTX Bool
Remote.malloc SingleType e
t ArrayData e
ad Int
n Bool
True
  if Bool
alloc
    then SingleType e
-> Int -> ArrayData e -> Par PTX (Future (ArrayData e))
forall e.
HasCallStack =>
SingleType e
-> Int -> ArrayData e -> Par PTX (Future (ArrayData e))
pokeArrayAsync SingleType e
t Int
n ArrayData e
ad
    else ArrayData e -> Par PTX (FutureR PTX (ArrayData e))
forall arch a.
(Async arch, HasCallStack) =>
a -> Par arch (FutureR arch a)
newFull ArrayData e
ad


-- | Copy data from the host to an existing array on the device
--
{-# INLINEABLE pokeArrayAsync #-}
pokeArrayAsync
    :: HasCallStack
    => SingleType e
    -> Int
    -> ArrayData e
    -> Par PTX (Future (ArrayData e))
pokeArrayAsync :: SingleType e
-> Int -> ArrayData e -> Par PTX (Future (ArrayData e))
pokeArrayAsync !SingleType e
t !Int
n !ArrayData e
ad
  | SingleArrayDict e
SingleArrayDict <- SingleType e -> SingleArrayDict e
forall a. SingleType a -> SingleArrayDict a
singleArrayDict SingleType e
t
  , SingleDict e
SingleDict      <- SingleType e -> SingleDict e
forall a. SingleType a -> SingleDict a
singleDict SingleType e
t
  = do
    let !src :: HostPtr e
src   = Ptr e -> HostPtr e
forall a. Ptr a -> HostPtr a
CUDA.HostPtr (UniqueArray e -> Ptr e
forall a. UniqueArray a -> Ptr a
unsafeUniqueArrayPtr ArrayData e
UniqueArray e
ad)
        !bytes :: Int
bytes = Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
* TypeR e -> Int
forall e. TypeR e -> Int
bytesElt (ScalarType e -> TypeR e
forall (s :: * -> *) a. s a -> TupR s a
TupRsingle (SingleType e -> ScalarType e
forall a. SingleType a -> ScalarType a
SingleScalarType SingleType e
t))
    --
    Stream
stream <- (ParState -> Stream) -> Par PTX Stream
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks ParState -> Stream
ptxStream
    Future (UniqueArray e)
result <- LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall arch a.
(Async arch, HasCallStack) =>
LLVM arch a -> Par arch a
liftPar (LLVM PTX (Future (UniqueArray e))
 -> Par PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$
      Stream
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall (m :: * -> *) a b.
MonadIO m =>
Lifetime a -> (a -> m b) -> m b
withLifetime Stream
stream ((Stream -> LLVM PTX (Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \Stream
st  ->
        SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall e r.
HasCallStack =>
SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX r
withDevicePtr SingleType e
t ArrayData e
ad ((DevicePtr (ScalarArrayDataR e)
  -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \DevicePtr (ScalarArrayDataR e)
dst ->
          Stream
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a. Stream -> LLVM PTX a -> LLVM PTX (Maybe Event, Future a)
nonblocking Stream
stream (LLVM PTX (UniqueArray e)
 -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ do
            String -> Int -> Maybe Stream -> IO () -> LLVM PTX ()
forall (m :: * -> *).
MonadIO m =>
String -> Int -> Maybe Stream -> IO () -> m ()
transfer String
"pokeArray" Int
bytes (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st) (IO () -> LLVM PTX ()) -> IO () -> LLVM PTX ()
forall a b. (a -> b) -> a -> b
$ do
              Int -> HostPtr e -> DevicePtr e -> Maybe Stream -> IO ()
forall a.
Storable a =>
Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO ()
CUDA.pokeArrayAsync Int
n HostPtr e
src DevicePtr e
DevicePtr (ScalarArrayDataR e)
dst (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st)
              Int64 -> IO ()
Debug.didCopyBytesToRemote (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
bytes)
            UniqueArray e -> LLVM PTX (UniqueArray e)
forall (m :: * -> *) a. Monad m => a -> m a
return ArrayData e
UniqueArray e
ad
    --
    Future (UniqueArray e) -> Par PTX (Future (UniqueArray e))
forall (m :: * -> *) a. Monad m => a -> m a
return Future (UniqueArray e)
result


-- | Read an element from an array at the given row-major index.
--
-- This copies the data via a temporary array on the host, so that packed AoS
-- elements can be copied in a single transfer.
--
{-# INLINEABLE indexArrayAsync #-}
indexArrayAsync
    :: HasCallStack
    => Int              -- actual number of values per element (i.e. this is >1 for SIMD types)
    -> SingleType e
    -> ArrayData e
    -> Int              -- element index
    -> Par PTX (Future (ArrayData e))
indexArrayAsync :: Int
-> SingleType e
-> ArrayData e
-> Int
-> Par PTX (Future (ArrayData e))
indexArrayAsync !Int
n !SingleType e
t !ArrayData e
ad_src !Int
i
  | SingleArrayDict e
SingleArrayDict <- SingleType e -> SingleArrayDict e
forall a. SingleType a -> SingleArrayDict a
singleArrayDict SingleType e
t
  , SingleDict e
SingleDict      <- SingleType e -> SingleDict e
forall a. SingleType a -> SingleDict a
singleDict SingleType e
t
  = do
    UniqueArray e
ad_dst <- IO (UniqueArray e) -> Par PTX (UniqueArray e)
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO (UniqueArray e) -> Par PTX (UniqueArray e))
-> IO (UniqueArray e) -> Par PTX (UniqueArray e)
forall a b. (a -> b) -> a -> b
$ TupR ScalarType e -> Int -> IO (ArrayData e)
forall e.
HasCallStack =>
TupR ScalarType e -> Int -> IO (MutableArrayData e)
newArrayData (ScalarType e -> TupR ScalarType e
forall (s :: * -> *) a. s a -> TupR s a
TupRsingle (ScalarType e -> TupR ScalarType e)
-> ScalarType e -> TupR ScalarType e
forall a b. (a -> b) -> a -> b
$ SingleType e -> ScalarType e
forall a. SingleType a -> ScalarType a
SingleScalarType SingleType e
t) Int
n
    let !bytes :: Int
bytes = Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
* TupR ScalarType e -> Int
forall e. TypeR e -> Int
bytesElt (ScalarType e -> TupR ScalarType e
forall (s :: * -> *) a. s a -> TupR s a
TupRsingle (SingleType e -> ScalarType e
forall a. SingleType a -> ScalarType a
SingleScalarType SingleType e
t))
        !dst :: HostPtr e
dst   = Ptr e -> HostPtr e
forall a. Ptr a -> HostPtr a
CUDA.HostPtr (UniqueArray e -> Ptr e
forall a. UniqueArray a -> Ptr a
unsafeUniqueArrayPtr UniqueArray e
ad_dst)
    --
    Stream
stream <- (ParState -> Stream) -> Par PTX Stream
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks ParState -> Stream
ptxStream
    Future (UniqueArray e)
result <- LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall arch a.
(Async arch, HasCallStack) =>
LLVM arch a -> Par arch a
liftPar (LLVM PTX (Future (UniqueArray e))
 -> Par PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$
      Stream
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall (m :: * -> *) a b.
MonadIO m =>
Lifetime a -> (a -> m b) -> m b
withLifetime Stream
stream  ((Stream -> LLVM PTX (Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \Stream
st  ->
      SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall e r.
HasCallStack =>
SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX r
withDevicePtr SingleType e
t ArrayData e
ad_src ((DevicePtr (ScalarArrayDataR e)
  -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \DevicePtr (ScalarArrayDataR e)
src ->
        Stream
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a. Stream -> LLVM PTX a -> LLVM PTX (Maybe Event, Future a)
nonblocking Stream
stream (LLVM PTX (UniqueArray e)
 -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ do
          String -> Int -> Maybe Stream -> IO () -> LLVM PTX ()
forall (m :: * -> *).
MonadIO m =>
String -> Int -> Maybe Stream -> IO () -> m ()
transfer String
"indexArray" Int
bytes (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st) (IO () -> LLVM PTX ()) -> IO () -> LLVM PTX ()
forall a b. (a -> b) -> a -> b
$ do
            Int -> DevicePtr e -> HostPtr e -> Maybe Stream -> IO ()
forall a.
Storable a =>
Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO ()
CUDA.peekArrayAsync Int
n (DevicePtr e
DevicePtr (ScalarArrayDataR e)
src DevicePtr e -> Int -> DevicePtr e
forall a. Storable a => DevicePtr a -> Int -> DevicePtr a
`CUDA.advanceDevPtr` (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
*Int
n)) HostPtr e
dst (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st)
            Int64 -> IO ()
Debug.didCopyBytesFromRemote (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
bytes)
          UniqueArray e -> LLVM PTX (UniqueArray e)
forall (m :: * -> *) a. Monad m => a -> m a
return UniqueArray e
ad_dst
    --
    Future (UniqueArray e) -> Par PTX (Future (UniqueArray e))
forall (m :: * -> *) a. Monad m => a -> m a
return Future (UniqueArray e)
result


-- | Copy data from the device into the associated host-side Accelerate array
--
{-# INLINEABLE peekArrayAsync #-}
peekArrayAsync
    :: HasCallStack
    => SingleType e
    -> Int
    -> ArrayData e
    -> Par PTX (Future (ArrayData e))
peekArrayAsync :: SingleType e
-> Int -> ArrayData e -> Par PTX (Future (ArrayData e))
peekArrayAsync !SingleType e
t !Int
n !ArrayData e
ad
  | SingleArrayDict e
SingleArrayDict <- SingleType e -> SingleArrayDict e
forall a. SingleType a -> SingleArrayDict a
singleArrayDict SingleType e
t
  , SingleDict e
SingleDict      <- SingleType e -> SingleDict e
forall a. SingleType a -> SingleDict a
singleDict SingleType e
t
  = do
    let !bytes :: Int
bytes = Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
* TypeR e -> Int
forall e. TypeR e -> Int
bytesElt (ScalarType e -> TypeR e
forall (s :: * -> *) a. s a -> TupR s a
TupRsingle (SingleType e -> ScalarType e
forall a. SingleType a -> ScalarType a
SingleScalarType SingleType e
t))
        !dst :: HostPtr e
dst   = Ptr e -> HostPtr e
forall a. Ptr a -> HostPtr a
CUDA.HostPtr (UniqueArray e -> Ptr e
forall a. UniqueArray a -> Ptr a
unsafeUniqueArrayPtr ArrayData e
UniqueArray e
ad)
    --
    Stream
stream <- (ParState -> Stream) -> Par PTX Stream
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks ParState -> Stream
ptxStream
    Future (UniqueArray e)
result <- LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall arch a.
(Async arch, HasCallStack) =>
LLVM arch a -> Par arch a
liftPar (LLVM PTX (Future (UniqueArray e))
 -> Par PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$
      Stream
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall (m :: * -> *) a b.
MonadIO m =>
Lifetime a -> (a -> m b) -> m b
withLifetime Stream
stream ((Stream -> LLVM PTX (Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \Stream
st  ->
        SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall e r.
HasCallStack =>
SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX r
withDevicePtr SingleType e
t ArrayData e
ad  ((DevicePtr (ScalarArrayDataR e)
  -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \DevicePtr (ScalarArrayDataR e)
src ->
          Stream
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a. Stream -> LLVM PTX a -> LLVM PTX (Maybe Event, Future a)
nonblocking Stream
stream (LLVM PTX (UniqueArray e)
 -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ do
            String -> Int -> Maybe Stream -> IO () -> LLVM PTX ()
forall (m :: * -> *).
MonadIO m =>
String -> Int -> Maybe Stream -> IO () -> m ()
transfer String
"peekArray" Int
bytes (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st) (IO () -> LLVM PTX ()) -> IO () -> LLVM PTX ()
forall a b. (a -> b) -> a -> b
$ do
              Int -> DevicePtr e -> HostPtr e -> Maybe Stream -> IO ()
forall a.
Storable a =>
Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO ()
CUDA.peekArrayAsync Int
n DevicePtr e
DevicePtr (ScalarArrayDataR e)
src HostPtr e
dst (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st)
              Int64 -> IO ()
Debug.didCopyBytesFromRemote (Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
bytes)
            UniqueArray e -> LLVM PTX (UniqueArray e)
forall (m :: * -> *) a. Monad m => a -> m a
return ArrayData e
UniqueArray e
ad
    --
    Future (UniqueArray e) -> Par PTX (Future (UniqueArray e))
forall (m :: * -> *) a. Monad m => a -> m a
return Future (UniqueArray e)
result


-- | Copy data between arrays in the same context
--
{-# INLINEABLE copyArrayAsync #-}
copyArrayAsync
    :: HasCallStack
    => SingleType e
    -> Int
    -> ArrayData e
    -> ArrayData e
    -> Par PTX (Future (ArrayData e))
copyArrayAsync :: SingleType e
-> Int
-> ArrayData e
-> ArrayData e
-> Par PTX (Future (ArrayData e))
copyArrayAsync !SingleType e
t !Int
n !ArrayData e
ad_src !ArrayData e
ad_dst
  | SingleArrayDict e
SingleArrayDict <- SingleType e -> SingleArrayDict e
forall a. SingleType a -> SingleArrayDict a
singleArrayDict SingleType e
t
  , SingleDict e
SingleDict      <- SingleType e -> SingleDict e
forall a. SingleType a -> SingleDict a
singleDict SingleType e
t
  = do
    let !bytes :: Int
bytes = Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
* TypeR e -> Int
forall e. TypeR e -> Int
bytesElt (ScalarType e -> TypeR e
forall (s :: * -> *) a. s a -> TupR s a
TupRsingle (SingleType e -> ScalarType e
forall a. SingleType a -> ScalarType a
SingleScalarType SingleType e
t))
    --
    Stream
stream <- (ParState -> Stream) -> Par PTX Stream
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks ParState -> Stream
ptxStream
    Future (UniqueArray e)
result <- LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall arch a.
(Async arch, HasCallStack) =>
LLVM arch a -> Par arch a
liftPar (LLVM PTX (Future (UniqueArray e))
 -> Par PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$
      Stream
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall (m :: * -> *) a b.
MonadIO m =>
Lifetime a -> (a -> m b) -> m b
withLifetime Stream
stream        ((Stream -> LLVM PTX (Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \Stream
st ->
        SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall e r.
HasCallStack =>
SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX r
withDevicePtr SingleType e
t ArrayData e
ad_src   ((DevicePtr (ScalarArrayDataR e)
  -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \DevicePtr (ScalarArrayDataR e)
src ->
          SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, (Maybe Event, Future (UniqueArray e))))
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall e r.
HasCallStack =>
SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX r
withDevicePtr SingleType e
t ArrayData e
ad_dst ((DevicePtr (ScalarArrayDataR e)
  -> LLVM PTX (Maybe Event, (Maybe Event, Future (UniqueArray e))))
 -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, (Maybe Event, Future (UniqueArray e))))
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \DevicePtr (ScalarArrayDataR e)
dst -> do
            (Maybe Event
e,Future (UniqueArray e)
r) <- Stream
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a. Stream -> LLVM PTX a -> LLVM PTX (Maybe Event, Future a)
nonblocking Stream
stream (LLVM PTX (UniqueArray e)
 -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ do
                      String -> Int -> Maybe Stream -> IO () -> LLVM PTX ()
forall (m :: * -> *).
MonadIO m =>
String -> Int -> Maybe Stream -> IO () -> m ()
transfer String
"copyArray" Int
bytes (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st) (IO () -> LLVM PTX ()) -> IO () -> LLVM PTX ()
forall a b. (a -> b) -> a -> b
$ Int -> DevicePtr e -> DevicePtr e -> Maybe Stream -> IO ()
forall a.
Storable a =>
Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO ()
CUDA.copyArrayAsync Int
n DevicePtr e
DevicePtr (ScalarArrayDataR e)
src DevicePtr e
DevicePtr (ScalarArrayDataR e)
dst (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st)
                      UniqueArray e -> LLVM PTX (UniqueArray e)
forall (m :: * -> *) a. Monad m => a -> m a
return ArrayData e
UniqueArray e
ad_dst
            (Maybe Event, (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Maybe Event, (Maybe Event, Future (UniqueArray e)))
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Event
e, (Maybe Event
e,Future (UniqueArray e)
r))
    --
    Future (UniqueArray e) -> Par PTX (Future (UniqueArray e))
forall (m :: * -> *) a. Monad m => a -> m a
return Future (UniqueArray e)
result


{--
-- | Copy data from one device context into a _new_ array on the second context.
-- It is an error if the destination array already exists.
--
{-# INLINEABLE copyArrayPeerAsync #-}
copyArrayPeerAsync
    :: SingleType e
    -> Context                            -- destination context
    -> MemoryTable                        -- destination memory table
    -> Stream
    -> Int
    -> ArrayData e
    -> LLVM PTX ()
copyArrayPeerAsync = error "copyArrayPeerAsync"
{--
copyArrayPeerAsync !t !ctx2 !mt2 !st !n !ad = do
  let !bytes    = n * sizeOfSingleType t
  src   <- devicePtr mt1 ad
  dst   <- mallocArray ctx2 mt2 n ad
  transfer "copyArrayPeer" bytes (Just st) $
    CUDA.copyArrayPeerAsync n src (deviceContext ctx1) dst (deviceContext ctx2) (Just st)
--}

-- | Copy part of an array from one device context to another. Both source and
-- destination arrays must exist.
--
{-# INLINEABLE copyArrayPeerAsyncR #-}
copyArrayPeerAsync
    :: SingleType e
    -> Context                            -- destination context
    -> MemoryTable                        -- destination memory table
    -> Stream
    -> Int
    -> Int
    -> ArrayData e
    -> LLVM PTX ()
copyArrayPeerAsync = error "copyArrayPeerAsyncR"
{--
copyArrayPeerAsyncR !t !ctx2 !mt2 !st !from !n !ad = do
  let !bytes    = n    * sizeOfSingleType t
      !offset   = from * sizeOfSingleType t
  src <- devicePtr mt1 ad       :: IO (CUDA.DevicePtr a)
  dst <- devicePtr mt2 ad       :: IO (CUDA.DevicePtr a)
  transfer "copyArrayPeer" bytes (Just st) $
    CUDA.copyArrayPeerAsync n (src `CUDA.plusDevPtr` offset) (deviceContext ctx1)
                              (dst `CUDA.plusDevPtr` offset) (deviceContext ctx2) (Just st)
--}
--}

-- | Set elements of the array to the specified value. Only 8-, 16-, and 32-bit
-- values are supported.
--
{-# INLINEABLE memsetArrayAsync #-}
memsetArrayAsync
    :: HasCallStack
    => SingleType e
    -> Int
    -> ScalarArrayDataR e
    -> ArrayData e
    -> Par PTX (Future (ArrayData e))
memsetArrayAsync :: SingleType e
-> Int
-> ScalarArrayDataR e
-> ArrayData e
-> Par PTX (Future (ArrayData e))
memsetArrayAsync !SingleType e
t !Int
n !ScalarArrayDataR e
v !ArrayData e
ad
  | SingleArrayDict e
SingleArrayDict <- SingleType e -> SingleArrayDict e
forall a. SingleType a -> SingleArrayDict a
singleArrayDict SingleType e
t
  , SingleDict e
SingleDict      <- SingleType e -> SingleDict e
forall a. SingleType a -> SingleDict a
singleDict SingleType e
t
  = do
    let !bytes :: Int
bytes = Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
* TypeR e -> Int
forall e. TypeR e -> Int
bytesElt (ScalarType e -> TypeR e
forall (s :: * -> *) a. s a -> TupR s a
TupRsingle (SingleType e -> ScalarType e
forall a. SingleType a -> ScalarType a
SingleScalarType SingleType e
t))
    --
    Stream
stream <- (ParState -> Stream) -> Par PTX Stream
forall r (m :: * -> *) a. MonadReader r m => (r -> a) -> m a
asks ParState -> Stream
ptxStream
    Future (UniqueArray e)
result <- LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall arch a.
(Async arch, HasCallStack) =>
LLVM arch a -> Par arch a
liftPar (LLVM PTX (Future (UniqueArray e))
 -> Par PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
-> Par PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$
      Stream
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall (m :: * -> *) a b.
MonadIO m =>
Lifetime a -> (a -> m b) -> m b
withLifetime Stream
stream ((Stream -> LLVM PTX (Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (Stream -> LLVM PTX (Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \Stream
st  ->
        SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall e r.
HasCallStack =>
SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX r
withDevicePtr SingleType e
t ArrayData e
ad  ((DevicePtr (ScalarArrayDataR e)
  -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
 -> LLVM PTX (Future (UniqueArray e)))
-> (DevicePtr (ScalarArrayDataR e)
    -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ \DevicePtr (ScalarArrayDataR e)
ptr ->
          Stream
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a. Stream -> LLVM PTX a -> LLVM PTX (Maybe Event, Future a)
nonblocking Stream
stream (LLVM PTX (UniqueArray e)
 -> LLVM PTX (Maybe Event, Future (UniqueArray e)))
-> LLVM PTX (UniqueArray e)
-> LLVM PTX (Maybe Event, Future (UniqueArray e))
forall a b. (a -> b) -> a -> b
$ do
            String -> Int -> Maybe Stream -> IO () -> LLVM PTX ()
forall (m :: * -> *).
MonadIO m =>
String -> Int -> Maybe Stream -> IO () -> m ()
transfer String
"memset" Int
bytes (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st) (IO () -> LLVM PTX ()) -> IO () -> LLVM PTX ()
forall a b. (a -> b) -> a -> b
$ DevicePtr e -> Int -> e -> Maybe Stream -> IO ()
forall a.
Storable a =>
DevicePtr a -> Int -> a -> Maybe Stream -> IO ()
CUDA.memsetAsync DevicePtr e
DevicePtr (ScalarArrayDataR e)
ptr Int
n e
ScalarArrayDataR e
v (Stream -> Maybe Stream
forall a. a -> Maybe a
Just Stream
st)
            UniqueArray e -> LLVM PTX (UniqueArray e)
forall (m :: * -> *) a. Monad m => a -> m a
return ArrayData e
UniqueArray e
ad
    --
    Future (UniqueArray e) -> Par PTX (Future (UniqueArray e))
forall (m :: * -> *) a. Monad m => a -> m a
return Future (UniqueArray e)
result


-- Auxiliary
-- ---------

-- | Lookup the device memory associated with a given host array and do
-- something with it.
--
{-# INLINEABLE withDevicePtr #-}
withDevicePtr
    :: HasCallStack
    => SingleType e
    -> ArrayData e
    -> (CUDA.DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
    -> LLVM PTX r
withDevicePtr :: SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX r
withDevicePtr !SingleType e
t !ArrayData e
ad !DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r)
f = do
  Maybe r
mr <- SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX (Maybe r)
forall e r.
SingleType e
-> ArrayData e
-> (DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r))
-> LLVM PTX (Maybe r)
withRemote SingleType e
t ArrayData e
ad DevicePtr (ScalarArrayDataR e) -> LLVM PTX (Maybe Event, r)
f
  case Maybe r
mr of
    Maybe r
Nothing -> String -> LLVM PTX r
forall a. HasCallStack => String -> a
internalError String
"array does not exist on the device"
    Just r
r  -> r -> LLVM PTX r
forall (m :: * -> *) a. Monad m => a -> m a
return r
r

{--
-- | Lookup the device memory associated with a given host array
--
{-# INLINEABLE devicePtr #-}
devicePtr
    :: (ArrayElt e, ArrayPtrs e ~ Ptr a, Typeable a, Typeable b)
    => ArrayData e
    -> LLVM PTX (CUDA.DevicePtr b)
devicePtr !ad = do
  undefined
  {--
  mv <- Table.lookup mt ad
  case mv of
    Just v      -> return v
    Nothing     -> $internalError "devicePtr" "lost device memory"
  --}
--}

-- | Execute a (presumable asynchronous) operation and return the result
-- together with an event recorded immediately afterwards in the given stream.
--
{-# INLINE nonblocking #-}
nonblocking :: Stream -> LLVM PTX a -> LLVM PTX (Maybe Event, Future a)
nonblocking :: Stream -> LLVM PTX a -> LLVM PTX (Maybe Event, Future a)
nonblocking !Stream
stream !LLVM PTX a
action = do
  a
result <- LLVM PTX a
action
  Event
event  <- Stream -> LLVM PTX Event
waypoint Stream
stream
  Bool
ready  <- IO Bool -> LLVM PTX Bool
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (Event -> IO Bool
query Event
event)
  if Bool
ready
    then do
      Future a
future <- IORef (IVar a) -> Future a
forall a. IORef (IVar a) -> Future a
Future (IORef (IVar a) -> Future a)
-> LLVM PTX (IORef (IVar a)) -> LLVM PTX (Future a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> IO (IORef (IVar a)) -> LLVM PTX (IORef (IVar a))
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IVar a -> IO (IORef (IVar a))
forall a. a -> IO (IORef a)
newIORef (a -> IVar a
forall a. a -> IVar a
Full a
result))
      (Maybe Event, Future a) -> LLVM PTX (Maybe Event, Future a)
forall (m :: * -> *) a. Monad m => a -> m a
return (Maybe Event
forall a. Maybe a
Nothing, Future a
future)

    else do
      Future a
future <- IORef (IVar a) -> Future a
forall a. IORef (IVar a) -> Future a
Future (IORef (IVar a) -> Future a)
-> LLVM PTX (IORef (IVar a)) -> LLVM PTX (Future a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> IO (IORef (IVar a)) -> LLVM PTX (IORef (IVar a))
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IVar a -> IO (IORef (IVar a))
forall a. a -> IO (IORef a)
newIORef (Event -> Maybe (Lifetime FunctionTable) -> a -> IVar a
forall a. Event -> Maybe (Lifetime FunctionTable) -> a -> IVar a
Pending Event
event Maybe (Lifetime FunctionTable)
forall a. Maybe a
Nothing a
result))
      (Maybe Event, Future a) -> LLVM PTX (Maybe Event, Future a)
forall (m :: * -> *) a. Monad m => a -> m a
return (Event -> Maybe Event
forall a. a -> Maybe a
Just Event
event, Future a
future)

{-# INLINE withLifetime #-}
withLifetime :: MonadIO m => Lifetime a -> (a -> m b) -> m b
withLifetime :: Lifetime a -> (a -> m b) -> m b
withLifetime (Lifetime LTF
ref Weak LTF
_ a
a) a -> m b
f = do
  b
r <- a -> m b
f a
a
  IO () -> m ()
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (LTF -> IO ()
forall a. IORef a -> IO ()
touchIORef LTF
ref)
  b -> m b
forall (m :: * -> *) a. Monad m => a -> m a
return b
r

{-# INLINE touchIORef #-}
touchIORef :: IORef a -> IO ()
touchIORef :: IORef a -> IO ()
touchIORef IORef a
r = (State# RealWorld -> (# State# RealWorld, () #)) -> IO ()
forall a. (State# RealWorld -> (# State# RealWorld, a #)) -> IO a
IO ((State# RealWorld -> (# State# RealWorld, () #)) -> IO ())
-> (State# RealWorld -> (# State# RealWorld, () #)) -> IO ()
forall a b. (a -> b) -> a -> b
$ \State# RealWorld
s -> case IORef a -> State# RealWorld -> State# RealWorld
forall a. a -> State# RealWorld -> State# RealWorld
touch# IORef a
r State# RealWorld
s of State# RealWorld
s' -> (# State# RealWorld
s', () #)


-- Debug
-- -----

{-# INLINE showBytes #-}
showBytes :: Int -> String
showBytes :: Int -> String
showBytes Int
x = Maybe Int -> Double -> Double -> String -> String
forall a. RealFloat a => Maybe Int -> a -> a -> String -> String
Debug.showFFloatSIBase (Int -> Maybe Int
forall a. a -> Maybe a
Just Int
0) Double
1024 (Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
x :: Double) String
"B"

{-# INLINE trace #-}
trace :: MonadIO m => String -> m a -> m a
trace :: String -> m a -> m a
trace String
msg m a
next = IO () -> m ()
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (Flag -> String -> IO ()
Debug.traceIO Flag
Debug.dump_gc (String
"gc: " String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
msg)) m () -> m a -> m a
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> m a
next

{-# INLINE message #-}
message :: MonadIO m => String -> m ()
message :: String -> m ()
message String
s = String
s String -> m () -> m ()
forall (m :: * -> *) a. MonadIO m => String -> m a -> m a
`trace` () -> m ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()

{-# INLINE transfer #-}
transfer :: MonadIO m => String -> Int -> Maybe CUDA.Stream -> IO () -> m ()
transfer :: String -> Int -> Maybe Stream -> IO () -> m ()
transfer String
name Int
bytes Maybe Stream
stream IO ()
action =
  let showRate :: a -> a -> String
showRate a
x a
t      = Maybe Int -> a -> a -> String -> String
forall a. RealFloat a => Maybe Int -> a -> a -> String -> String
Debug.showFFloatSIBase (Int -> Maybe Int
forall a. a -> Maybe a
Just Int
3) a
1024 (a -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x a -> a -> a
forall a. Fractional a => a -> a -> a
/ a
t) String
"B/s"
      msg :: Double -> Double -> Double -> String
msg Double
wall Double
cpu Double
gpu  = String -> String -> String -> String -> String -> String
forall r. PrintfType r => String -> r
printf String
"gc: %s: %s @ %s, %s"
                            String
name
                            (Int -> String
showBytes Int
bytes)
                            (Int -> Double -> String
forall a a. (RealFloat a, Integral a) => a -> a -> String
showRate Int
bytes Double
wall)
                            (Double -> Double -> Double -> String
Debug.elapsed Double
wall Double
cpu Double
gpu)
  in
  IO () -> m ()
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (Flag
-> (Double -> Double -> Double -> String)
-> Maybe Stream
-> IO ()
-> IO ()
Debug.timed Flag
Debug.dump_gc Double -> Double -> Double -> String
msg Maybe Stream
stream IO ()
action)