{-# LANGUAGE GADTs #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} {-# OPTIONS_HADDOCK hide #-} -- | -- Module : Data.Array.Accelerate.LLVM.Array.Data -- Copyright : [2014..2017] Trevor L. McDonell -- [2014..2014] Vinod Grover (NVIDIA Corporation) -- License : BSD3 -- -- Maintainer : Trevor L. McDonell -- Stability : experimental -- Portability : non-portable (GHC extensions) -- module Data.Array.Accelerate.LLVM.Array.Data ( Remote(..), newRemote, useRemote, useRemoteAsync, copyToRemote, copyToRemoteAsync, copyToHost, copyToHostAsync, copyToPeer, copyToPeerAsync, runIndexArray, runArrays, runArray, module Data.Array.Accelerate.Array.Data, ) where -- accelerate import Data.Array.Accelerate.Array.Data import Data.Array.Accelerate.Array.Sugar import Data.Array.Accelerate.Type import Data.Array.Accelerate.LLVM.State import Data.Array.Accelerate.LLVM.Execute.Async -- standard library import Control.Monad ( liftM, liftM2 ) import Control.Monad.Trans import Data.Typeable import Foreign.C.Types import Foreign.Ptr import Foreign.Storable import Prelude class Async arch => Remote arch where -- | Allocate a new uninitialised array on the remote device. -- {-# INLINEABLE allocateRemote #-} allocateRemote :: (Shape sh, Elt e) => sh -> LLVM arch (Array sh e) allocateRemote sh = liftIO $ allocateArray sh -- | Use the given immutable array on the remote device. Since the source -- array is immutable, the allocator can evict and re-upload the data as -- necessary without copy-back. -- {-# INLINEABLE useRemoteR #-} useRemoteR :: (ArrayElt e, ArrayPtrs e ~ Ptr a, Storable a, Typeable a, Typeable e) => Int -- ^ number of elements to copy -> Maybe (StreamR arch) -- ^ execute synchronously w.r.t. this execution stream -> ArrayData e -- ^ array payload -> LLVM arch () useRemoteR _ _ _ = return () -- | Upload a section of an array from the host to the remote device. -- {-# INLINEABLE copyToRemoteR #-} copyToRemoteR :: (ArrayElt e, ArrayPtrs e ~ Ptr a, Storable a, Typeable a, Typeable e) => Int -- ^ index of first element to copy -> Int -- ^ number of elements to copy -> Maybe (StreamR arch) -- ^ execute synchronously w.r.t. this execution stream -> ArrayData e -- ^ array payload -> LLVM arch () copyToRemoteR _ _ _ _ = return () -- | Copy a section of an array from the remote device back to the host. -- {-# INLINEABLE copyToHostR #-} copyToHostR :: (ArrayElt e, ArrayPtrs e ~ Ptr a, Storable a, Typeable a, Typeable e) => Int -- ^ index of the first element to copy -> Int -- ^ number of elements to copy -> Maybe (StreamR arch) -- ^ execute synchronously w.r.t. this execution stream -> ArrayData e -- ^ array payload -> LLVM arch () copyToHostR _ _ _ _ = return () -- | Copy a section of an array between two remote instances of the same type. -- This may be more efficient than copying to the host and then to the second -- remote instance (e.g. DMA between two CUDA devices). The elements between -- the given indices (inclusive left, exclusive right) are transferred. -- {-# INLINEABLE copyToPeerR #-} copyToPeerR :: (ArrayElt e, ArrayPtrs e ~ Ptr a, Storable a, Typeable a, Typeable e) => Int -- ^ index of the first element to copy -> Int -- ^ number of elements to copy -> arch -- ^ remote device to copy to -> Maybe (StreamR arch) -- ^ execute synchronously w.r.t. this execution stream -> ArrayData e -- ^ array payload -> LLVM arch () copyToPeerR _ _ _ _ _ = return () -- | Read a single element from the array at a given row-major index -- {-# INLINEABLE indexRemote #-} indexRemote :: Array sh e -> Int -> LLVM arch e indexRemote (Array _ adata) i = return . toElt $! unsafeIndexArrayData adata i -- | Create a new array from its representation on the host, and upload it to -- a new remote array. -- {-# INLINEABLE newRemote #-} newRemote :: (Remote arch, Shape sh, Elt e) => sh -> (sh -> e) -> LLVM arch (Array sh e) newRemote sh f = useRemote $! fromFunction sh f -- | Upload an immutable array from the host to the remote device. This is -- a synchronous operation in that it will not return until the transfer -- completes, but the individual array payloads will be uploaded concurrently if -- possible. -- {-# INLINEABLE useRemote #-} useRemote :: (Remote arch, Arrays arrs) => arrs -> LLVM arch arrs useRemote arrs = do AsyncR _ a <- async (useRemoteAsync arrs) get a -- | Upload an immutable array from the host to the remote device, -- asynchronously. This will upload each array payload in a separate execution -- stream, thereby making us of multiple memcpy engines (where available). -- {-# INLINEABLE useRemoteAsync #-} useRemoteAsync :: (Remote arch, Arrays arrs) => arrs -> StreamR arch -> LLVM arch (AsyncR arch arrs) useRemoteAsync arrs stream = do arrs' <- runArrays arrs $ \arr@Array{} -> let n = size (shape arr) in runArray arr $ \m ad -> do s <- fork useRemoteR (n*m) (Just s) ad after stream =<< checkpoint s join s return ad -- event <- checkpoint stream -- TLM: Assuming that adding events to a stream counts as things to wait for return $! AsyncR event arrs' -- | Uploading existing arrays from the host to the remote device. This is -- synchronous with respect to the calling thread, but the individual array -- payloads may themselves be transferred concurrently. -- {-# INLINEABLE copyToRemote #-} copyToRemote :: (Remote arch, Arrays a) => a -> LLVM arch a copyToRemote arrs = do AsyncR _ a <- async (copyToRemoteAsync arrs) get a -- | Upload an existing array to the remote device, asynchronously. -- {-# INLINEABLE copyToRemoteAsync #-} copyToRemoteAsync :: (Remote arch, Arrays a) => a -> StreamR arch -> LLVM arch (AsyncR arch a) copyToRemoteAsync arrs stream = do arrs' <- runArrays arrs $ \arr@Array{} -> let n = size (shape arr) in runArray arr $ \m ad -> do s <- fork copyToRemoteR 0 (n*m) (Just s) ad after stream =<< checkpoint s join s return ad -- event <- checkpoint stream return $! AsyncR event arrs' -- | Copy an array from the remote device to the host. This is synchronous with -- respect to the calling thread, but the individual array payloads may -- themselves be transferred concurrently. -- {-# INLINEABLE copyToHost #-} copyToHost :: (Remote arch, Arrays a) => a -> LLVM arch a copyToHost arrs = do AsyncR _ a <- async (copyToHostAsync arrs) get a -- | Copy an array from the remote device to the host, asynchronously -- {-# INLINEABLE copyToHostAsync #-} copyToHostAsync :: (Remote arch, Arrays a) => a -> StreamR arch -> LLVM arch (AsyncR arch a) copyToHostAsync arrs stream = do arrs' <- runArrays arrs $ \arr@Array{} -> let n = size (shape arr) in runArray arr $ \m ad -> do s <- fork copyToHostR 0 (n*m) (Just s) ad after stream =<< checkpoint s join s return ad -- event <- checkpoint stream return $! AsyncR event arrs' -- | Copy arrays between two remote instances of the same type. This may be more -- efficient than copying to the host and then to the second remote instance -- (e.g. DMA between CUDA devices). -- {-# INLINEABLE copyToPeer #-} copyToPeer :: (Remote arch, Arrays a) => arch -> a -> LLVM arch a copyToPeer peer arrs = do AsyncR _ a <- async (copyToPeerAsync peer arrs) get a -- | As 'copyToPeer', asynchronously. -- {-# INLINEABLE copyToPeerAsync #-} copyToPeerAsync :: (Remote arch, Arrays a) => arch -> a -> StreamR arch -> LLVM arch (AsyncR arch a) copyToPeerAsync peer arrs stream = do arrs' <- runArrays arrs $ \arr@Array{} -> let n = size (shape arr) in runArray arr $ \m ad -> do s <- fork copyToPeerR 0 (n*m) peer (Just s) ad after stream =<< checkpoint s join s return ad -- event <- checkpoint stream return $! AsyncR event arrs' -- Helpers for traversing the Arrays data structure -- ------------------------------------------------ -- |Read a single element from an array at the given row-major index. -- {-# INLINEABLE runIndexArray #-} runIndexArray :: forall m sh e. Monad m => (forall e a. (ArrayElt e, ArrayPtrs e ~ Ptr a, Storable a, Typeable a, Typeable e) => ArrayData e -> Int -> m a) -> Array sh e -> Int -> m e runIndexArray worker (Array _ adata) ix = toElt `liftM` indexR arrayElt adata ix where indexR :: ArrayEltR a -> ArrayData a -> Int -> m a indexR ArrayEltRunit _ _ = return () indexR ArrayEltRint ad i = worker ad i indexR ArrayEltRint8 ad i = worker ad i indexR ArrayEltRint16 ad i = worker ad i indexR ArrayEltRint32 ad i = worker ad i indexR ArrayEltRint64 ad i = worker ad i indexR ArrayEltRword ad i = worker ad i indexR ArrayEltRword8 ad i = worker ad i indexR ArrayEltRword16 ad i = worker ad i indexR ArrayEltRword32 ad i = worker ad i indexR ArrayEltRword64 ad i = worker ad i indexR ArrayEltRhalf ad i = worker ad i indexR ArrayEltRfloat ad i = worker ad i indexR ArrayEltRdouble ad i = worker ad i indexR ArrayEltRchar ad i = worker ad i indexR ArrayEltRcshort ad i = CShort `liftM` worker ad i indexR ArrayEltRcushort ad i = CUShort `liftM` worker ad i indexR ArrayEltRcint ad i = CInt `liftM` worker ad i indexR ArrayEltRcuint ad i = CUInt `liftM` worker ad i indexR ArrayEltRclong ad i = CLong `liftM` worker ad i indexR ArrayEltRculong ad i = CULong `liftM` worker ad i indexR ArrayEltRcllong ad i = CLLong `liftM` worker ad i indexR ArrayEltRcullong ad i = CULLong `liftM` worker ad i indexR ArrayEltRcchar ad i = CChar `liftM` worker ad i indexR ArrayEltRcschar ad i = CSChar `liftM` worker ad i indexR ArrayEltRcuchar ad i = CUChar `liftM` worker ad i indexR ArrayEltRcfloat ad i = CFloat `liftM` worker ad i indexR ArrayEltRcdouble ad i = CDouble `liftM` worker ad i indexR ArrayEltRbool ad i = toBool `liftM` worker ad i where toBool 0 = False toBool _ = True -- indexR (ArrayEltRpair aeR1 aeR2) (AD_Pair ad1 ad2) i = (,) <$> indexR aeR1 ad1 i <*> indexR aeR2 ad2 i -- NOTE [indexArray for SIMD vector types] -- -- These data types are stored contiguously in memory. Especially for -- backends where data is stored in a separate memory space (i.e. GPUs) we -- should copy all of those values in a single transaction, before unpacking -- them to the appropriate Haskell value (and/or, also store these values -- contiguously in Haskell land). @speed -- indexR (ArrayEltRvec2 r) (AD_V2 ad) i = let i' = 2*i in V2 <$> indexR r ad i' <*> indexR r ad (i'+1) indexR (ArrayEltRvec3 r) (AD_V3 ad) i = let i' = 3*i in V3 <$> indexR r ad i' <*> indexR r ad (i'+1) <*> indexR r ad (i'+2) indexR (ArrayEltRvec4 r) (AD_V4 ad) i = let i' = 4*i in V4 <$> indexR r ad i' <*> indexR r ad (i'+1) <*> indexR r ad (i'+2) <*> indexR r ad (i'+3) indexR (ArrayEltRvec8 r) (AD_V8 ad) i = let i' = 8*i in V8 <$> indexR r ad i' <*> indexR r ad (i'+1) <*> indexR r ad (i'+2) <*> indexR r ad (i'+3) <*> indexR r ad (i'+4) <*> indexR r ad (i'+5) <*> indexR r ad (i'+6) <*> indexR r ad (i'+7) indexR (ArrayEltRvec16 r) (AD_V16 ad) i = let i' = 16*i in V16 <$> indexR r ad i' <*> indexR r ad (i'+1) <*> indexR r ad (i'+2) <*> indexR r ad (i'+3) <*> indexR r ad (i'+4) <*> indexR r ad (i'+5) <*> indexR r ad (i'+6) <*> indexR r ad (i'+7) <*> indexR r ad (i'+8) <*> indexR r ad (i'+9) <*> indexR r ad (i'+10) <*> indexR r ad (i'+11) <*> indexR r ad (i'+12) <*> indexR r ad (i'+13) <*> indexR r ad (i'+14) <*> indexR r ad (i'+15) -- | Generalised function to traverse the Arrays structure -- {-# INLINE runArrays #-} runArrays :: forall m arrs. (Monad m, Arrays arrs) => arrs -> (forall sh e. Array sh e -> m (Array sh e)) -> m arrs runArrays arrs worker = toArr `liftM` runR (arrays arrs) (fromArr arrs) where runR :: ArraysR a -> a -> m a runR ArraysRunit () = return () runR ArraysRarray arr = worker arr runR (ArraysRpair aeR2 aeR1) (arrs2, arrs1) = liftM2 (,) (runR aeR2 arrs2) (runR aeR1 arrs1) -- | Generalised function to traverse the ArrayData structure with one -- additional argument -- {-# INLINE runArray #-} runArray :: forall m sh e. Monad m => Array sh e -> (forall e' p. (ArrayElt e', ArrayPtrs e' ~ Ptr p, Storable p, Typeable p, Typeable e') => Int -> ArrayData e' -> m (ArrayData e')) -> m (Array sh e) runArray (Array sh adata) worker = Array sh `liftM` runR arrayElt adata 1 where runR :: ArrayEltR e' -> ArrayData e' -> Int -> m (ArrayData e') runR ArrayEltRint ad n = worker n ad runR ArrayEltRint8 ad n = worker n ad runR ArrayEltRint16 ad n = worker n ad runR ArrayEltRint32 ad n = worker n ad runR ArrayEltRint64 ad n = worker n ad runR ArrayEltRword ad n = worker n ad runR ArrayEltRword8 ad n = worker n ad runR ArrayEltRword16 ad n = worker n ad runR ArrayEltRword32 ad n = worker n ad runR ArrayEltRword64 ad n = worker n ad runR ArrayEltRhalf ad n = worker n ad runR ArrayEltRfloat ad n = worker n ad runR ArrayEltRdouble ad n = worker n ad runR ArrayEltRbool ad n = worker n ad runR ArrayEltRchar ad n = worker n ad runR ArrayEltRcshort ad n = worker n ad runR ArrayEltRcushort ad n = worker n ad runR ArrayEltRcint ad n = worker n ad runR ArrayEltRcuint ad n = worker n ad runR ArrayEltRclong ad n = worker n ad runR ArrayEltRculong ad n = worker n ad runR ArrayEltRcllong ad n = worker n ad runR ArrayEltRcullong ad n = worker n ad runR ArrayEltRcfloat ad n = worker n ad runR ArrayEltRcdouble ad n = worker n ad runR ArrayEltRcchar ad n = worker n ad runR ArrayEltRcschar ad n = worker n ad runR ArrayEltRcuchar ad n = worker n ad runR (ArrayEltRvec2 ae) (AD_V2 ad) n = liftM AD_V2 (runR ae ad (n*2)) runR (ArrayEltRvec3 ae) (AD_V3 ad) n = liftM AD_V3 (runR ae ad (n*3)) runR (ArrayEltRvec4 ae) (AD_V4 ad) n = liftM AD_V4 (runR ae ad (n*4)) runR (ArrayEltRvec8 ae) (AD_V8 ad) n = liftM AD_V8 (runR ae ad (n*8)) runR (ArrayEltRvec16 ae) (AD_V16 ad) n = liftM AD_V16 (runR ae ad (n*16)) runR (ArrayEltRpair aeR2 aeR1) (AD_Pair ad2 ad1) n = liftM2 AD_Pair (runR aeR2 ad2 n) (runR aeR1 ad1 n) runR ArrayEltRunit AD_Unit _ = return AD_Unit