{-# LANGUAGE CPP #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} -- | -- Module : Data.Array.Accelerate.CUDA.Array.Data -- Copyright : [2008..2010] Manuel M T Chakravarty, Gabriele Keller, Sean Lee -- [2009..2012] Manuel M T Chakravarty, Gabriele Keller, Trevor L. McDonell -- License : BSD3 -- -- Maintainer : Trevor L. McDonell -- Stability : experimental -- Portability : non-portable (GHC extensions) -- module Data.Array.Accelerate.CUDA.Array.Data ( -- * Array operations and representations mallocArray, indexArray, copyArray, useArray, useArrayAsync, peekArray, peekArrayAsync, pokeArray, pokeArrayAsync, marshalArrayData, marshalTextureData, marshalDevicePtrs, devicePtrsOfArrayData, advancePtrsOfArrayData, -- * Garbage collection cleanupArrayData ) where -- libraries import Prelude hiding ( fst, snd ) import Data.Label.PureM import Control.Applicative import Control.Monad.Trans -- friends import Data.Array.Accelerate.Array.Data import Data.Array.Accelerate.Array.Sugar ( Array(..), Shape, Elt, fromElt, toElt ) import Data.Array.Accelerate.Array.Representation ( size, index ) import Data.Array.Accelerate.CUDA.State import Data.Array.Accelerate.CUDA.Array.Table import qualified Data.Array.Accelerate.CUDA.Array.Prim as Prim import qualified Foreign.CUDA.Driver as CUDA import qualified Foreign.CUDA.Driver.Stream as CUDA import qualified Foreign.CUDA.Driver.Texture as CUDA #include "accelerate.h" -- Array Operations -- ---------------- -- Garbage collection -- cleanupArrayData :: CIO () cleanupArrayData = liftIO . reclaim =<< gets memoryTable -- Array tuple extraction -- fst :: ArrayData (a,b) -> ArrayData a fst = fstArrayData snd :: ArrayData (a,b) -> ArrayData b snd = sndArrayData -- Extract the state information to pass along to the primitive data handlers -- run :: (Context -> MemoryTable -> IO a) -> CIO a run f = do ctx <- gets activeContext mt <- gets memoryTable liftIO $ f ctx mt -- CPP hackery to generate the cases where we dispatch to the worker function handling -- elementary types. -- #define mkPrimDispatch(dispatcher,worker) \ ; dispatcher ArrayEltRint = worker \ ; dispatcher ArrayEltRint8 = worker \ ; dispatcher ArrayEltRint16 = worker \ ; dispatcher ArrayEltRint32 = worker \ ; dispatcher ArrayEltRint64 = worker \ ; dispatcher ArrayEltRword = worker \ ; dispatcher ArrayEltRword8 = worker \ ; dispatcher ArrayEltRword16 = worker \ ; dispatcher ArrayEltRword32 = worker \ ; dispatcher ArrayEltRword64 = worker \ ; dispatcher ArrayEltRfloat = worker \ ; dispatcher ArrayEltRdouble = worker \ ; dispatcher ArrayEltRbool = worker \ ; dispatcher ArrayEltRchar = worker \ ; dispatcher _ = error "mkPrimDispatcher: not primitive" -- |Allocate a new device array to accompany the given host-side array. -- mallocArray :: (Shape dim, Elt e) => Array dim e -> CIO () mallocArray (Array sh adata) = run doMalloc where doMalloc ctx mt = mallocR arrayElt adata where mallocR :: ArrayEltR e -> ArrayData e -> IO () mallocR ArrayEltRunit _ = return () mallocR (ArrayEltRpair aeR1 aeR2) ad = mallocR aeR1 (fst ad) >> mallocR aeR2 (snd ad) mallocR aer ad = mallocPrim aer ctx mt ad (size sh) -- mallocPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> IO () mkPrimDispatch(mallocPrim,Prim.mallocArray) -- |Upload an existing array to the device -- useArray :: (Shape dim, Elt e) => Array dim e -> CIO () useArray (Array sh adata) = run doUse where doUse ctx mt = useR arrayElt adata where useR :: ArrayEltR e -> ArrayData e -> IO () useR ArrayEltRunit _ = return () useR (ArrayEltRpair aeR1 aeR2) ad = useR aeR1 (fst ad) >> useR aeR2 (snd ad) useR aer ad = usePrim aer ctx mt ad (size sh) -- usePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> IO () mkPrimDispatch(usePrim,Prim.useArray) useArrayAsync :: (Shape dim, Elt e) => Array dim e -> Maybe CUDA.Stream -> CIO () useArrayAsync (Array sh adata) ms = run doUse where doUse ctx mt = useR arrayElt adata where useR :: ArrayEltR e -> ArrayData e -> IO () useR ArrayEltRunit _ = return () useR (ArrayEltRpair aeR1 aeR2) ad = useR aeR1 (fst ad) >> useR aeR2 (snd ad) useR aer ad = usePrim aer ctx mt ad (size sh) ms -- usePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> Maybe CUDA.Stream -> IO () mkPrimDispatch(usePrim,Prim.useArrayAsync) -- |Read a single element from an array at the given row-major index. This is a -- synchronous operation. -- indexArray :: (Shape dim, Elt e) => Array dim e -> dim -> CIO e indexArray (Array sh adata) ix = run doIndex where i = index sh (fromElt ix) doIndex ctx mt = toElt <$> indexR arrayElt adata where indexR :: ArrayEltR e -> ArrayData e -> IO e indexR ArrayEltRunit _ = return () indexR (ArrayEltRpair aeR1 aeR2) ad = (,) <$> indexR aeR1 (fst ad) <*> indexR aeR2 (snd ad) -- indexR ArrayEltRbool ad = toBool <$> Prim.indexArray ctx mt ad i where toBool 0 = False toBool _ = True -- indexR ArrayEltRint ad = Prim.indexArray ctx mt ad i indexR ArrayEltRint8 ad = Prim.indexArray ctx mt ad i indexR ArrayEltRint16 ad = Prim.indexArray ctx mt ad i indexR ArrayEltRint32 ad = Prim.indexArray ctx mt ad i indexR ArrayEltRint64 ad = Prim.indexArray ctx mt ad i indexR ArrayEltRword ad = Prim.indexArray ctx mt ad i indexR ArrayEltRword8 ad = Prim.indexArray ctx mt ad i indexR ArrayEltRword16 ad = Prim.indexArray ctx mt ad i indexR ArrayEltRword32 ad = Prim.indexArray ctx mt ad i indexR ArrayEltRword64 ad = Prim.indexArray ctx mt ad i indexR ArrayEltRfloat ad = Prim.indexArray ctx mt ad i indexR ArrayEltRdouble ad = Prim.indexArray ctx mt ad i indexR ArrayEltRchar ad = Prim.indexArray ctx mt ad i -- |Copy data between two device arrays. The operation is asynchronous with -- respect to the host, but will never overlap kernel execution. -- copyArray :: (Shape dim, Elt e) => Array dim e -> Array dim e -> CIO () copyArray (Array sh1 adata1) (Array sh2 adata2) = BOUNDS_CHECK(check) "copyArray" "shape mismatch" (sh1 == sh2) $ run doCopy where doCopy ctx mt = copyR arrayElt adata1 adata2 where copyR :: ArrayEltR e -> ArrayData e -> ArrayData e -> IO () copyR ArrayEltRunit _ _ = return () copyR (ArrayEltRpair aeR1 aeR2) ad1 ad2 = copyR aeR1 (fst ad1) (fst ad2) >> copyR aeR2 (snd ad1) (snd ad2) copyR aer ad1 ad2 = copyPrim aer ctx mt ad1 ad2 (size sh1) -- copyPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> ArrayData e -> Int -> IO () mkPrimDispatch(copyPrim,Prim.copyArray) -- Copy data from the device into the associated Accelerate host-side array -- peekArray :: (Shape dim, Elt e) => Array dim e -> CIO () peekArray (Array sh adata) = run doPeek where doPeek ctx mt = peekR arrayElt adata where peekR :: ArrayEltR e -> ArrayData e -> IO () peekR ArrayEltRunit _ = return () peekR (ArrayEltRpair aeR1 aeR2) ad = peekR aeR1 (fst ad) >> peekR aeR2 (snd ad) peekR aer ad = peekPrim aer ctx mt ad (size sh) -- peekPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> IO () mkPrimDispatch(peekPrim,Prim.peekArray) peekArrayAsync :: (Shape dim, Elt e) => Array dim e -> Maybe CUDA.Stream -> CIO () peekArrayAsync (Array sh adata) ms = run doPeek where doPeek ctx mt = peekR arrayElt adata where peekR :: ArrayEltR e -> ArrayData e -> IO () peekR ArrayEltRunit _ = return () peekR (ArrayEltRpair aeR1 aeR2) ad = peekR aeR1 (fst ad) >> peekR aeR2 (snd ad) peekR aer ad = peekPrim aer ctx mt ad (size sh) ms -- peekPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> Maybe CUDA.Stream -> IO () mkPrimDispatch(peekPrim,Prim.peekArrayAsync) -- Copy data from an Accelerate array into the associated device array -- pokeArray :: (Shape dim, Elt e) => Array dim e -> CIO () pokeArray (Array sh adata) = run doPoke where doPoke ctx mt = pokeR arrayElt adata where pokeR :: ArrayEltR e -> ArrayData e -> IO () pokeR ArrayEltRunit _ = return () pokeR (ArrayEltRpair aeR1 aeR2) ad = pokeR aeR1 (fst ad) >> pokeR aeR2 (snd ad) pokeR aer ad = pokePrim aer ctx mt ad (size sh) -- pokePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> IO () mkPrimDispatch(pokePrim,Prim.pokeArray) pokeArrayAsync :: (Shape dim, Elt e) => Array dim e -> Maybe CUDA.Stream -> CIO () pokeArrayAsync (Array sh adata) ms = run doPoke where doPoke ctx mt = pokeR arrayElt adata where pokeR :: ArrayEltR e -> ArrayData e -> IO () pokeR ArrayEltRunit _ = return () pokeR (ArrayEltRpair aeR1 aeR2) ad = pokeR aeR1 (fst ad) >> pokeR aeR2 (snd ad) pokeR aer ad = pokePrim aer ctx mt ad (size sh) ms -- pokePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> Maybe CUDA.Stream -> IO () mkPrimDispatch(pokePrim,Prim.pokeArrayAsync) -- |Wrap device pointers into arguments that can be passed to a kernel -- invocation -- marshalDevicePtrs :: ArrayElt e => ArrayData e -> Prim.DevicePtrs e -> [CUDA.FunParam] marshalDevicePtrs adata = marshalR arrayElt adata where marshalR :: ArrayEltR e -> ArrayData e -> Prim.DevicePtrs e -> [CUDA.FunParam] marshalR ArrayEltRunit _ _ = [] marshalR (ArrayEltRpair aeR1 aeR2) ad (p1,p2) = marshalR aeR1 (fst ad) p1 ++ marshalR aeR2 (snd ad) p2 marshalR aer ad ptr = [marshalPrim aer ad ptr] -- marshalPrim :: ArrayEltR e -> ArrayData e -> Prim.DevicePtrs e -> CUDA.FunParam mkPrimDispatch(marshalPrim,Prim.marshalDevicePtrs) -- |Wrap the device pointers corresponding to a host-side array into arguments -- that can be passed to a kernel upon invocation. -- marshalArrayData :: ArrayElt e => ArrayData e -> CIO [CUDA.FunParam] marshalArrayData adata = run doMarshal where doMarshal ctx mt = marshalR arrayElt adata where marshalR :: ArrayEltR e -> ArrayData e -> IO [CUDA.FunParam] marshalR ArrayEltRunit _ = return [] marshalR (ArrayEltRpair aeR1 aeR2) ad = (++) <$> marshalR aeR1 (fst ad) <*> marshalR aeR2 (snd ad) marshalR aer ad = return <$> marshalPrim aer ctx mt ad -- marshalPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> IO CUDA.FunParam mkPrimDispatch(marshalPrim,Prim.marshalArrayData) -- |Bind the device memory arrays to the given texture reference(s), setting -- appropriate type. The arrays are bound, and the list of textures thereby -- consumed, in projection index order --- i.e. right-to-left -- marshalTextureData :: ArrayElt e => ArrayData e -> Int -> [CUDA.Texture] -> CIO () marshalTextureData adata n texs = run doMarshal where doMarshal ctx mt = marshalR arrayElt adata texs >> return () where marshalR :: ArrayEltR e -> ArrayData e -> [CUDA.Texture] -> IO Int marshalR ArrayEltRunit _ _ = return 0 marshalR (ArrayEltRpair aeR1 aeR2) ad t = do r <- marshalR aeR2 (snd ad) t l <- marshalR aeR1 (fst ad) (drop r t) return (l + r) marshalR aer ad t = do marshalPrim aer ctx mt ad n (head t) return 1 -- marshalPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> CUDA.Texture -> IO () mkPrimDispatch(marshalPrim,Prim.marshalTextureData) -- |Raw device pointers associated with a host-side array -- devicePtrsOfArrayData :: ArrayElt e => ArrayData e -> CIO (Prim.DevicePtrs e) devicePtrsOfArrayData adata = run ptrs where ptrs ctx mt = ptrsR arrayElt adata where ptrsR :: ArrayEltR e -> ArrayData e -> IO (Prim.DevicePtrs e) ptrsR ArrayEltRunit _ = return () ptrsR (ArrayEltRpair aeR1 aeR2) ad = (,) <$> ptrsR aeR1 (fst ad) <*> ptrsR aeR2 (snd ad) ptrsR aer ad = ptrsPrim aer ctx mt ad -- ptrsPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> IO (Prim.DevicePtrs e) mkPrimDispatch(ptrsPrim,Prim.devicePtrsOfArrayData) -- |Advance a set of device pointers by the given number of elements each -- advancePtrsOfArrayData :: ArrayElt e => ArrayData e -> Int -> Prim.DevicePtrs e -> Prim.DevicePtrs e advancePtrsOfArrayData adata n = advanceR arrayElt adata where advanceR :: ArrayEltR e -> ArrayData e -> Prim.DevicePtrs e -> Prim.DevicePtrs e advanceR ArrayEltRunit _ _ = () advanceR (ArrayEltRpair aeR1 aeR2) ad (p1,p2) = (advanceR aeR1 (fst ad) p1 ,advanceR aeR2 (snd ad) p2) advanceR aer ad ptr = advancePrim aer ad ptr -- advancePrim :: ArrayEltR e -> ArrayData e -> Prim.DevicePtrs e -> Prim.DevicePtrs e mkPrimDispatch(advancePrim,Prim.advancePtrsOfArrayData n)