{-# LANGUAGE BangPatterns        #-}
{-# LANGUAGE CPP                 #-}
{-# LANGUAGE GADTs               #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
{-# LANGUAGE TupleSections       #-}
{-# LANGUAGE TypeFamilies        #-}
{-# LANGUAGE TypeOperators       #-}
-- |
-- Module      : Data.Array.Accelerate.CUDA.Array.Data
-- Copyright   : [2008..2014] Manuel M T Chakravarty, Gabriele Keller
--               [2009..2014] Trevor L. McDonell
--               [2013..2014] Robert Clifton-Everest
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.CUDA.Array.Data (

  -- * Array operations and representations
  mallocArray, freeArray,
  indexArray,
  useArray,  useArrayAsync, useArraySlice,
  useDevicePtrs,
  copyArray, copyArrayAsync, copyArrayPeer, copyArrayPeerAsync,
  peekArray, peekArrayAsync,
  pokeArray, pokeArrayAsync,
  marshalArrayData, marshalTextureData, marshalDevicePtrs,
  withDevicePtrs, advancePtrsOfArrayData,
  devicePtrsFromList, devicePtrsToWordPtrs,

  -- * Garbage collection
  cleanupArrayData,

) where

-- libraries
import Control.Applicative
import Control.Monad.Reader                             ( asks )
import Control.Monad.State                              ( gets )
import Control.Monad.Trans                              ( liftIO )
import Control.Monad.Trans.Cont
import Foreign.C.Types
import Foreign.Ptr
import Prelude                                          hiding ( fst, snd )
import qualified Prelude                                as P


-- friends
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Array.Data
import Data.Array.Accelerate.Array.Sugar                ( Array(..), Shape, Elt, fromElt, toElt, EltRepr )
import Data.Array.Accelerate.Array.Representation       ( size, SliceIndex )
import Data.Array.Accelerate.CUDA.State
import Data.Array.Accelerate.CUDA.Array.Slice           ( TransferDesc, transferDesc )
import Data.Array.Accelerate.CUDA.Array.Remote
import Data.Array.Accelerate.CUDA.Persistent            ( KernelTable )
import Data.Array.Accelerate.CUDA.Execute.Event         ( EventTable )
import Data.Array.Accelerate.CUDA.Execute.Stream        ( Reservoir )
import qualified Data.Array.Accelerate.CUDA.Array.Prim  as Prim
import qualified Foreign.CUDA.Driver                    as CUDA
import qualified Foreign.CUDA.Driver.Stream             as CUDA
import qualified Foreign.CUDA.Driver.Texture            as CUDA


-- Array Operations
-- ----------------

-- Garbage collection
--
cleanupArrayData :: CIO ()
cleanupArrayData = liftIO . reclaim =<< gets memoryTable

-- Array tuple extraction
--
fst :: ArrayData (a,b) -> ArrayData a
fst = fstArrayData

snd :: ArrayData (a,b) -> ArrayData b
snd = sndArrayData

-- Extract the state information to pass along to the primitive data handlers
--
{-# INLINE run #-}
run :: (Context -> MemoryTable -> IO a) -> CIO a
run f = do
  ctx    <- asks activeContext
  mt     <- gets memoryTable
  liftIO $! f ctx mt

run' :: (Context -> MemoryTable -> KernelTable -> Reservoir -> EventTable -> IO a) -> CIO a
run' f = do
  ctx    <- asks activeContext
  mt     <- gets memoryTable
  kt     <- gets kernelTable
  rsv    <- gets streamReservoir
  etbl   <- gets eventTable
  liftIO $! f ctx mt kt rsv etbl

-- CPP hackery to generate the cases where we dispatch to the worker function handling
-- elementary types.
--
#define mkPrimDispatch(dispatcher,worker)                                       \
; dispatcher ArrayEltRint     = worker                                          \
; dispatcher ArrayEltRint8    = worker                                          \
; dispatcher ArrayEltRint16   = worker                                          \
; dispatcher ArrayEltRint32   = worker                                          \
; dispatcher ArrayEltRint64   = worker                                          \
; dispatcher ArrayEltRword    = worker                                          \
; dispatcher ArrayEltRword8   = worker                                          \
; dispatcher ArrayEltRword16  = worker                                          \
; dispatcher ArrayEltRword32  = worker                                          \
; dispatcher ArrayEltRword64  = worker                                          \
; dispatcher ArrayEltRfloat   = worker                                          \
; dispatcher ArrayEltRdouble  = worker                                          \
; dispatcher ArrayEltRbool    = worker                                          \
; dispatcher ArrayEltRchar    = worker                                          \
; dispatcher ArrayEltRcshort  = worker                                          \
; dispatcher ArrayEltRcushort = worker                                          \
; dispatcher ArrayEltRcint    = worker                                          \
; dispatcher ArrayEltRcuint   = worker                                          \
; dispatcher ArrayEltRclong   = worker                                          \
; dispatcher ArrayEltRculong  = worker                                          \
; dispatcher ArrayEltRcllong  = worker                                          \
; dispatcher ArrayEltRcullong = worker                                          \
; dispatcher ArrayEltRcfloat  = worker                                          \
; dispatcher ArrayEltRcdouble = worker                                          \
; dispatcher ArrayEltRcchar   = worker                                          \
; dispatcher ArrayEltRcschar  = worker                                          \
; dispatcher ArrayEltRcuchar  = worker                                          \
; dispatcher _                = error "mkPrimDispatcher: not primitive"


-- |Allocate a new device array to accompany the given host-side array.
--
mallocArray :: (Shape dim, Elt e) => Array dim e -> CIO ()
mallocArray (Array !sh !adata) = run doMalloc
  where
    !n                = size sh
    doMalloc !ctx !mt = mallocR arrayElt adata
      where
        mallocR :: ArrayEltR e -> ArrayData e -> IO ()
        mallocR ArrayEltRunit             _  = return ()
        mallocR (ArrayEltRpair aeR1 aeR2) ad = mallocR aeR1 (fst ad) >> mallocR aeR2 (snd ad)
        mallocR aer                       ad = mallocPrim aer ctx mt ad n
        --
        mallocPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> IO ()
        mkPrimDispatch(mallocPrim,Prim.mallocArray)


-- |Deallocate the device array accompanying the given host-side array.
--
-- Note that this does not take into account whether or not the data is still
-- required by the current (or future) computation.
--
freeArray :: Array dim e -> CIO ()
freeArray (Array !_ !adata) = run doFree
  where
    doFree !ctx !mt = freeR arrayElt adata
      where
        freeR :: ArrayEltR e -> ArrayData e -> IO ()
        freeR ArrayEltRunit             _  = return ()
        freeR (ArrayEltRpair aeR1 aeR2) ad = freeR aeR1 (fst ad) >> freeR aeR2 (snd ad)
        freeR aer                       ad = freePrim aer ctx mt ad
        --
        freePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> IO ()
        mkPrimDispatch(freePrim,free)


-- |Upload an existing array to the device
--
useArray :: (Shape dim, Elt e) => Array dim e -> CIO ()
useArray (Array !sh !adata) = run doUse
  where
    !n             = size sh
    doUse !ctx !mt = useR arrayElt adata
      where
        useR :: ArrayEltR e -> ArrayData e -> IO ()
        useR ArrayEltRunit             _  = return ()
        useR (ArrayEltRpair aeR1 aeR2) ad = useR aeR1 (fst ad) >> useR aeR2 (snd ad)
        useR aer                       ad = usePrim aer ctx mt ad n
        --
        usePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> IO ()
        mkPrimDispatch(usePrim,Prim.useArray)

useArrayAsync :: (Shape dim, Elt e) => Array dim e -> Maybe CUDA.Stream -> CIO ()
useArrayAsync (Array !sh !adata) ms = run doUse
  where
    !n             = size sh
    doUse !ctx !mt = useR arrayElt adata
      where
        useR :: ArrayEltR e -> ArrayData e -> IO ()
        useR ArrayEltRunit             _  = return ()
        useR (ArrayEltRpair aeR1 aeR2) ad = useR aeR1 (fst ad) >> useR aeR2 (snd ad)
        useR aer                       ad = usePrim aer ctx mt ad n ms
        --
        usePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> Maybe CUDA.Stream -> IO ()
        mkPrimDispatch(usePrim,Prim.useArrayAsync)


-- | Upload a slice of an existing array (eg. row of a matrix) to the
-- device. TODO : Bounds checking, generalize slices to more than just
-- inner dimension?
useArraySlice :: (Elt slix, Shape sl, Shape dim, Elt e)
              => SliceIndex (EltRepr slix) (EltRepr sl) co (EltRepr dim)
              -> slix        -- Slice
              -> Array dim e -- Host array
              -> Array sl e  -- Device array
              -> CIO ()
useArraySlice slix sl (Array dim !adata_host) (Array _ !adata_dev) = run doUse
  where
    tdesc = transferDesc slix (fromElt sl) dim
    doUse !ctx !mt = useR arrayElt adata_host adata_dev
      where
        useR :: ArrayEltR e -> ArrayData e -> ArrayData e -> IO ()
        useR ArrayEltRunit             _   _   = return ()
        useR (ArrayEltRpair aeR1 aeR2) adh add = useR aeR1 (fst adh) (fst add) >> useR aeR2 (snd adh) (snd add)
        useR aer                       adh add = usePrim aer ctx mt adh add tdesc -- usePrim aer ctx mt adh add tdesc
        usePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> ArrayData e -> TransferDesc -> IO ()
        mkPrimDispatch(usePrim,Prim.useArraySlice)

useDevicePtrs :: (Shape sh, Elt e) => EltRepr sh -> Prim.DevicePtrs (EltRepr e) -> CIO (Array sh e)
useDevicePtrs sh ptrs = run doUse
  where
    !n             = size sh
    doUse !ctx !mt = Array sh <$> useD arrayElt ptrs
      where
        useD :: ArrayEltR e -> Prim.DevicePtrs e -> IO (ArrayData e)
        useD ArrayEltRunit             _  = return AD_Unit
        useD (ArrayEltRpair aeR1 aeR2) ps = AD_Pair <$> useD aeR1 (P.fst ps)
                                                    <*> useD aeR2 (P.snd ps)
        useD aer                       ps = usePrim aer ctx mt ps n
        --
        usePrim :: ArrayEltR e -> Context -> MemoryTable -> Prim.DevicePtrs e -> Int -> IO (ArrayData e)
        mkPrimDispatch(usePrim,Prim.useDevicePtrs)

devicePtrsFromList :: ArrayEltR e -> [WordPtr] -> Prim.DevicePtrs e
devicePtrsFromList aeR = P.fst . (devP aeR)
  where
    devP :: ArrayEltR e -> [WordPtr] -> (Prim.DevicePtrs e, [WordPtr])
    devP ArrayEltRunit             ps     = ((),ps)
    devP (ArrayEltRpair aeR1 aeR2) ps     = let
        (d1, ps')  = devP aeR1 ps
        (d2, ps'') = devP aeR2 ps'
      in ((d1,d2), ps'')
    devP aer                       (p:ps) = (devPrim aer p, ps)
    devP _                         []     = error "devicePtrsFromList: incorrect number of device pointers for element type"
    --
    devPrim :: ArrayEltR e -> WordPtr -> Prim.DevicePtrs e
    mkPrimDispatch(devPrim,CUDA.wordPtrToDevPtr)


-- |Read a single element from an array at the given row-major index. This is a
-- synchronous operation.
--
indexArray :: (Shape dim, Elt e) => Array dim e -> Int -> CIO e
indexArray (Array _ !adata) i = run doIndex
  where
    doIndex !ctx !mt = toElt <$> indexR arrayElt adata
      where
        indexR :: ArrayEltR e -> ArrayData e -> IO e
        indexR ArrayEltRunit             _  = return ()
        indexR (ArrayEltRpair aeR1 aeR2) ad = (,) <$> indexR aeR1 (fst ad)
                                                  <*> indexR aeR2 (snd ad)
        --
        indexR ArrayEltRint              ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRint8             ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRint16            ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRint32            ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRint64            ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRword             ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRword8            ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRword16           ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRword32           ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRword64           ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRfloat            ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRdouble           ad = Prim.indexArray ctx mt ad i
        indexR ArrayEltRchar             ad = Prim.indexArray ctx mt ad i
        --
        indexR ArrayEltRcshort           ad = CShort  <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcushort          ad = CUShort <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcint             ad = CInt    <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcuint            ad = CUInt   <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRclong            ad = CLong   <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRculong           ad = CULong  <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcllong           ad = CLLong  <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcullong          ad = CULLong <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcchar            ad = CChar   <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcschar           ad = CSChar  <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcuchar           ad = CUChar  <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcfloat           ad = CFloat  <$> Prim.indexArray ctx mt ad i
        indexR ArrayEltRcdouble          ad = CDouble <$> Prim.indexArray ctx mt ad i
        --
        indexR ArrayEltRbool             ad = toBool  <$> Prim.indexArray ctx mt ad i
          where toBool 0 = False
                toBool _ = True


-- |Copy data between two device arrays. The operation is asynchronous with
-- respect to the host, but will never overlap kernel execution.
--
copyArray :: (Shape dim, Elt e) => Array dim e -> Array dim e -> CIO ()
copyArray (Array !sh1 !adata1) (Array !sh2 !adata2)
  = $boundsCheck "copyArray" "shape mismatch" (sh1 == sh2)
  $ run doCopy
  where
    !n              = size sh1
    doCopy !ctx !mt = copyR arrayElt adata1 adata2
      where
        copyR :: ArrayEltR e -> ArrayData e -> ArrayData e -> IO ()
        copyR ArrayEltRunit             _   _   = return ()
        copyR (ArrayEltRpair aeR1 aeR2) ad1 ad2 = copyR aeR1 (fst ad1) (fst ad2) >>
                                                  copyR aeR2 (snd ad1) (snd ad2)
        copyR aer                       ad1 ad2 = copyPrim aer ctx mt ad1 ad2 n
        --
        copyPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> ArrayData e -> Int -> IO ()
        mkPrimDispatch(copyPrim,Prim.copyArray)

copyArrayAsync :: (Shape dim, Elt e) => Array dim e -> Array dim e -> Maybe CUDA.Stream -> CIO ()
copyArrayAsync (Array !sh1 !adata1) (Array !sh2 !adata2) ms
  = $boundsCheck "copyArrayAsync" "shape mismatch" (sh1 == sh2)
  $ run doCopy
  where
    !n              = size sh1
    doCopy !ctx !mt = copyR arrayElt adata1 adata2
      where
        copyR :: ArrayEltR e -> ArrayData e -> ArrayData e -> IO ()
        copyR ArrayEltRunit             _   _   = return ()
        copyR (ArrayEltRpair aeR1 aeR2) ad1 ad2 = copyR aeR1 (fst ad1) (fst ad2) >>
                                                  copyR aeR2 (snd ad1) (snd ad2)
        copyR aer                       ad1 ad2 = copyPrim aer ctx mt ad1 ad2 n ms
        --
        copyPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> ArrayData e -> Int -> Maybe CUDA.Stream -> IO ()
        mkPrimDispatch(copyPrim,Prim.copyArrayAsync)


-- |Copy data between two device arrays which reside in different contexts. This
-- might entail copying between devices.
--
copyArrayPeer :: (Shape dim, Elt e) => Array dim e -> Context -> Array dim e -> Context -> CIO ()
copyArrayPeer (Array !sh1 !adata1) !ctxSrc (Array !sh2 !adata2) !ctxDst
  = $boundsCheck "copyArrayPeer" "shape mismatch" (sh1 == sh2)
  $ run doCopy
  where
    !n           = size sh1
    doCopy _ !mt = copyR arrayElt adata1 adata2
      where
        copyR :: ArrayEltR e -> ArrayData e -> ArrayData e -> IO ()
        copyR ArrayEltRunit             _   _   = return ()
        copyR (ArrayEltRpair aeR1 aeR2) ad1 ad2 = copyR aeR1 (fst ad1) (fst ad2) >>
                                                  copyR aeR2 (snd ad1) (snd ad2)
        copyR aer                       ad1 ad2 = copyPrim aer mt ad1 ctxSrc ad2 ctxDst n
        --
        copyPrim :: ArrayEltR e -> MemoryTable -> ArrayData e -> Context -> ArrayData e -> Context -> Int -> IO ()
        mkPrimDispatch(copyPrim,Prim.copyArrayPeer)

copyArrayPeerAsync :: (Shape dim, Elt e) => Array dim e -> Context -> Array dim e -> Context -> Maybe CUDA.Stream -> CIO ()
copyArrayPeerAsync (Array !sh1 !adata1) !ctxSrc (Array !sh2 !adata2) !ctxDst !ms
  = $boundsCheck "copyArrayPeerAsync" "shape mismatch" (sh1 == sh2)
  $ run doCopy
  where
    !n           = size sh1
    doCopy _ !mt = copyR arrayElt adata1 adata2
      where
        copyR :: ArrayEltR e -> ArrayData e -> ArrayData e -> IO ()
        copyR ArrayEltRunit             _   _   = return ()
        copyR (ArrayEltRpair aeR1 aeR2) ad1 ad2 = copyR aeR1 (fst ad1) (fst ad2) >>
                                                  copyR aeR2 (snd ad1) (snd ad2)
        copyR aer                       ad1 ad2 = copyPrim aer mt ad1 ctxSrc ad2 ctxDst n ms
        --
        copyPrim :: ArrayEltR e -> MemoryTable -> ArrayData e -> Context -> ArrayData e -> Context -> Int -> Maybe CUDA.Stream -> IO ()
        mkPrimDispatch(copyPrim,Prim.copyArrayPeerAsync)


-- Copy data from the device into the associated Accelerate host-side array
--
peekArray :: (Shape dim, Elt e) => Array dim e -> CIO ()
peekArray (Array !sh !adata) = run doPeek
  where
    !n              = size sh
    doPeek !ctx !mt = peekR arrayElt adata
      where
        peekR :: ArrayEltR e -> ArrayData e -> IO ()
        peekR ArrayEltRunit             _  = return ()
        peekR (ArrayEltRpair aeR1 aeR2) ad = peekR aeR1 (fst ad) >> peekR aeR2 (snd ad)
        peekR aer                       ad = peekPrim aer ctx mt ad n
        --
        peekPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> IO ()
        mkPrimDispatch(peekPrim,Prim.peekArray)

peekArrayAsync :: (Shape dim, Elt e) => Array dim e -> Maybe CUDA.Stream -> CIO ()
peekArrayAsync (Array !sh !adata) !ms = run doPeek
  where
    !n              = size sh
    doPeek !ctx !mt = peekR arrayElt adata
      where
        peekR :: ArrayEltR e -> ArrayData e -> IO ()
        peekR ArrayEltRunit             _  = return ()
        peekR (ArrayEltRpair aeR1 aeR2) ad = peekR aeR1 (fst ad) >> peekR aeR2 (snd ad)
        peekR aer                       ad = peekPrim aer ctx mt ad n ms
        --
        peekPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> Maybe CUDA.Stream -> IO ()
        mkPrimDispatch(peekPrim,Prim.peekArrayAsync)


-- Copy data from an Accelerate array into the associated device array
--
pokeArray :: (Shape dim, Elt e) => Array dim e -> CIO ()
pokeArray (Array !sh !adata) = run doPoke
  where
    !n              = size sh
    doPoke !ctx !mt = pokeR arrayElt adata
      where
        pokeR :: ArrayEltR e -> ArrayData e -> IO ()
        pokeR ArrayEltRunit             _  = return ()
        pokeR (ArrayEltRpair aeR1 aeR2) ad = pokeR aeR1 (fst ad) >> pokeR aeR2 (snd ad)
        pokeR aer                       ad = pokePrim aer ctx mt ad n
        --
        pokePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> IO ()
        mkPrimDispatch(pokePrim,Prim.pokeArray)

pokeArrayAsync :: (Shape dim, Elt e) => Array dim e -> Maybe CUDA.Stream -> CIO ()
pokeArrayAsync (Array !sh !adata) !ms = run doPoke
  where
    !n              = size sh
    doPoke !ctx !mt = pokeR arrayElt adata
      where
        pokeR :: ArrayEltR e -> ArrayData e -> IO ()
        pokeR ArrayEltRunit             _  = return ()
        pokeR (ArrayEltRpair aeR1 aeR2) ad = pokeR aeR1 (fst ad) >> pokeR aeR2 (snd ad)
        pokeR aer                       ad = pokePrim aer ctx mt ad n ms
        --
        pokePrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> Maybe CUDA.Stream -> IO ()
        mkPrimDispatch(pokePrim,Prim.pokeArrayAsync)


-- |Convert the device pointers into a list of word pointers
--
devicePtrsToWordPtrs :: ArrayElt e => ArrayData e -> Prim.DevicePtrs e -> [WordPtr]
devicePtrsToWordPtrs !adata = wordR arrayElt adata
  where
    wordR :: ArrayEltR e -> ArrayData e -> Prim.DevicePtrs e -> [WordPtr]
    wordR ArrayEltRunit             _  _       = []
    wordR (ArrayEltRpair aeR1 aeR2) ad (p1,p2) = wordR aeR1 (fst ad) p1 ++
                                                 wordR aeR2 (snd ad) p2
    wordR aer                       ad ptr     = [wordPrim aer ad ptr]
    --
    wordPrim :: ArrayEltR e -> ArrayData e -> Prim.DevicePtrs e -> WordPtr
    mkPrimDispatch(wordPrim,const CUDA.devPtrToWordPtr)

-- |Wrap device pointers into arguments that can be passed to a kernel
-- invocation
--
marshalDevicePtrs :: ArrayElt e => ArrayData e -> Prim.DevicePtrs e -> [CUDA.FunParam]
marshalDevicePtrs !adata ptrs = map (CUDA.VArg . CUDA.wordPtrToDevPtr) $ devicePtrsToWordPtrs adata ptrs

-- |Wrap the device pointers corresponding to a host-side array into arguments
-- that can be passed to a kernel upon invocation and call the
-- supplied continuation. Any asynchronous CUDA functions called by the
-- continuation must be in the same stream as given by the 2nd argument.
--
marshalArrayData :: ArrayElt e => ArrayData e -> Maybe CUDA.Stream -> ([CUDA.FunParam] -> CIO b) -> CIO b
marshalArrayData !adata ms f = run' doMarshal
  where
    doMarshal !ctx !mt !kt !rsv !etbl = runContT (marshalR arrayElt adata) (evalCUDAState ctx mt kt rsv etbl . f)
      where
        marshalR :: ArrayEltR e -> ArrayData e -> ContT b IO [CUDA.FunParam]
        marshalR ArrayEltRunit             _  = return []
        marshalR (ArrayEltRpair aeR1 aeR2) ad = (++) <$> marshalR aeR1 (fst ad)
                                                     <*> marshalR aeR2 (snd ad)
        marshalR aer                       ad = do
          param <- ContT $ marshalPrim aer ctx mt ad ms
          return [param]
        --
        marshalPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Maybe CUDA.Stream -> (CUDA.FunParam -> IO b) -> IO b
        mkPrimDispatch(marshalPrim,Prim.marshalArrayData)


-- |Bind the device memory arrays to the given texture reference(s), setting
-- appropriate type, and call the supplied continuation. The arrays are bound,
-- and the list of textures thereby consumed, in projection index order
-- --- i.e. right-to-left
-- The textures should only be considered bound during the execution of the
-- continuation. Any asynchronous CUDA functions called by the continuation
-- must be in the same stream as given by the 4th argument.
--
marshalTextureData :: ArrayElt e => ArrayData e -> Int -> [CUDA.Texture] -> Maybe CUDA.Stream -> ([CUDA.Texture] -> CIO b) -> CIO b
marshalTextureData !adata !n !texs ms f = run' doMarshal
  where
    doMarshal !ctx !mt !kt !rsv !etbl = runContT (marshalR arrayElt adata texs) (evalCUDAState ctx mt kt rsv etbl . \(_,ts) -> f ts)
      where
        marshalR :: ArrayEltR e -> ArrayData e -> [CUDA.Texture] -> ContT b IO (Int, [CUDA.Texture])
        marshalR ArrayEltRunit             _  _ = return (0, [])
        marshalR (ArrayEltRpair aeR1 aeR2) ad t
          = do (r, rs) <- marshalR aeR2 (snd ad) t
               (l, ls) <- marshalR aeR1 (fst ad) (drop r t)
               return (l + r, ls ++ rs)
        marshalR aer                       ad t
          = do param <- ContT $ marshalPrim aer ctx mt ad n (head t) ms
               return (1, [param])
        --
        marshalPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Int -> CUDA.Texture -> Maybe CUDA.Stream -> (CUDA.Texture -> IO b) -> IO b
        mkPrimDispatch(marshalPrim,Prim.marshalTextureData)


-- | Perform an operation using the device pointers of the given array. Any
-- asynchronous CUDA functions called by the supplied continuation must be in
-- the same stream as given by the second argument.
--
withDevicePtrs :: ArrayElt e => ArrayData e -> Maybe CUDA.Stream -> (Prim.DevicePtrs e -> CIO b) -> CIO b
withDevicePtrs !adata ms f = run' ptrs
  where
    ptrs !ctx !mt !kt !rsv !etbl = runContT (ptrsR arrayElt adata) (evalCUDAState ctx mt kt rsv etbl . f)
      where
        ptrsR :: ArrayEltR e -> ArrayData e -> ContT b IO (Prim.DevicePtrs e)
        ptrsR ArrayEltRunit             _  = return ()
        ptrsR (ArrayEltRpair aeR1 aeR2) ad = (,) <$> ptrsR aeR1 (fst ad)
                                                 <*> ptrsR aeR2 (snd ad)
        ptrsR aer                       ad = ContT $ ptrsPrim aer ctx mt ad ms
        --
        ptrsPrim :: ArrayEltR e -> Context -> MemoryTable -> ArrayData e -> Maybe CUDA.Stream -> (Prim.DevicePtrs e -> IO b) -> IO b
        mkPrimDispatch(ptrsPrim,Prim.withDevicePtrs)


-- |Advance a set of device pointers by the given number of elements each
--
advancePtrsOfArrayData :: ArrayElt e => ArrayData e -> Int -> Prim.DevicePtrs e -> Prim.DevicePtrs e
advancePtrsOfArrayData !adata !n = advanceR arrayElt adata
  where
    advanceR :: ArrayEltR e -> ArrayData e -> Prim.DevicePtrs e -> Prim.DevicePtrs e
    advanceR ArrayEltRunit             _  _       = ()
    advanceR (ArrayEltRpair aeR1 aeR2) ad (p1,p2) = (advanceR aeR1 (fst ad) p1
                                                    ,advanceR aeR2 (snd ad) p2)
    advanceR aer                       ad ptr     = advancePrim aer ad ptr
    --
    advancePrim :: ArrayEltR e -> ArrayData e -> Prim.DevicePtrs e -> Prim.DevicePtrs e
    mkPrimDispatch(advancePrim,Prim.advancePtrsOfArrayData n)