/var/folders/0y/t7qfzn8j1y7dlfhjkvry2_hh0000gn/T/doc-package-9OFx3dL/dist/build/Foreign/CUDA/Driver/Marshal.hs

-- GENERATED by C->Haskell Compiler, version 0.20.1 The shapeless maps, 31 Oct 2014 (Haskell)
-- Edit the ORIGNAL .chs file instead!


{-# LINE 1 "./Foreign/CUDA/Driver/Marshal.chs" #-}
{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE CPP                      #-}
{-# LANGUAGE EmptyDataDecls           #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# OPTIONS_HADDOCK prune #-}
--------------------------------------------------------------------------------
-- |
-- Module    : Foreign.CUDA.Driver.Marshal
-- Copyright : [2009..2014] Trevor L. McDonell
-- License   : BSD
--
-- Memory management for low-level driver interface
--
--------------------------------------------------------------------------------

module Foreign.CUDA.Driver.Marshal (

  -- * Host Allocation
  AllocFlag(..),
  mallocHostArray, freeHost, registerArray, unregisterArray,

  -- * Device Allocation
  mallocArray, allocaArray, free,

  -- * Unified Memory Allocation
  AttachFlag(..),
  mallocManagedArray,

  -- * Marshalling
  peekArray, peekArrayAsync, peekArray2D, peekArray2DAsync, peekListArray,
  pokeArray, pokeArrayAsync, pokeArray2D, pokeArray2DAsync, pokeListArray,
  copyArray, copyArrayAsync, copyArray2D, copyArray2DAsync,
  copyArrayPeer, copyArrayPeerAsync,

  -- * Combined Allocation and Marshalling
  newListArray,  newListArrayLen,
  withListArray, withListArrayLen,

  -- * Utility
  memset, memsetAsync,
  getDevicePtr, getBasePtr, getMemInfo,

  -- Internal
  useDeviceHandle, peekDeviceHandle

) where



{-# LINE 49 "./Foreign/CUDA/Driver/Marshal.chs" #-}


-- Friends
import Foreign.CUDA.Ptr
import Foreign.CUDA.Driver.Error
import Foreign.CUDA.Driver.Stream                       ( Stream(..), defaultStream )
import Foreign.CUDA.Driver.Context                      ( Context(..) )
import Foreign.CUDA.Internal.C2HS

-- System
import Data.Int
import Data.Maybe
import Unsafe.Coerce
import Control.Applicative
import Control.Exception

import Foreign.C
import Foreign.Ptr
import Foreign.Storable
import qualified Foreign.Marshal                        as F

--------------------------------------------------------------------------------
-- Host Allocation
--------------------------------------------------------------------------------

-- |
-- Options for host allocation
--
data AllocFlag = Portable
               | DeviceMapped
               | WriteCombined
  deriving (Eq,Show)
instance Enum AllocFlag where
  succ Portable = DeviceMapped
  succ DeviceMapped = WriteCombined
  succ WriteCombined = error "AllocFlag.succ: WriteCombined has no successor"

  pred DeviceMapped = Portable
  pred WriteCombined = DeviceMapped
  pred Portable = error "AllocFlag.pred: Portable has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from WriteCombined

  fromEnum Portable = 1
  fromEnum DeviceMapped = 2
  fromEnum WriteCombined = 4

  toEnum 1 = Portable
  toEnum 2 = DeviceMapped
  toEnum 4 = WriteCombined
  toEnum unmatched = error ("AllocFlag.toEnum: Cannot match " ++ show unmatched)

{-# LINE 87 "./Foreign/CUDA/Driver/Marshal.chs" #-}


-- |
-- Allocate a section of linear memory on the host which is page-locked and
-- directly accessible from the device. The storage is sufficient to hold the
-- given number of elements of a storable type.
--
-- Note that since the amount of pageable memory is thusly reduced, overall
-- system performance may suffer. This is best used sparingly to allocate
-- staging areas for data exchange.
--
{-# INLINEABLE mallocHostArray #-}
mallocHostArray :: Storable a => [AllocFlag] -> Int -> IO (HostPtr a)
mallocHostArray !flags = doMalloc undefined
  where
    doMalloc :: Storable a' => a' -> Int -> IO (HostPtr a')
    doMalloc x !n = resultIfOk =<< cuMemHostAlloc (n * sizeOf x) flags

{-# INLINE cuMemHostAlloc #-}
cuMemHostAlloc :: (Int) -> ([AllocFlag]) -> IO ((Status), (HostPtr a))
cuMemHostAlloc a2 a3 =
  alloca' $ \a1' -> 
  let {a2' = fromIntegral a2} in 
  let {a3' = combineBitMasks a3} in 
  cuMemHostAlloc'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  peekHP  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 109 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    alloca' !f = F.alloca $ \ !p -> poke p nullPtr >> f (castPtr p)
    peekHP !p  = HostPtr . castPtr <$> peek p


-- |
-- Free a section of page-locked host memory
--
{-# INLINEABLE freeHost #-}
freeHost :: HostPtr a -> IO ()
freeHost !p = nothingIfOk =<< cuMemFreeHost p

{-# INLINE cuMemFreeHost #-}
cuMemFreeHost :: (HostPtr a) -> IO ((Status))
cuMemFreeHost a1 =
  let {a1' = useHP a1} in 
  cuMemFreeHost'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 124 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    useHP = castPtr . useHostPtr


-- |
-- Page-locks the specified array (on the host) and maps it for the device(s) as
-- specified by the given allocation flags. Subsequently, the memory is accessed
-- directly by the device so can be read and written with much higher bandwidth
-- than pageable memory that has not been registered. The memory range is added
-- to the same tracking mechanism as 'mallocHostArray' to automatically
-- accelerate calls to functions such as 'pokeArray'.
--
-- Note that page-locking excessive amounts of memory may degrade system
-- performance, since it reduces the amount of pageable memory available. This
-- is best used sparingly to allocate staging areas for data exchange.
--
-- This function is not yet implemented on Mac OS X. Requires cuda-4.0.
--
{-# INLINEABLE registerArray #-}
registerArray :: Storable a => [AllocFlag] -> Int -> Ptr a -> IO (HostPtr a)
registerArray !flags !n = go undefined
  where
    go :: Storable b => b -> Ptr b -> IO (HostPtr b)
    go x !p = do
      status <- cuMemHostRegister p (n * sizeOf x) flags
      resultIfOk (status,HostPtr p)

{-# INLINE cuMemHostRegister #-}
cuMemHostRegister :: (Ptr a) -> (Int) -> ([AllocFlag]) -> IO ((Status))
cuMemHostRegister a1 a2 a3 =
  let {a1' = castPtr a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = combineBitMasks a3} in 
  cuMemHostRegister'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 160 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Unmaps the memory from the given pointer, and makes it pageable again.
--
-- This function is not yet implemented on Mac OS X. Requires cuda-4.0.
--
{-# INLINEABLE unregisterArray #-}
unregisterArray :: HostPtr a -> IO (Ptr a)
unregisterArray (HostPtr !p) = do
  status <- cuMemHostUnregister p
  resultIfOk (status,p)

{-# INLINE cuMemHostUnregister #-}
cuMemHostUnregister :: (Ptr a) -> IO ((Status))
cuMemHostUnregister a1 =
  let {a1' = castPtr a1} in 
  cuMemHostUnregister'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 180 "./Foreign/CUDA/Driver/Marshal.chs" #-}



--------------------------------------------------------------------------------
-- Device Allocation
--------------------------------------------------------------------------------

-- |
-- Allocate a section of linear memory on the device, and return a reference to
-- it. The memory is sufficient to hold the given number of elements of storable
-- type. It is suitably aligned for any type, and is not cleared.
--
{-# INLINEABLE mallocArray #-}
mallocArray :: Storable a => Int -> IO (DevicePtr a)
mallocArray = doMalloc undefined
  where
    doMalloc :: Storable a' => a' -> Int -> IO (DevicePtr a')
    doMalloc x !n = resultIfOk =<< cuMemAlloc (n * sizeOf x)

{-# INLINE cuMemAlloc #-}
cuMemAlloc :: (Int) -> IO ((Status), (DevicePtr a))
cuMemAlloc a2 =
  alloca' $ \a1' -> 
  let {a2' = fromIntegral a2} in 
  cuMemAlloc'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  peekDeviceHandle  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 202 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    alloca' !f = F.alloca $ \ !p -> poke p nullPtr >> f (castPtr p)


-- |
-- Execute a computation on the device, passing a pointer to a temporarily
-- allocated block of memory sufficient to hold the given number of elements of
-- storable type. The memory is freed when the computation terminates (normally
-- or via an exception), so the pointer must not be used after this.
--
-- Note that kernel launches can be asynchronous, so you may want to add a
-- synchronisation point using 'sync' as part of the computation.
--
{-# INLINEABLE allocaArray #-}
allocaArray :: Storable a => Int -> (DevicePtr a -> IO b) -> IO b
allocaArray !n = bracket (mallocArray n) free


-- |
-- Release a section of device memory
--
{-# INLINEABLE free #-}
free :: DevicePtr a -> IO ()
free !dp = nothingIfOk =<< cuMemFree dp

{-# INLINE cuMemFree #-}
cuMemFree :: (DevicePtr a) -> IO ((Status))
cuMemFree a1 =
  let {a1' = useDeviceHandle a1} in 
  cuMemFree'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 230 "./Foreign/CUDA/Driver/Marshal.chs" #-}



--------------------------------------------------------------------------------
-- Unified memory allocations
--------------------------------------------------------------------------------

-- |
-- Options for unified memory allocations
--
data AttachFlag = CuMemAttachGlobal
                | CuMemAttachHost
                | CuMemAttachSingle
  deriving (Eq,Show)
instance Enum AttachFlag where
  succ CuMemAttachGlobal = CuMemAttachHost
  succ CuMemAttachHost = CuMemAttachSingle
  succ CuMemAttachSingle = error "AttachFlag.succ: CuMemAttachSingle has no successor"

  pred CuMemAttachHost = CuMemAttachGlobal
  pred CuMemAttachSingle = CuMemAttachHost
  pred CuMemAttachGlobal = error "AttachFlag.pred: CuMemAttachGlobal has no predecessor"

  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []

  enumFrom from = enumFromTo from CuMemAttachSingle

  fromEnum CuMemAttachGlobal = 1
  fromEnum CuMemAttachHost = 2
  fromEnum CuMemAttachSingle = 4

  toEnum 1 = CuMemAttachGlobal
  toEnum 2 = CuMemAttachHost
  toEnum 4 = CuMemAttachSingle
  toEnum unmatched = error ("AttachFlag.toEnum: Cannot match " ++ show unmatched)

{-# LINE 246 "./Foreign/CUDA/Driver/Marshal.chs" #-}


-- |
-- Allocates memory that will be automatically managed by the Unified Memory
-- system
--
{-# INLINEABLE mallocManagedArray #-}
mallocManagedArray :: Storable a => [AttachFlag] -> Int -> IO (DevicePtr a)
mallocManagedArray !flags = doMalloc undefined
  where
    doMalloc :: Storable a' => a' -> Int -> IO (DevicePtr a')
    doMalloc x !n = resultIfOk =<< cuMemAllocManaged (n * sizeOf x) flags

{-# INLINE cuMemAllocManaged #-}
cuMemAllocManaged :: (Int) -> ([AttachFlag]) -> IO ((Status), (DevicePtr a))
cuMemAllocManaged a2 a3 =
  alloca' $ \a1' -> 
  let {a2' = fromIntegral a2} in 
  let {a3' = combineBitMasks a3} in 
  cuMemAllocManaged'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  peekDeviceHandle  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 266 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    alloca' !f = F.alloca $ \ !p -> poke p nullPtr >> f (castPtr p)


--------------------------------------------------------------------------------
-- Marshalling
--------------------------------------------------------------------------------

-- Device -> Host
-- --------------

-- |
-- Copy a number of elements from the device to host memory. This is a
-- synchronous operation
--
{-# INLINEABLE peekArray #-}
peekArray :: Storable a => Int -> DevicePtr a -> Ptr a -> IO ()
peekArray !n !dptr !hptr = doPeek undefined dptr
  where
    doPeek :: Storable a' => a' -> DevicePtr a' -> IO ()
    doPeek x _ = nothingIfOk =<< cuMemcpyDtoH hptr dptr (n * sizeOf x)

{-# INLINE cuMemcpyDtoH #-}
cuMemcpyDtoH :: (Ptr a) -> (DevicePtr a) -> (Int) -> IO ((Status))
cuMemcpyDtoH a1 a2 a3 =
  let {a1' = castPtr a1} in 
  let {a2' = useDeviceHandle a2} in 
  let {a3' = fromIntegral a3} in 
  cuMemcpyDtoH'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 294 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Copy memory from the device asynchronously, possibly associated with a
-- particular stream. The destination host memory must be page-locked.
--
{-# INLINEABLE peekArrayAsync #-}
peekArrayAsync :: Storable a => Int -> DevicePtr a -> HostPtr a -> Maybe Stream -> IO ()
peekArrayAsync !n !dptr !hptr !mst = doPeek undefined dptr
  where
    doPeek :: Storable a' => a' -> DevicePtr a' -> IO ()
    doPeek x _ = nothingIfOk =<< cuMemcpyDtoHAsync hptr dptr (n * sizeOf x) (fromMaybe defaultStream mst)

{-# INLINE cuMemcpyDtoHAsync #-}
cuMemcpyDtoHAsync :: (HostPtr a) -> (DevicePtr a) -> (Int) -> (Stream) -> IO ((Status))
cuMemcpyDtoHAsync a1 a2 a3 a4 =
  let {a1' = useHP a1} in 
  let {a2' = useDeviceHandle a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = useStream a4} in 
  cuMemcpyDtoHAsync'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 313 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    useHP = castPtr . useHostPtr


-- |
-- Copy a 2D array from the device to the host.
--
{-# INLINEABLE peekArray2D #-}
peekArray2D
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> DevicePtr a              -- ^ source array
    -> Int                      -- ^ source array width
    -> Int                      -- ^ source x-coordinate
    -> Int                      -- ^ source y-coordinate
    -> Ptr a                    -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Int                      -- ^ destination x-coordinate
    -> Int                      -- ^ destination y-coordinate
    -> IO ()
peekArray2D !w !h !dptr !dw !dx !dy !hptr !hw !hx !hy = doPeek undefined dptr
  where
    doPeek :: Storable a' => a' -> DevicePtr a' -> IO ()
    doPeek x _ =
      let bytes = sizeOf x
          w'    = w  * bytes
          hw'   = hw * bytes
          hx'   = hx * bytes
          dw'   = dw * bytes
          dx'   = dx * bytes
      in
      nothingIfOk =<< cuMemcpy2DDtoH hptr hw' hx' hy dptr dw' dx' dy w' h

{-# INLINE cuMemcpy2DDtoH #-}
cuMemcpy2DDtoH :: (Ptr a) -> (Int) -> (Int) -> (Int) -> (DevicePtr a) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> IO ((Status))
cuMemcpy2DDtoH a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 =
  let {a1' = castPtr a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = useDeviceHandle a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = fromIntegral a7} in 
  let {a8' = fromIntegral a8} in 
  let {a9' = fromIntegral a9} in 
  let {a10' = fromIntegral a10} in 
  cuMemcpy2DDtoH'_ a1' a2' a3' a4' a5' a6' a7' a8' a9' a10' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 361 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Copy a 2D array from the device to the host asynchronously, possibly
-- associated with a particular execution stream. The destination host memory
-- must be page-locked.
--
{-# INLINEABLE peekArray2DAsync #-}
peekArray2DAsync
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> DevicePtr a              -- ^ source array
    -> Int                      -- ^ source array width
    -> Int                      -- ^ source x-coordinate
    -> Int                      -- ^ source y-coordinate
    -> HostPtr a                -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Int                      -- ^ destination x-coordinate
    -> Int                      -- ^ destination y-coordinate
    -> Maybe Stream             -- ^ stream to associate to
    -> IO ()
peekArray2DAsync !w !h !dptr !dw !dx !dy !hptr !hw !hx !hy !mst = doPeek undefined dptr
  where
    doPeek :: Storable a' => a' -> DevicePtr a' -> IO ()
    doPeek x _ =
      let bytes = sizeOf x
          w'    = w  * bytes
          hw'   = hw * bytes
          hx'   = hx * bytes
          dw'   = dw * bytes
          dx'   = dx * bytes
          st    = fromMaybe defaultStream mst
      in
      nothingIfOk =<< cuMemcpy2DDtoHAsync hptr hw' hx' hy dptr dw' dx' dy w' h st

{-# INLINE cuMemcpy2DDtoHAsync #-}
cuMemcpy2DDtoHAsync :: (HostPtr a) -> (Int) -> (Int) -> (Int) -> (DevicePtr a) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Stream) -> IO ((Status))
cuMemcpy2DDtoHAsync a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 =
  let {a1' = useHP a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = useDeviceHandle a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = fromIntegral a7} in 
  let {a8' = fromIntegral a8} in 
  let {a9' = fromIntegral a9} in 
  let {a10' = fromIntegral a10} in 
  let {a11' = useStream a11} in 
  cuMemcpy2DDtoHAsync'_ a1' a2' a3' a4' a5' a6' a7' a8' a9' a10' a11' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 412 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    useHP = castPtr . useHostPtr


-- |
-- Copy a number of elements from the device into a new Haskell list. Note that
-- this requires two memory copies: firstly from the device into a heap
-- allocated array, and from there marshalled into a list.
--
{-# INLINEABLE peekListArray #-}
peekListArray :: Storable a => Int -> DevicePtr a -> IO [a]
peekListArray !n !dptr =
  F.allocaArray n $ \p -> do
    peekArray   n dptr p
    F.peekArray n p


-- Host -> Device
-- --------------

-- |
-- Copy a number of elements onto the device. This is a synchronous operation
--
{-# INLINEABLE pokeArray #-}
pokeArray :: Storable a => Int -> Ptr a -> DevicePtr a -> IO ()
pokeArray !n !hptr !dptr = doPoke undefined dptr
  where
    doPoke :: Storable a' => a' -> DevicePtr a' -> IO ()
    doPoke x _ = nothingIfOk =<< cuMemcpyHtoD dptr hptr (n * sizeOf x)

{-# INLINE cuMemcpyHtoD #-}
cuMemcpyHtoD :: (DevicePtr a) -> (Ptr a) -> (Int) -> IO ((Status))
cuMemcpyHtoD a1 a2 a3 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = castPtr a2} in 
  let {a3' = fromIntegral a3} in 
  cuMemcpyHtoD'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 447 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Copy memory onto the device asynchronously, possibly associated with a
-- particular stream. The source host memory must be page-locked.
--
{-# INLINEABLE pokeArrayAsync #-}
pokeArrayAsync :: Storable a => Int -> HostPtr a -> DevicePtr a -> Maybe Stream -> IO ()
pokeArrayAsync !n !hptr !dptr !mst = dopoke undefined dptr
  where
    dopoke :: Storable a' => a' -> DevicePtr a' -> IO ()
    dopoke x _ = nothingIfOk =<< cuMemcpyHtoDAsync dptr hptr (n * sizeOf x) (fromMaybe defaultStream mst)

{-# INLINE cuMemcpyHtoDAsync #-}
cuMemcpyHtoDAsync :: (DevicePtr a) -> (HostPtr a) -> (Int) -> (Stream) -> IO ((Status))
cuMemcpyHtoDAsync a1 a2 a3 a4 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = useHP a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = useStream a4} in 
  cuMemcpyHtoDAsync'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 466 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    useHP = castPtr . useHostPtr


-- |
-- Copy a 2D array from the host to the device.
--
{-# INLINEABLE pokeArray2D #-}
pokeArray2D
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> Ptr a                    -- ^ source array
    -> Int                      -- ^ source array width
    -> Int                      -- ^ source x-coordinate
    -> Int                      -- ^ source y-coordinate
    -> DevicePtr a              -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Int                      -- ^ destination x-coordinate
    -> Int                      -- ^ destination y-coordinate
    -> IO ()
pokeArray2D !w !h !hptr !hw !hx !hy !dptr !dw !dx !dy = doPoke undefined dptr
  where
    doPoke :: Storable a' => a' -> DevicePtr a' -> IO ()
    doPoke x _ =
      let bytes = sizeOf x
          w'    = w  * bytes
          hw'   = hw * bytes
          hx'   = hx * bytes
          dw'   = dw * bytes
          dx'   = dx * bytes
      in
      nothingIfOk =<< cuMemcpy2DHtoD dptr dw' dx' dy hptr hw' hx' hy w' h

{-# INLINE cuMemcpy2DHtoD #-}
cuMemcpy2DHtoD :: (DevicePtr a) -> (Int) -> (Int) -> (Int) -> (Ptr a) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> IO ((Status))
cuMemcpy2DHtoD a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = castPtr a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = fromIntegral a7} in 
  let {a8' = fromIntegral a8} in 
  let {a9' = fromIntegral a9} in 
  let {a10' = fromIntegral a10} in 
  cuMemcpy2DHtoD'_ a1' a2' a3' a4' a5' a6' a7' a8' a9' a10' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 514 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Copy a 2D array from the host to the device asynchronously, possibly
-- associated with a particular execution stream. The source host memory must be
-- page-locked.
--
{-# INLINEABLE pokeArray2DAsync #-}
pokeArray2DAsync
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> HostPtr a                -- ^ source array
    -> Int                      -- ^ source array width
    -> Int                      -- ^ source x-coordinate
    -> Int                      -- ^ source y-coordinate
    -> DevicePtr a              -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Int                      -- ^ destination x-coordinate
    -> Int                      -- ^ destination y-coordinate
    -> Maybe Stream             -- ^ stream to associate to
    -> IO ()
pokeArray2DAsync !w !h !hptr !hw !hx !hy !dptr !dw !dx !dy !mst = doPoke undefined dptr
  where
    doPoke :: Storable a' => a' -> DevicePtr a' -> IO ()
    doPoke x _ =
      let bytes = sizeOf x
          w'    = w  * bytes
          hw'   = hw * bytes
          hx'   = hx * bytes
          dw'   = dw * bytes
          dx'   = dx * bytes
          st    = fromMaybe defaultStream mst
      in
      nothingIfOk =<< cuMemcpy2DHtoDAsync dptr dw' dx' dy hptr hw' hx' hy w' h st

{-# INLINE cuMemcpy2DHtoDAsync #-}
cuMemcpy2DHtoDAsync :: (DevicePtr a) -> (Int) -> (Int) -> (Int) -> (HostPtr a) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Stream) -> IO ((Status))
cuMemcpy2DHtoDAsync a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = useHP a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = fromIntegral a7} in 
  let {a8' = fromIntegral a8} in 
  let {a9' = fromIntegral a9} in 
  let {a10' = fromIntegral a10} in 
  let {a11' = useStream a11} in 
  cuMemcpy2DHtoDAsync'_ a1' a2' a3' a4' a5' a6' a7' a8' a9' a10' a11' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 565 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    useHP = castPtr . useHostPtr


-- |
-- Write a list of storable elements into a device array. The device array must
-- be sufficiently large to hold the entire list. This requires two marshalling
-- operations.
--
{-# INLINEABLE pokeListArray #-}
pokeListArray :: Storable a => [a] -> DevicePtr a -> IO ()
pokeListArray !xs !dptr = F.withArrayLen xs $ \ !len !p -> pokeArray len p dptr


-- Device -> Device
-- ----------------

-- |
-- Copy the given number of elements from the first device array (source) to the
-- second device (destination). The copied areas may not overlap. This operation
-- is asynchronous with respect to the host, but will never overlap with kernel
-- execution.
--
{-# INLINEABLE copyArray #-}
copyArray :: Storable a => Int -> DevicePtr a -> DevicePtr a -> IO ()
copyArray !n = docopy undefined
  where
    docopy :: Storable a' => a' -> DevicePtr a' -> DevicePtr a' -> IO ()
    docopy x src dst = nothingIfOk =<< cuMemcpyDtoD dst src (n * sizeOf x)

{-# INLINE cuMemcpyDtoD #-}
cuMemcpyDtoD :: (DevicePtr a) -> (DevicePtr a) -> (Int) -> IO ((Status))
cuMemcpyDtoD a1 a2 a3 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = useDeviceHandle a2} in 
  let {a3' = fromIntegral a3} in 
  cuMemcpyDtoD'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 600 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Copy the given number of elements from the first device array (source) to the
-- second device array (destination). The copied areas may not overlap. The
-- operation is asynchronous with respect to the host, and can be asynchronous
-- to other device operations by associating it with a particular stream.
--
{-# INLINEABLE copyArrayAsync #-}
copyArrayAsync :: Storable a => Int -> DevicePtr a -> DevicePtr a -> Maybe Stream -> IO ()
copyArrayAsync !n !src !dst !mst = docopy undefined src
  where
    docopy :: Storable a' => a' -> DevicePtr a' -> IO ()
    docopy x _ = nothingIfOk =<< cuMemcpyDtoDAsync dst src (n * sizeOf x) (fromMaybe defaultStream mst)

{-# INLINE cuMemcpyDtoDAsync #-}
cuMemcpyDtoDAsync :: (DevicePtr a) -> (DevicePtr a) -> (Int) -> (Stream) -> IO ((Status))
cuMemcpyDtoDAsync a1 a2 a3 a4 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = useDeviceHandle a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = useStream a4} in 
  cuMemcpyDtoDAsync'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 621 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Copy a 2D array from the first device array (source) to the second device
-- array (destination). The copied areas must not overlap. This operation is
-- asynchronous with respect to the host, but will never overlap with kernel
-- execution.
--
{-# INLINEABLE copyArray2D #-}
copyArray2D
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> DevicePtr a              -- ^ source array
    -> Int                      -- ^ source array width
    -> Int                      -- ^ source x-coordinate
    -> Int                      -- ^ source y-coordinate
    -> DevicePtr a              -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Int                      -- ^ destination x-coordinate
    -> Int                      -- ^ destination y-coordinate
    -> IO ()
copyArray2D !w !h !src !hw !hx !hy !dst !dw !dx !dy = doCopy undefined dst
  where
    doCopy :: Storable a' => a' -> DevicePtr a' -> IO ()
    doCopy x _ =
      let bytes = sizeOf x
          w'    = w  * bytes
          hw'   = hw * bytes
          hx'   = hx * bytes
          dw'   = dw * bytes
          dx'   = dx * bytes
      in
      nothingIfOk =<< cuMemcpy2DDtoD dst dw' dx' dy src hw' hx' hy w' h

{-# INLINE cuMemcpy2DDtoD #-}
cuMemcpy2DDtoD :: (DevicePtr a) -> (Int) -> (Int) -> (Int) -> (DevicePtr a) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> IO ((Status))
cuMemcpy2DDtoD a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = useDeviceHandle a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = fromIntegral a7} in 
  let {a8' = fromIntegral a8} in 
  let {a9' = fromIntegral a9} in 
  let {a10' = fromIntegral a10} in 
  cuMemcpy2DDtoD'_ a1' a2' a3' a4' a5' a6' a7' a8' a9' a10' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 670 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Copy a 2D array from the first device array (source) to the second device
-- array (destination). The copied areas may not overlap. The operation is
-- asynchronous with respect to the host, and can be asynchronous to other
-- device operations by associating it with a particular execution stream.
--
{-# INLINEABLE copyArray2DAsync #-}
copyArray2DAsync
    :: Storable a
    => Int                      -- ^ width to copy (elements)
    -> Int                      -- ^ height to copy (elements)
    -> DevicePtr a              -- ^ source array
    -> Int                      -- ^ source array width
    -> Int                      -- ^ source x-coordinate
    -> Int                      -- ^ source y-coordinate
    -> DevicePtr a              -- ^ destination array
    -> Int                      -- ^ destination array width
    -> Int                      -- ^ destination x-coordinate
    -> Int                      -- ^ destination y-coordinate
    -> Maybe Stream             -- ^ stream to associate to
    -> IO ()
copyArray2DAsync !w !h !src !hw !hx !hy !dst !dw !dx !dy !mst = doCopy undefined dst
  where
    doCopy :: Storable a' => a' -> DevicePtr a' -> IO ()
    doCopy x _ =
      let bytes = sizeOf x
          w'    = w  * bytes
          hw'   = hw * bytes
          hx'   = hx * bytes
          dw'   = dw * bytes
          dx'   = dx * bytes
          st    = fromMaybe defaultStream mst
      in
      nothingIfOk =<< cuMemcpy2DDtoDAsync dst dw' dx' dy src hw' hx' hy w' h st

{-# INLINE cuMemcpy2DDtoDAsync #-}
cuMemcpy2DDtoDAsync :: (DevicePtr a) -> (Int) -> (Int) -> (Int) -> (DevicePtr a) -> (Int) -> (Int) -> (Int) -> (Int) -> (Int) -> (Stream) -> IO ((Status))
cuMemcpy2DDtoDAsync a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = fromIntegral a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = fromIntegral a4} in 
  let {a5' = useDeviceHandle a5} in 
  let {a6' = fromIntegral a6} in 
  let {a7' = fromIntegral a7} in 
  let {a8' = fromIntegral a8} in 
  let {a9' = fromIntegral a9} in 
  let {a10' = fromIntegral a10} in 
  let {a11' = useStream a11} in 
  cuMemcpy2DDtoDAsync'_ a1' a2' a3' a4' a5' a6' a7' a8' a9' a10' a11' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 722 "./Foreign/CUDA/Driver/Marshal.chs" #-}




-- Context -> Context
-- ------------------

-- |
-- Copies an array from device memory in one context to device memory in another
-- context. Note that this function is asynchronous with respect to the host,
-- but serialised with respect to all pending and future asynchronous work in
-- the source and destination contexts. To avoid this synchronisation, use
-- 'copyArrayPeerAsync' instead.
--
{-# INLINEABLE copyArrayPeer #-}
copyArrayPeer :: Storable a
              => Int                            -- ^ number of array elements
              -> DevicePtr a -> Context         -- ^ source array and context
              -> DevicePtr a -> Context         -- ^ destination array and context
              -> IO ()
copyArrayPeer !n !src !srcCtx !dst !dstCtx = go undefined src dst
  where
    go :: Storable b => b -> DevicePtr b -> DevicePtr b -> IO ()
    go x _ _ = nothingIfOk =<< cuMemcpyPeer dst dstCtx src srcCtx (n * sizeOf x)

{-# INLINE cuMemcpyPeer #-}
cuMemcpyPeer :: (DevicePtr a) -> (Context) -> (DevicePtr a) -> (Context) -> (Int) -> IO ((Status))
cuMemcpyPeer a1 a2 a3 a4 a5 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = useContext a2} in 
  let {a3' = useDeviceHandle a3} in 
  let {a4' = useContext a4} in 
  let {a5' = fromIntegral a5} in 
  cuMemcpyPeer'_ a1' a2' a3' a4' a5' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 757 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Copies from device memory in one context to device memory in another context.
-- Note that this function is asynchronous with respect to the host and all work
-- in other streams and devices.
--
{-# INLINEABLE copyArrayPeerAsync #-}
copyArrayPeerAsync :: Storable a
                   => Int                       -- ^ number of array elements
                   -> DevicePtr a -> Context    -- ^ source array and context
                   -> DevicePtr a -> Context    -- ^ destination array and device context
                   -> Maybe Stream              -- ^ stream to associate with
                   -> IO ()
copyArrayPeerAsync !n !src !srcCtx !dst !dstCtx !st = go undefined src dst
  where
    go :: Storable b => b -> DevicePtr b -> DevicePtr b -> IO ()
    go x _ _ = nothingIfOk =<< cuMemcpyPeerAsync dst dstCtx src srcCtx (n * sizeOf x) stream
    stream   = fromMaybe defaultStream st

{-# INLINE cuMemcpyPeerAsync #-}
cuMemcpyPeerAsync :: (DevicePtr a) -> (Context) -> (DevicePtr a) -> (Context) -> (Int) -> (Stream) -> IO ((Status))
cuMemcpyPeerAsync a1 a2 a3 a4 a5 a6 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = useContext a2} in 
  let {a3' = useDeviceHandle a3} in 
  let {a4' = useContext a4} in 
  let {a5' = fromIntegral a5} in 
  let {a6' = useStream a6} in 
  cuMemcpyPeerAsync'_ a1' a2' a3' a4' a5' a6' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 789 "./Foreign/CUDA/Driver/Marshal.chs" #-}



--------------------------------------------------------------------------------
-- Combined Allocation and Marshalling
--------------------------------------------------------------------------------

-- |
-- Write a list of storable elements into a newly allocated device array,
-- returning the device pointer together with the number of elements that were
-- written. Note that this requires two memory copies: firstly from a Haskell
-- list to a heap allocated array, and from there onto the graphics device. The
-- memory should be 'free'd when no longer required.
--
{-# INLINEABLE newListArrayLen #-}
newListArrayLen :: Storable a => [a] -> IO (DevicePtr a, Int)
newListArrayLen xs =
  F.withArrayLen xs                     $ \len p ->
  bracketOnError (mallocArray len) free $ \d_xs  -> do
    pokeArray len p d_xs
    return (d_xs, len)


-- |
-- Write a list of storable elements into a newly allocated device array. This
-- is 'newListArrayLen' composed with 'fst'.
--
{-# INLINEABLE newListArray #-}
newListArray :: Storable a => [a] -> IO (DevicePtr a)
newListArray xs = fst `fmap` newListArrayLen xs


-- |
-- Temporarily store a list of elements into a newly allocated device array. An
-- IO action is applied to to the array, the result of which is returned.
-- Similar to 'newListArray', this requires copying the data twice.
--
-- As with 'allocaArray', the memory is freed once the action completes, so you
-- should not return the pointer from the action, and be wary of asynchronous
-- kernel execution.
--
{-# INLINEABLE withListArray #-}
withListArray :: Storable a => [a] -> (DevicePtr a -> IO b) -> IO b
withListArray xs = withListArrayLen xs . const


-- |
-- A variant of 'withListArray' which also supplies the number of elements in
-- the array to the applied function
--
{-# INLINEABLE withListArrayLen #-}
withListArrayLen :: Storable a => [a] -> (Int -> DevicePtr a -> IO b) -> IO b
withListArrayLen xs f =
  bracket (newListArrayLen xs) (free . fst) (uncurry . flip $ f)
--
-- XXX: Will this attempt to double-free the device array on error (together
-- with newListArrayLen)?
--


--------------------------------------------------------------------------------
-- Utility
--------------------------------------------------------------------------------

-- |
-- Set a number of data elements to the specified value, which may be either 8-,
-- 16-, or 32-bits wide.
--
{-# INLINEABLE memset #-}
memset :: Storable a => DevicePtr a -> Int -> a -> IO ()
memset !dptr !n !val = case sizeOf val of
    1 -> nothingIfOk =<< cuMemsetD8  dptr val n
    2 -> nothingIfOk =<< cuMemsetD16 dptr val n
    4 -> nothingIfOk =<< cuMemsetD32 dptr val n
    _ -> cudaError "can only memset 8-, 16-, and 32-bit values"

--
-- We use unsafe coerce below to reinterpret the bits of the value to memset as,
-- into the integer type required by the setting functions.
--
{-# INLINE cuMemsetD8 #-}
cuMemsetD8 :: (DevicePtr a) -> (a) -> (Int) -> IO ((Status))
cuMemsetD8 a1 a2 a3 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = unsafeCoerce a2} in 
  let {a3' = fromIntegral a3} in 
  cuMemsetD8'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 873 "./Foreign/CUDA/Driver/Marshal.chs" #-}


{-# INLINE cuMemsetD16 #-}
cuMemsetD16 :: (DevicePtr a) -> (a) -> (Int) -> IO ((Status))
cuMemsetD16 a1 a2 a3 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = unsafeCoerce a2} in 
  let {a3' = fromIntegral a3} in 
  cuMemsetD16'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 879 "./Foreign/CUDA/Driver/Marshal.chs" #-}


{-# INLINE cuMemsetD32 #-}
cuMemsetD32 :: (DevicePtr a) -> (a) -> (Int) -> IO ((Status))
cuMemsetD32 a1 a2 a3 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = unsafeCoerce a2} in 
  let {a3' = fromIntegral a3} in 
  cuMemsetD32'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 885 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Set the number of data elements to the specified value, which may be either
-- 8-, 16-, or 32-bits wide. The operation is asynchronous and may optionally be
-- associated with a stream. Requires cuda-3.2.
--
{-# INLINEABLE memsetAsync #-}
memsetAsync :: Storable a => DevicePtr a -> Int -> a -> Maybe Stream -> IO ()
memsetAsync !dptr !n !val !mst = case sizeOf val of
    1 -> nothingIfOk =<< cuMemsetD8Async  dptr val n stream
    2 -> nothingIfOk =<< cuMemsetD16Async dptr val n stream
    4 -> nothingIfOk =<< cuMemsetD32Async dptr val n stream
    _ -> cudaError "can only memset 8-, 16-, and 32-bit values"
    where
      stream = fromMaybe defaultStream mst

{-# INLINE cuMemsetD8Async #-}
cuMemsetD8Async :: (DevicePtr a) -> (a) -> (Int) -> (Stream) -> IO ((Status))
cuMemsetD8Async a1 a2 a3 a4 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = unsafeCoerce a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = useStream a4} in 
  cuMemsetD8Async'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 911 "./Foreign/CUDA/Driver/Marshal.chs" #-}


{-# INLINE cuMemsetD16Async #-}
cuMemsetD16Async :: (DevicePtr a) -> (a) -> (Int) -> (Stream) -> IO ((Status))
cuMemsetD16Async a1 a2 a3 a4 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = unsafeCoerce a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = useStream a4} in 
  cuMemsetD16Async'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 918 "./Foreign/CUDA/Driver/Marshal.chs" #-}


{-# INLINE cuMemsetD32Async #-}
cuMemsetD32Async :: (DevicePtr a) -> (a) -> (Int) -> (Stream) -> IO ((Status))
cuMemsetD32Async a1 a2 a3 a4 =
  let {a1' = useDeviceHandle a1} in 
  let {a2' = unsafeCoerce a2} in 
  let {a3' = fromIntegral a3} in 
  let {a4' = useStream a4} in 
  cuMemsetD32Async'_ a1' a2' a3' a4' >>= \res ->
  let {res' = cToEnum res} in
  return (res')

{-# LINE 926 "./Foreign/CUDA/Driver/Marshal.chs" #-}



-- |
-- Return the device pointer associated with a mapped, pinned host buffer, which
-- was allocated with the 'DeviceMapped' option by 'mallocHostArray'.
--
-- Currently, no options are supported and this must be empty.
--
{-# INLINEABLE getDevicePtr #-}
getDevicePtr :: [AllocFlag] -> HostPtr a -> IO (DevicePtr a)
getDevicePtr !flags !hp = resultIfOk =<< cuMemHostGetDevicePointer hp flags

{-# INLINE cuMemHostGetDevicePointer #-}
cuMemHostGetDevicePointer :: (HostPtr a) -> ([AllocFlag]) -> IO ((Status), (DevicePtr a))
cuMemHostGetDevicePointer a2 a3 =
  alloca' $ \a1' -> 
  let {a2' = useHP a2} in 
  let {a3' = combineBitMasks a3} in 
  cuMemHostGetDevicePointer'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  peekDeviceHandle  a1'>>= \a1'' -> 
  return (res', a1'')

{-# LINE 943 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    alloca'  = F.alloca
    useHP    = castPtr . useHostPtr

-- |
-- Return the base address and allocation size of the given device pointer
--
{-# INLINEABLE getBasePtr #-}
getBasePtr :: DevicePtr a -> IO (DevicePtr a, Int64)
getBasePtr !dptr = do
  (status,base,size) <- cuMemGetAddressRange dptr
  resultIfOk (status, (base,size))

{-# INLINE cuMemGetAddressRange #-}
cuMemGetAddressRange :: (DevicePtr a) -> IO ((Status), (DevicePtr a), (Int64))
cuMemGetAddressRange a3 =
  alloca' $ \a1' -> 
  alloca' $ \a2' -> 
  let {a3' = useDeviceHandle a3} in 
  cuMemGetAddressRange'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  peekDeviceHandle  a1'>>= \a1'' -> 
  peekIntConv  a2'>>= \a2'' -> 
  return (res', a1'', a2'')

{-# LINE 961 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    alloca' :: Storable a => (Ptr a -> IO b) -> IO b
    alloca' = F.alloca

-- |
-- Return the amount of free and total memory respectively available to the
-- current context (bytes)
--
{-# INLINEABLE getMemInfo #-}
getMemInfo :: IO (Int64, Int64)
getMemInfo = do
  (!status,!f,!t) <- cuMemGetInfo
  resultIfOk (status,(f,t))

{-# INLINE cuMemGetInfo #-}
cuMemGetInfo :: IO ((Status), (Int64), (Int64))
cuMemGetInfo =
  alloca' $ \a1' -> 
  alloca' $ \a2' -> 
  cuMemGetInfo'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  peekIntConv  a1'>>= \a1'' -> 
  peekIntConv  a2'>>= \a2'' -> 
  return (res', a1'', a2'')

{-# LINE 979 "./Foreign/CUDA/Driver/Marshal.chs" #-}

  where
    alloca' = F.alloca


--------------------------------------------------------------------------------
-- Internal
--------------------------------------------------------------------------------

type DeviceHandle = (CULLong)
{-# LINE 988 "./Foreign/CUDA/Driver/Marshal.chs" #-}


-- Lift an opaque handle to a typed DevicePtr representation. This occasions
-- arcane distinctions for the different driver versions and Tesla (compute 1.x)
-- and Fermi (compute 2.x) class architectures on 32- and 64-bit hosts.
--
{-# INLINE peekDeviceHandle #-}
peekDeviceHandle :: Ptr DeviceHandle -> IO (DevicePtr a)
peekDeviceHandle !p = DevicePtr . intPtrToPtr . fromIntegral <$> peek p

-- Use a device pointer as an opaque handle type
--
{-# INLINE useDeviceHandle #-}
useDeviceHandle :: DevicePtr a -> DeviceHandle
useDeviceHandle = fromIntegral . ptrToIntPtr . useDevicePtr


foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemHostAlloc"
  cuMemHostAlloc'_ :: ((Ptr (Ptr ())) -> (CULong -> (CUInt -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemFreeHost"
  cuMemFreeHost'_ :: ((Ptr ()) -> (IO CInt))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemHostRegister"
  cuMemHostRegister'_ :: ((Ptr ()) -> (CULong -> (CUInt -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemHostUnregister"
  cuMemHostUnregister'_ :: ((Ptr ()) -> (IO CInt))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemAlloc"
  cuMemAlloc'_ :: ((Ptr CULLong) -> (CULong -> (IO CInt)))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemFree"
  cuMemFree'_ :: (CULLong -> (IO CInt))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemAllocManaged"
  cuMemAllocManaged'_ :: ((Ptr CULLong) -> (CULong -> (CUInt -> (IO CInt))))

foreign import ccall safe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpyDtoH"
  cuMemcpyDtoH'_ :: ((Ptr ()) -> (CULLong -> (CULong -> (IO CInt))))

foreign import ccall safe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpyDtoHAsync"
  cuMemcpyDtoHAsync'_ :: ((Ptr ()) -> (CULLong -> (CULong -> ((Ptr ()) -> (IO CInt)))))

foreign import ccall safe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpy2DDtoH"
  cuMemcpy2DDtoH'_ :: ((Ptr ()) -> (CUInt -> (CUInt -> (CUInt -> (CULLong -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (IO CInt)))))))))))

foreign import ccall safe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpy2DDtoHAsync"
  cuMemcpy2DDtoHAsync'_ :: ((Ptr ()) -> (CUInt -> (CUInt -> (CUInt -> (CULLong -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> ((Ptr ()) -> (IO CInt))))))))))))

foreign import ccall safe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpyHtoD"
  cuMemcpyHtoD'_ :: (CULLong -> ((Ptr ()) -> (CULong -> (IO CInt))))

foreign import ccall safe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpyHtoDAsync"
  cuMemcpyHtoDAsync'_ :: (CULLong -> ((Ptr ()) -> (CULong -> ((Ptr ()) -> (IO CInt)))))

foreign import ccall safe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpy2DHtoD"
  cuMemcpy2DHtoD'_ :: (CULLong -> (CUInt -> (CUInt -> (CUInt -> ((Ptr ()) -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (IO CInt)))))))))))

foreign import ccall safe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpy2DHtoDAsync"
  cuMemcpy2DHtoDAsync'_ :: (CULLong -> (CUInt -> (CUInt -> (CUInt -> ((Ptr ()) -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> ((Ptr ()) -> (IO CInt))))))))))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpyDtoD"
  cuMemcpyDtoD'_ :: (CULLong -> (CULLong -> (CULong -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpyDtoDAsync"
  cuMemcpyDtoDAsync'_ :: (CULLong -> (CULLong -> (CULong -> ((Ptr ()) -> (IO CInt)))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpy2DDtoD"
  cuMemcpy2DDtoD'_ :: (CULLong -> (CUInt -> (CUInt -> (CUInt -> (CULLong -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (IO CInt)))))))))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpy2DDtoDAsync"
  cuMemcpy2DDtoDAsync'_ :: (CULLong -> (CUInt -> (CUInt -> (CUInt -> (CULLong -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> (CUInt -> ((Ptr ()) -> (IO CInt))))))))))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpyPeer"
  cuMemcpyPeer'_ :: (CULLong -> ((Ptr ()) -> (CULLong -> ((Ptr ()) -> (CULong -> (IO CInt))))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemcpyPeerAsync"
  cuMemcpyPeerAsync'_ :: (CULLong -> ((Ptr ()) -> (CULLong -> ((Ptr ()) -> (CULong -> ((Ptr ()) -> (IO CInt)))))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemsetD8"
  cuMemsetD8'_ :: (CULLong -> (CUChar -> (CULong -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemsetD16"
  cuMemsetD16'_ :: (CULLong -> (CUShort -> (CULong -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemsetD32"
  cuMemsetD32'_ :: (CULLong -> (CUInt -> (CULong -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemsetD8Async"
  cuMemsetD8Async'_ :: (CULLong -> (CUChar -> (CULong -> ((Ptr ()) -> (IO CInt)))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemsetD16Async"
  cuMemsetD16Async'_ :: (CULLong -> (CUShort -> (CULong -> ((Ptr ()) -> (IO CInt)))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemsetD32Async"
  cuMemsetD32Async'_ :: (CULLong -> (CUInt -> (CULong -> ((Ptr ()) -> (IO CInt)))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemHostGetDevicePointer"
  cuMemHostGetDevicePointer'_ :: ((Ptr CULLong) -> ((Ptr ()) -> (CUInt -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemGetAddressRange"
  cuMemGetAddressRange'_ :: ((Ptr CULLong) -> ((Ptr CULong) -> (CULLong -> (IO CInt))))

foreign import ccall unsafe "Foreign/CUDA/Driver/Marshal.chs.h cuMemGetInfo"
  cuMemGetInfo'_ :: ((Ptr CULong) -> ((Ptr CULong) -> (IO CInt)))