module Foreign.CUDA.Runtime.Device (
  
  Device, DeviceFlag(..), DeviceProperties(..), Compute(..), ComputeMode(..),
  choose, get, count, props, set, setFlags, setOrder, reset, sync,
  
  PeerFlag,
  accessible, add, remove,
  
  Limit(..),
  getLimit, setLimit
) where
import Foreign.CUDA.Analysis.Device
import Foreign.CUDA.Runtime.Error
import Foreign.CUDA.Internal.C2HS
import Foreign
import Foreign.C
type Device = Int
data DeviceFlag = ScheduleAuto
                | ScheduleSpin
                | ScheduleYield
                | BlockingSync
                | MapHost
                | LMemResizeToMax
  deriving (Eq,Show,Bounded)
instance Enum DeviceFlag where
  succ ScheduleAuto = ScheduleSpin
  succ ScheduleSpin = ScheduleYield
  succ ScheduleYield = BlockingSync
  succ BlockingSync = MapHost
  succ MapHost = LMemResizeToMax
  succ LMemResizeToMax = error "DeviceFlag.succ: LMemResizeToMax has no successor"
  pred ScheduleSpin = ScheduleAuto
  pred ScheduleYield = ScheduleSpin
  pred BlockingSync = ScheduleYield
  pred MapHost = BlockingSync
  pred LMemResizeToMax = MapHost
  pred ScheduleAuto = error "DeviceFlag.pred: ScheduleAuto has no predecessor"
  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []
  enumFrom from = enumFromTo from LMemResizeToMax
  fromEnum ScheduleAuto = 0
  fromEnum ScheduleSpin = 1
  fromEnum ScheduleYield = 2
  fromEnum BlockingSync = 4
  fromEnum MapHost = 8
  fromEnum LMemResizeToMax = 16
  toEnum 0 = ScheduleAuto
  toEnum 1 = ScheduleSpin
  toEnum 2 = ScheduleYield
  toEnum 4 = BlockingSync
  toEnum 8 = MapHost
  toEnum 16 = LMemResizeToMax
  toEnum unmatched = error ("DeviceFlag.toEnum: Cannot match " ++ show unmatched)
instance Storable DeviceProperties where
  sizeOf _    = 632
  alignment _ = alignment (undefined :: Ptr ())
  poke _ _    = error "no instance for Foreign.Storable.poke DeviceProperties"
  peek p      = do
    n  <- peekCString   =<<   (\ptr -> do {return $ ptr `plusPtr` 0 :: IO (Ptr CChar)}) p
    gm <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 256 :: IO CULong}) p
    sm <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 264 :: IO CULong}) p
    rb <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 272 :: IO CInt}) p
    ws <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 276 :: IO CInt}) p
    mp <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 280 :: IO CULong}) p
    tb <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 288 :: IO CInt}) p
    cl <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 316 :: IO CInt}) p
    cm <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 320 :: IO CULong}) p
    v1 <- fromIntegral `fmap` (\ptr -> do {peekByteOff ptr 328 :: IO CInt}) p
    v2 <- fromIntegral `fmap` (\ptr -> do {peekByteOff ptr 332 :: IO CInt}) p
    ta <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 336 :: IO CULong}) p
    ov <- cToBool      `fmap` (\ptr -> do {peekByteOff ptr 352 :: IO CInt}) p
    pc <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 356 :: IO CInt}) p
    ke <- cToBool      `fmap` (\ptr -> do {peekByteOff ptr 360 :: IO CInt}) p
    tg <- cToBool      `fmap` (\ptr -> do {peekByteOff ptr 364 :: IO CInt}) p
    hm <- cToBool      `fmap` (\ptr -> do {peekByteOff ptr 368 :: IO CInt}) p
    md <- cToEnum      `fmap` (\ptr -> do {peekByteOff ptr 372 :: IO CInt}) p
    ck <- cToBool      `fmap` (\ptr -> do {peekByteOff ptr 544 :: IO CInt}) p
    u1 <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 376 :: IO CInt}) p
    ee <- cToBool      `fmap` (\ptr -> do {peekByteOff ptr 548 :: IO CInt}) p
    ae <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 568 :: IO CInt}) p
    l2 <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 584 :: IO CInt}) p
    tm <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 588 :: IO CInt}) p
    mw <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 580 :: IO CInt}) p
    mc <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 576 :: IO CInt}) p
    pb <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 552 :: IO CInt}) p
    pd <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 556 :: IO CInt}) p
    pm <- cIntConv     `fmap` (\ptr -> do {peekByteOff ptr 560 :: IO CInt}) p
    tc <- cToBool      `fmap` (\ptr -> do {peekByteOff ptr 564 :: IO CInt}) p
    ua <- cToBool      `fmap` (\ptr -> do {peekByteOff ptr 572 :: IO CInt}) p
    [t1,t2,t3]    <- peekArrayWith cIntConv 3 =<< (\ptr -> do {return $ ptr `plusPtr` 292 :: IO (Ptr CInt)}) p
    [g1,g2,g3]    <- peekArrayWith cIntConv 3 =<< (\ptr -> do {return $ ptr `plusPtr` 304 :: IO (Ptr CInt)}) p
    [u21,u22]     <- peekArrayWith cIntConv 2 =<< (\ptr -> do {return $ ptr `plusPtr` 388 :: IO (Ptr CInt)}) p
    [u31,u32,u33] <- peekArrayWith cIntConv 3 =<< (\ptr -> do {return $ ptr `plusPtr` 424 :: IO (Ptr CInt)}) p
    sp  <- cToBool     `fmap` (\ptr -> do {peekByteOff ptr 592 :: IO CInt}) p
    gl1 <- cToBool     `fmap` (\ptr -> do {peekByteOff ptr 596 :: IO CInt}) p
    ll1 <- cToBool     `fmap` (\ptr -> do {peekByteOff ptr 600 :: IO CInt}) p
    mm  <- cToBool     `fmap` (\ptr -> do {peekByteOff ptr 620 :: IO CInt}) p
    mg  <- cToBool     `fmap` (\ptr -> do {peekByteOff ptr 624 :: IO CInt}) p
    mid <- cIntConv    `fmap` (\ptr -> do {peekByteOff ptr 628 :: IO CInt}) p
    return DeviceProperties
      {
        deviceName                      = n
      , computeCapability               = Compute v1 v2
      , totalGlobalMem                  = gm
      , totalConstMem                   = cm
      , sharedMemPerBlock               = sm
      , regsPerBlock                    = rb
      , warpSize                        = ws
      , maxThreadsPerBlock              = tb
      , maxBlockSize                    = (t1,t2,t3)
      , maxGridSize                     = (g1,g2,g3)
      , clockRate                       = cl
      , multiProcessorCount             = pc
      , memPitch                        = mp
      , textureAlignment                = ta
      , computeMode                     = md
      , deviceOverlap                   = ov
      , kernelExecTimeoutEnabled        = ke
      , integrated                      = tg
      , canMapHostMemory                = hm
      , concurrentKernels               = ck
      , maxTextureDim1D                 = u1
      , maxTextureDim2D                 = (u21,u22)
      , maxTextureDim3D                 = (u31,u32,u33)
      , eccEnabled                      = ee
      , asyncEngineCount                = ae
      , cacheMemL2                      = l2
      , maxThreadsPerMultiProcessor     = tm
      , memBusWidth                     = mw
      , memClockRate                    = mc
      , tccDriverEnabled                = tc
      , unifiedAddressing               = ua
      , pciInfo                         = PCI pb pd pm
      , streamPriorities                = sp
      , globalL1Cache                   = gl1
      , localL1Cache                    = ll1
      , managedMemory                   = mm
      , multiGPUBoard                   = mg
      , multiGPUBoardGroupID            = mid
      }
choose :: DeviceProperties -> IO Device
choose !dev = resultIfOk =<< cudaChooseDevice dev
cudaChooseDevice :: (DeviceProperties) -> IO ((Status), (Int))
cudaChooseDevice a2 =
  alloca $ \a1' -> 
  withDevProp a2 $ \a2' -> 
  cudaChooseDevice'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  peekIntConv  a1'>>= \a1'' -> 
  return (res', a1'')
  where
      withDevProp = with
get :: IO Device
get = resultIfOk =<< cudaGetDevice
cudaGetDevice :: IO ((Status), (Int))
cudaGetDevice =
  alloca $ \a1' -> 
  cudaGetDevice'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  peekIntConv  a1'>>= \a1'' -> 
  return (res', a1'')
count :: IO Int
count = resultIfOk =<< cudaGetDeviceCount
cudaGetDeviceCount :: IO ((Status), (Int))
cudaGetDeviceCount =
  alloca $ \a1' -> 
  cudaGetDeviceCount'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  peekIntConv  a1'>>= \a1'' -> 
  return (res', a1'')
props :: Device -> IO DeviceProperties
props !n = resultIfOk =<< cudaGetDeviceProperties n
cudaGetDeviceProperties :: (Int) -> IO ((Status), (DeviceProperties))
cudaGetDeviceProperties a2 =
  alloca $ \a1' -> 
  let {a2' = fromIntegral a2} in 
  cudaGetDeviceProperties'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  peek  a1'>>= \a1'' -> 
  return (res', a1'')
set :: Device -> IO ()
set !n = nothingIfOk =<< cudaSetDevice n
cudaSetDevice :: (Int) -> IO ((Status))
cudaSetDevice a1 =
  let {a1' = fromIntegral a1} in 
  cudaSetDevice'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')
setFlags :: [DeviceFlag] -> IO ()
setFlags !f = nothingIfOk =<< cudaSetDeviceFlags (combineBitMasks f)
cudaSetDeviceFlags :: (Int) -> IO ((Status))
cudaSetDeviceFlags a1 =
  let {a1' = fromIntegral a1} in 
  cudaSetDeviceFlags'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')
setOrder :: [Device] -> IO ()
setOrder !l = nothingIfOk =<< cudaSetValidDevices l (length l)
cudaSetValidDevices :: ([Int]) -> (Int) -> IO ((Status))
cudaSetValidDevices a1 a2 =
  withArrayIntConv a1 $ \a1' -> 
  let {a2' = fromIntegral a2} in 
  cudaSetValidDevices'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')
  where
      withArrayIntConv = withArray . map cIntConv
sync :: IO ()
sync = nothingIfOk =<< cudaDeviceSynchronize
cudaDeviceSynchronize :: IO ((Status))
cudaDeviceSynchronize =
  cudaDeviceSynchronize'_ >>= \res ->
  let {res' = cToEnum res} in
  return (res')
reset :: IO ()
reset = nothingIfOk =<< cudaDeviceReset
cudaDeviceReset :: IO ((Status))
cudaDeviceReset =
  cudaDeviceReset'_ >>= \res ->
  let {res' = cToEnum res} in
  return (res')
data PeerFlag
instance Enum PeerFlag where
  toEnum   x = case x of {}
  fromEnum x = case x of {}
accessible :: Device -> Device -> IO Bool
accessible !dev !peer = resultIfOk =<< cudaDeviceCanAccessPeer dev peer
cudaDeviceCanAccessPeer :: (Device) -> (Device) -> IO ((Status), (Bool))
cudaDeviceCanAccessPeer a2 a3 =
  alloca $ \a1' -> 
  let {a2' = cIntConv a2} in 
  let {a3' = cIntConv a3} in 
  cudaDeviceCanAccessPeer'_ a1' a2' a3' >>= \res ->
  let {res' = cToEnum res} in
  peekBool  a1'>>= \a1'' -> 
  return (res', a1'')
add :: Device -> [PeerFlag] -> IO ()
add !dev !flags = nothingIfOk =<< cudaDeviceEnablePeerAccess dev flags
cudaDeviceEnablePeerAccess :: (Device) -> ([PeerFlag]) -> IO ((Status))
cudaDeviceEnablePeerAccess a1 a2 =
  let {a1' = cIntConv a1} in 
  let {a2' = combineBitMasks a2} in 
  cudaDeviceEnablePeerAccess'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')
remove :: Device -> IO ()
remove !dev = nothingIfOk =<< cudaDeviceDisablePeerAccess dev
cudaDeviceDisablePeerAccess :: (Device) -> IO ((Status))
cudaDeviceDisablePeerAccess a1 =
  let {a1' = cIntConv a1} in 
  cudaDeviceDisablePeerAccess'_ a1' >>= \res ->
  let {res' = cToEnum res} in
  return (res')
data Limit = Stacksize
           | Printffifosize
           | Mallocheapsize
           | Devruntimesyncdepth
           | Devruntimependinglaunchcount
  deriving (Eq,Show)
instance Enum Limit where
  succ Stacksize = Printffifosize
  succ Printffifosize = Mallocheapsize
  succ Mallocheapsize = Devruntimesyncdepth
  succ Devruntimesyncdepth = Devruntimependinglaunchcount
  succ Devruntimependinglaunchcount = error "Limit.succ: Devruntimependinglaunchcount has no successor"
  pred Printffifosize = Stacksize
  pred Mallocheapsize = Printffifosize
  pred Devruntimesyncdepth = Mallocheapsize
  pred Devruntimependinglaunchcount = Devruntimesyncdepth
  pred Stacksize = error "Limit.pred: Stacksize has no predecessor"
  enumFromTo from to = go from
    where
      end = fromEnum to
      go v = case compare (fromEnum v) end of
                 LT -> v : go (succ v)
                 EQ -> [v]
                 GT -> []
  enumFrom from = enumFromTo from Devruntimependinglaunchcount
  fromEnum Stacksize = 0
  fromEnum Printffifosize = 1
  fromEnum Mallocheapsize = 2
  fromEnum Devruntimesyncdepth = 3
  fromEnum Devruntimependinglaunchcount = 4
  toEnum 0 = Stacksize
  toEnum 1 = Printffifosize
  toEnum 2 = Mallocheapsize
  toEnum 3 = Devruntimesyncdepth
  toEnum 4 = Devruntimependinglaunchcount
  toEnum unmatched = error ("Limit.toEnum: Cannot match " ++ show unmatched)
getLimit :: Limit -> IO Int
getLimit !l = resultIfOk =<< cudaDeviceGetLimit l
cudaDeviceGetLimit :: (Limit) -> IO ((Status), (Int))
cudaDeviceGetLimit a2 =
  alloca $ \a1' -> 
  let {a2' = cFromEnum a2} in 
  cudaDeviceGetLimit'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  peekIntConv  a1'>>= \a1'' -> 
  return (res', a1'')
setLimit :: Limit -> Int -> IO ()
setLimit !l !n = nothingIfOk =<< cudaDeviceSetLimit l n
cudaDeviceSetLimit :: (Limit) -> (Int) -> IO ((Status))
cudaDeviceSetLimit a1 a2 =
  let {a1' = cFromEnum a1} in 
  let {a2' = cIntConv a2} in 
  cudaDeviceSetLimit'_ a1' a2' >>= \res ->
  let {res' = cToEnum res} in
  return (res')
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaChooseDevice"
  cudaChooseDevice'_ :: ((Ptr CInt) -> ((Ptr (DeviceProperties)) -> (IO CInt)))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaGetDevice"
  cudaGetDevice'_ :: ((Ptr CInt) -> (IO CInt))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaGetDeviceCount"
  cudaGetDeviceCount'_ :: ((Ptr CInt) -> (IO CInt))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaGetDeviceProperties"
  cudaGetDeviceProperties'_ :: ((Ptr (DeviceProperties)) -> (CInt -> (IO CInt)))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaSetDevice"
  cudaSetDevice'_ :: (CInt -> (IO CInt))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaSetDeviceFlags"
  cudaSetDeviceFlags'_ :: (CUInt -> (IO CInt))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaSetValidDevices"
  cudaSetValidDevices'_ :: ((Ptr CInt) -> (CInt -> (IO CInt)))
foreign import ccall safe "Foreign/CUDA/Runtime/Device.chs.h cudaDeviceSynchronize"
  cudaDeviceSynchronize'_ :: (IO CInt)
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaDeviceReset"
  cudaDeviceReset'_ :: (IO CInt)
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaDeviceCanAccessPeer"
  cudaDeviceCanAccessPeer'_ :: ((Ptr CInt) -> (CInt -> (CInt -> (IO CInt))))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaDeviceEnablePeerAccess"
  cudaDeviceEnablePeerAccess'_ :: (CInt -> (CUInt -> (IO CInt)))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaDeviceDisablePeerAccess"
  cudaDeviceDisablePeerAccess'_ :: (CInt -> (IO CInt))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaDeviceGetLimit"
  cudaDeviceGetLimit'_ :: ((Ptr CULong) -> (CInt -> (IO CInt)))
foreign import ccall unsafe "Foreign/CUDA/Runtime/Device.chs.h cudaDeviceSetLimit"
  cudaDeviceSetLimit'_ :: (CInt -> (CULong -> (IO CInt)))