{-# LANGUAGE ScopedTypeVariables #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.PTX.Analysis.Device
-- Copyright   : [2008..2020] The Accelerate Team
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell <trevor.mcdonell@gmail.com>
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.PTX.Analysis.Device
  where

import Control.Exception
import Data.Function
import Data.List
import Data.Ord
import Foreign.CUDA.Analysis.Device
import Foreign.CUDA.Driver.Context                                  ( Context )
import Foreign.CUDA.Driver.Device
import Foreign.CUDA.Driver.Error
import qualified Foreign.CUDA.Driver                                as CUDA


-- Select the best of the available CUDA capable devices. This prefers devices
-- with higher compute capability, followed by maximum throughput.
--
-- For hosts with multiple devices in Exclusive Process mode, this will select
-- the first of the _available_ devices. If no devices are available, an
-- exception is thrown indicating that no devices are available.
--
selectBestDevice :: IO (Device, DeviceProperties, Context)
selectBestDevice :: IO (Device, DeviceProperties, Context)
selectBestDevice = [(Device, DeviceProperties)]
-> IO (Device, DeviceProperties, Context)
select ([(Device, DeviceProperties)]
 -> IO (Device, DeviceProperties, Context))
-> IO [(Device, DeviceProperties)]
-> IO (Device, DeviceProperties, Context)
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< IO [(Device, DeviceProperties)]
enumerateDevices
  where
    select :: [(Device, DeviceProperties)] -> IO (Device, DeviceProperties, Context)
    select :: [(Device, DeviceProperties)]
-> IO (Device, DeviceProperties, Context)
select []               = String -> IO (Device, DeviceProperties, Context)
forall a. String -> IO a
cudaErrorIO String
"No CUDA-capable devices are available"
    select ((Device
dev,DeviceProperties
prp):[(Device, DeviceProperties)]
rest) = do
      Either CUDAException Context
r <- IO Context -> IO (Either CUDAException Context)
forall e a. Exception e => IO a -> IO (Either e a)
try (IO Context -> IO (Either CUDAException Context))
-> IO Context -> IO (Either CUDAException Context)
forall a b. (a -> b) -> a -> b
$ Device -> [ContextFlag] -> IO Context
CUDA.create Device
dev [ContextFlag
CUDA.SchedAuto]
      case Either CUDAException Context
r of
        Right Context
ctx               -> (Device, DeviceProperties, Context)
-> IO (Device, DeviceProperties, Context)
forall (m :: * -> *) a. Monad m => a -> m a
return (Device
dev,DeviceProperties
prp,Context
ctx)
        Left (CUDAException
_::CUDAException) -> [(Device, DeviceProperties)]
-> IO (Device, DeviceProperties, Context)
select [(Device, DeviceProperties)]
rest


-- Return the list of all connected CUDA devices, sorted by compute
-- compatibility, followed by maximum throughput.
--
-- Strictly speaking this may not necessary, as the default device enumeration
-- appears to be sorted by some metric already.
--
-- Ignore the possibility of emulation-mode devices, as this has been deprecated
-- as of CUDA v3.0 (compute-capability == 9999.9999)
--
enumerateDevices :: IO [(Device, DeviceProperties)]
enumerateDevices :: IO [(Device, DeviceProperties)]
enumerateDevices = do
  [Device]
devs  <- (Int -> IO Device) -> [Int] -> IO [Device]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM Int -> IO Device
CUDA.device ([Int] -> IO [Device]) -> (Int -> [Int]) -> Int -> IO [Device]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Int -> [Int]
forall a. Enum a => a -> a -> [a]
enumFromTo Int
0 (Int -> [Int]) -> (Int -> Int) -> Int -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Int -> Int
forall a. Num a => a -> a -> a
subtract Int
1 (Int -> IO [Device]) -> IO Int -> IO [Device]
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< IO Int
CUDA.count
  [DeviceProperties]
prps  <- (Device -> IO DeviceProperties)
-> [Device] -> IO [DeviceProperties]
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM Device -> IO DeviceProperties
CUDA.props [Device]
devs
  [(Device, DeviceProperties)] -> IO [(Device, DeviceProperties)]
forall (m :: * -> *) a. Monad m => a -> m a
return ([(Device, DeviceProperties)] -> IO [(Device, DeviceProperties)])
-> [(Device, DeviceProperties)] -> IO [(Device, DeviceProperties)]
forall a b. (a -> b) -> a -> b
$ ((Device, DeviceProperties)
 -> (Device, DeviceProperties) -> Ordering)
-> [(Device, DeviceProperties)] -> [(Device, DeviceProperties)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy ((DeviceProperties -> DeviceProperties -> Ordering)
-> DeviceProperties -> DeviceProperties -> Ordering
forall a b c. (a -> b -> c) -> b -> a -> c
flip DeviceProperties -> DeviceProperties -> Ordering
compareDevices (DeviceProperties -> DeviceProperties -> Ordering)
-> ((Device, DeviceProperties) -> DeviceProperties)
-> (Device, DeviceProperties)
-> (Device, DeviceProperties)
-> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` (Device, DeviceProperties) -> DeviceProperties
forall a b. (a, b) -> b
snd) ([Device] -> [DeviceProperties] -> [(Device, DeviceProperties)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Device]
devs [DeviceProperties]
prps)


-- Return a ordering of two device with respect to (estimated) performance
--
compareDevices :: DeviceProperties -> DeviceProperties -> Ordering
compareDevices :: DeviceProperties -> DeviceProperties -> Ordering
compareDevices = DeviceProperties -> DeviceProperties -> Ordering
cmp
  where
    compute :: DeviceProperties -> Compute
compute     = DeviceProperties -> Compute
computeCapability
    flops :: DeviceProperties -> Int
flops DeviceProperties
d     = DeviceProperties -> Int
multiProcessorCount DeviceProperties
d Int -> Int -> Int
forall a. Num a => a -> a -> a
* DeviceProperties -> Int
coresPerMultiProcessor DeviceProperties
d Int -> Int -> Int
forall a. Num a => a -> a -> a
* DeviceProperties -> Int
clockRate DeviceProperties
d
    cmp :: DeviceProperties -> DeviceProperties -> Ordering
cmp DeviceProperties
x DeviceProperties
y
      | DeviceProperties -> Compute
compute DeviceProperties
x Compute -> Compute -> Bool
forall a. Eq a => a -> a -> Bool
== DeviceProperties -> Compute
compute DeviceProperties
y  = (DeviceProperties -> Int)
-> DeviceProperties -> DeviceProperties -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing DeviceProperties -> Int
flops   DeviceProperties
x DeviceProperties
y
      | Bool
otherwise               = (DeviceProperties -> Compute)
-> DeviceProperties -> DeviceProperties -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing DeviceProperties -> Compute
compute DeviceProperties
x DeviceProperties
y


-- Number of CUDA cores per streaming multiprocessor for a given architecture
-- revision. This is the number of SIMD arithmetic units per multiprocessor,
-- executing in lockstep in half-warp groupings (16 ALUs).
--
coresPerMultiProcessor :: DeviceProperties -> Int
coresPerMultiProcessor :: DeviceProperties -> Int
coresPerMultiProcessor = DeviceResources -> Int
coresPerMP (DeviceResources -> Int)
-> (DeviceProperties -> DeviceResources) -> DeviceProperties -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. DeviceProperties -> DeviceResources
deviceResources