{-# LANGUAGE MultiParamTypeClasses, FunctionalDependencies, TypeFamilies #-}
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -Wall #-}

module Control.Monad.Par.Accelerate
 (
    -- * The Class
    ParAccelerate(..),

#ifdef ACC_IO
    -- * Example applications of `unsafeHybrid`
    unsafeHybridVector,
    unsafeHybridIArray
#endif
 ) where

import Control.Monad.Par.Class
import Data.Array.IArray (IArray)
import Foreign (Ptr, Storable)
import qualified Data.Array.IArray as IArray
import qualified Data.Vector.Storable as Vector

-- From 'accelerate':
import Data.Array.Accelerate (Acc, Arrays, Shape)
import Data.Array.Accelerate.Array.Sugar (EltRepr,Elt,Array,DIM1,toIArray)

#ifdef ACC_IO
-- From 'accelerate-io', or 'accelerate' <= 0.10
import qualified Data.Array.Accelerate.IO as IO
#endif

--------------------------------------------------------------------------------

-- | A class containing Accelerate-specific `Par` operations.
--
-- A minimal complete instance defines:
--
--  * `getDefaultAccImpl`, and
--
--  * at least one of `runAccWith` or `spawnAccWith`.
--
-- The defaults of `runAccWith` and `spawnAccWith` are written in terms of
-- each other, so an instance that overrides neither of them will loop
-- forever as soon as either one is called.  The @MINIMAL@ pragma at the
-- end of the class makes GHC warn about such instances.
--
-- (@compileAcc@ is currently compiled out of the class, see the TODO near
-- the end of the class body, and therefore is not part of any instance.)
class ParFuture iv p => ParAccelerate iv p where

  -- | Run an Accelerate computation and wait for its result.  In the
  --   context of a `Par` computation this can result in better
  --   performance than using an Accelerate-provided `run` function
  --   directly, because this version enables the CPU work scheduler to do
  --   other work while waiting for the GPU computation to complete.
  --
  --   Moreover, when configured with a high-performance /CPU/ Accelerate
  --   backend in the future this routine can enable automatic CPU/GPU
  --   work partitioning.
  --
  --   The specific Accelerate implementation is NOT specified when
  --   calling `runAcc`.  That choice is deferred to the point where
  --   `runPar` is invoked for the scheduler in question.
  --
  --   The default implementation fetches the runner with
  --   `getDefaultAccImpl` and passes it to `runAccWith`.
  runAcc :: (Arrays a) => Acc a -> p a
  runAcc comp = do
    runner <- getDefaultAccImpl
    runAccWith runner comp

  -- | Like `runAcc` but runs the Accelerate computation asynchronously,
  --   returning a future for the result.
  --
  --   The default implementation fetches the runner with
  --   `getDefaultAccImpl` and passes it to `spawnAccWith`.
  spawnAcc :: (Arrays a) => Acc a -> p (iv a)
  spawnAcc comp = do
    runner <- getDefaultAccImpl
    spawnAccWith runner comp

  -- | Spawn a computation which may execute /either/ on the CPU or GPU
  --   based on runtime load.  The CPU and GPU implementations may employ
  --   completely different algorithms; this is an UNSAFE operation which
  --   will not guarantee determinism unless the user ensures that the
  --   result of both computations is always equivalent.
  --
  --   The first argument converts the result of the Accelerate version
  --   into the result type of the CPU version.  The second argument pairs
  --   the CPU version with the Accelerate version.
  --
  --   A common application of `unsafeHybrid` is the following:
  --
  -- > unsafeHybrid Data.Array.Accelerate.IO.toVector
  --
  --   The default implementation fetches the runner with
  --   `getDefaultAccImpl` and passes it to `unsafeHybridWith`.
  unsafeHybrid :: Arrays b => (b -> a) -> (p a, Acc b) -> p (iv a)
  unsafeHybrid cvrt pr = do
    runner <- getDefaultAccImpl
    unsafeHybridWith runner cvrt pr

  ------------------------------------------------------------
  -- * Control over selecting the Accelerate implementation.

  -- | Retrieve the Accelerate @run@ function that is the default for
  --   this execution, i.e. the one used for `runAcc` or `spawnAcc`.
  --
  --   This method has no default and must be defined by every instance.
  getDefaultAccImpl :: Arrays a => p (Acc a -> a)

  -- | Like `runAcc` but specify a specific Accelerate implementation,
  --   e.g. @CUDA.run@.
  --
  --   The default implementation calls `spawnAccWith` and then blocks on
  --   the resulting future with `get`.
  runAccWith :: (Arrays a) => (Acc a -> a) -> Acc a -> p a
  runAccWith runner comp = spawnAccWith runner comp >>= get

  -- | Analogous to `runAccWith`, but returns a future instead of
  --   waiting for the result.
  spawnAccWith :: (Arrays a) => (Acc a -> a) -> Acc a -> p (iv a)
  -- This default implementation is actually QUITE BAD.  It is an
  -- anti-pattern.  We do not want to wait until the spawned
  -- computation is executed to enqueue the GPU computation.  This is
  -- a problem with child-stealing Par implementations, but not so much
  -- with parent-stealing ones.
  spawnAccWith runner acc = spawn_ $ runAccWith runner acc

  -- | Analogous to other @*With@ functions: like `unsafeHybrid`, but
  --   with the Accelerate @run@ function supplied explicitly.
  unsafeHybridWith :: Arrays b => (Acc b -> b) -> (b -> a) -> (p a, Acc b) -> p (iv a)
  -- This default implementation simply /always/ runs the GPU version;
  -- the CPU version in the first component of the pair is ignored.
  unsafeHybridWith runner cvrt (_, acc) =
    spawn_ $ do
      x <- runAccWith runner acc
      return (cvrt x)

  -- TODO: to be fully consistent we should perhaps have
  -- compileAccWith, but that gets complicated.

  ------------------------------------------------------------
  -- TODO: We would really like to add this, but it requires more than
  -- getDefaultAccImpl can provide right now.
#if 0
  -- | Prepare a GPU computation for repeated execution.
  --
  --   Typically, this is applied to its first argument once in an outer
  --   scope then applied to its second argument repeatedly inside a loop.
  --
  --   Whereas the normal `runAcc` will /attempt/ to cache compiled
  --   programs and avoid recompilation, this function guarantees no
  --   recompilation and further avoids some overhead from re-executing
  --   the Accelerate front-end.
  --
  --   See "Data.Array.Accelerate.CUDA.run1" for more explanation.
  compileAcc :: (Arrays a, Arrays b) => (Acc a -> Acc b) -> a -> p b
#endif

  -- `runAccWith` and `spawnAccWith` default to one another, so at least
  -- one of them has to be overridden for either to terminate.
  {-# MINIMAL getDefaultAccImpl, (runAccWith | spawnAccWith) #-}

--------------------------------------------------------------------------------

#ifdef ACC_IO
-- | An example application of `unsafeHybrid` for vectors: the result of
--   the Accelerate version is converted with @IO.toVector@.
unsafeHybridVector :: (Vector.Storable a, Elt a,
                       IO.BlockPtrs (EltRepr a) ~ ((), Ptr a),
                       ParAccelerate iv p)
                  => (p (Vector.Vector a), Acc (Array DIM1 a))
                  -> p (iv (Vector.Vector a))
-- /TODO/: make a variant with unrestricted 'Shape' that, e.g., yields
-- a vector in row-major order.
unsafeHybridVector = unsafeHybrid IO.toVector

-- | An example application of `unsafeHybrid` for any IArray type: the
--   result of the Accelerate version is converted with `toIArray`.
unsafeHybridIArray :: ( EltRepr ix ~ EltRepr sh
                      , IArray a e, IArray.Ix ix
                      , Shape sh, Elt ix, Elt e
                      , ParAccelerate iv p)
                  => (p (a ix e), Acc (Array sh e))
                  -> p (iv (a ix e))
unsafeHybridIArray = unsafeHybrid toIArray
                     --IO.toArray
#endif