{-# LANGUAGE CPP, GADTs #-} -- | -- Module : Data.Array.Accelerate.CUDA -- Copyright : [2008..2011] Manuel M T Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell -- License : BSD3 -- -- Maintainer : Manuel M T Chakravarty -- Stability : experimental -- Portability : non-portable (GHC extensions) -- -- This module implements the CUDA backend for the embedded array language. -- module Data.Array.Accelerate.CUDA ( -- * Generate and execute CUDA code for an array expression Arrays, run, stream ) where -- standard library import Prelude hiding (catch) import Data.Label import Control.Exception import Control.Applicative import System.IO.Unsafe import qualified Data.HashTable as Hash -- CUDA binding import Foreign.CUDA.Driver.Error import qualified Foreign.CUDA.Driver as CUDA -- friends import Data.Array.Accelerate.AST (Arrays(..), ArraysR(..)) import Data.Array.Accelerate.Smart (Acc, convertAcc, convertAccFun1) import Data.Array.Accelerate.Array.Representation (size) import Data.Array.Accelerate.Array.Sugar (Array(..)) import Data.Array.Accelerate.CUDA.Array.Data import Data.Array.Accelerate.CUDA.State import Data.Array.Accelerate.CUDA.Compile import Data.Array.Accelerate.CUDA.Execute #include "accelerate.h" -- Accelerate: CUDA -- ---------------- -- | Compile and run a complete embedded array program using the CUDA backend -- run :: Arrays a => Acc a -> a {-# NOINLINE run #-} run a = unsafePerformIO execute where acc = convertAcc a execute = evalCUDA (compileAcc acc >>= executeAcc >>= collect) `catch` \e -> INTERNAL_ERROR(error) "unhandled" (show (e :: CUDAException)) -- | Stream a lazily read list of input arrays through the given program, -- collecting results as we go -- stream :: (Arrays a, Arrays b) => (Acc a -> Acc b) -> [a] -> [b] {-# NOINLINE stream #-} stream f arrs = unsafePerformIO $ uncurry (execute arrs) =<< runCUDA (compileAfun1 acc) where acc = convertAccFun1 f execute [] _ state = finalise state >> return [] -- release all constant arrays execute (a:as) afun state = do (b,s) <- runCUDAWith state (executeAfun1 afun a >>= collect) `catch` \e -> INTERNAL_ERROR(error) "unhandled" (show (e :: CUDAException)) bs <- unsafeInterleaveIO (execute as afun s) return (b:bs) -- finalise state = do mem <- Hash.toList (get memoryTable state) mapM_ (\(_,MemoryEntry _ p) -> CUDA.free (CUDA.castDevPtr p)) mem -- Copy from device to host, and decrement the usage counter. This last step -- should result in all transient arrays having been removed from the device. -- collect :: Arrays arrs => arrs -> CIO arrs collect arrs = collectR arrays arrs where collectR :: ArraysR arrs -> arrs -> CIO arrs collectR ArraysRunit () = return () collectR ArraysRarray arr@(Array sh ad) = peekArray ad (size sh) >> freeArray ad >> return arr collectR (ArraysRpair r1 r2) (arrs1, arrs2) = (,) <$> collectR r1 arrs1 <*> collectR r2 arrs2