{-# LANGUAGE CPP #-} {-# LANGUAGE EmptyDataDecls #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeFamilies #-} -- | -- Module : Data.Array.Accelerate.LLVM.PTX.Target -- Copyright : [2014..2017] Trevor L. McDonell -- [2014..2014] Vinod Grover (NVIDIA Corporation) -- License : BSD3 -- -- Maintainer : Trevor L. McDonell -- Stability : experimental -- Portability : non-portable (GHC extensions) -- module Data.Array.Accelerate.LLVM.PTX.Target ( module Data.Array.Accelerate.LLVM.Target, module Data.Array.Accelerate.LLVM.PTX.Target, ) where -- llvm-general import LLVM.AST.AddrSpace import LLVM.AST.DataLayout import LLVM.Target hiding ( Target ) import qualified LLVM.Target as LLVM import qualified LLVM.Relocation as R import qualified LLVM.CodeModel as CM import qualified LLVM.CodeGenOpt as CGO -- accelerate import Data.Array.Accelerate.Error import Data.Array.Accelerate.LLVM.Target import Data.Array.Accelerate.LLVM.Util import Control.Parallel.Meta ( Executable ) import Data.Array.Accelerate.LLVM.PTX.Array.Table ( MemoryTable ) import Data.Array.Accelerate.LLVM.PTX.Context ( Context, deviceProperties ) import Data.Array.Accelerate.LLVM.PTX.Execute.Stream.Reservoir ( Reservoir ) import Data.Array.Accelerate.LLVM.PTX.Link.Cache ( KernelTable ) -- CUDA import qualified Foreign.CUDA.Driver as CUDA -- standard library import Data.ByteString ( ByteString ) import Data.ByteString.Short ( ShortByteString ) import Data.String import System.IO.Unsafe import Text.Printf import qualified Data.Map as Map import qualified Data.Set as Set -- | The PTX execution target for NVIDIA GPUs. -- -- The execution target carries state specific for the current execution -- context. The data here --- device memory and execution streams --- are -- implicitly tied to this CUDA execution context. -- -- Don't store anything here that is independent of the context, for example -- state related to [persistent] kernel caching should _not_ go here. -- data PTX = PTX { ptxContext :: {-# UNPACK #-} !Context , ptxMemoryTable :: {-# UNPACK #-} !MemoryTable , ptxKernelTable :: {-# UNPACK #-} !KernelTable , ptxStreamReservoir :: {-# UNPACK #-} !Reservoir , fillP :: {-# UNPACK #-} !Executable } instance Target PTX where targetTriple _ = Just ptxTargetTriple #if ACCELERATE_USE_NVVM targetDataLayout _ = Nothing -- see note: [NVVM and target data layout] #else targetDataLayout _ = Just ptxDataLayout #endif -- | Extract the properties of the device the current PTX execution state is -- executing on. -- ptxDeviceProperties :: PTX -> CUDA.DeviceProperties ptxDeviceProperties = deviceProperties . ptxContext -- | A description of the various data layout properties that may be used during -- optimisation. For CUDA the following data layouts are supported: -- -- 32-bit: -- e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64 -- -- 64-bit: -- e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64 -- -- Thus, only the size of the pointer layout changes depending on the host -- architecture. -- ptxDataLayout :: DataLayout ptxDataLayout = DataLayout { endianness = LittleEndian , mangling = Nothing , aggregateLayout = AlignmentInfo 0 64 , stackAlignment = Nothing , pointerLayouts = Map.fromList [ (AddrSpace 0, (wordSize, AlignmentInfo wordSize wordSize)) ] , typeLayouts = Map.fromList $ [ ((IntegerAlign, 1), AlignmentInfo 8 8) ] ++ [ ((IntegerAlign, i), AlignmentInfo i i) | i <- [8,16,32,64]] ++ [ ((VectorAlign, v), AlignmentInfo v v) | v <- [16,32,64,128]] ++ [ ((FloatAlign, f), AlignmentInfo f f) | f <- [32,64] ] , nativeSizes = Just $ Set.fromList [ 16,32,64 ] } where wordSize = bitSize (undefined :: Int) -- | String that describes the target host. -- ptxTargetTriple :: ShortByteString ptxTargetTriple = case bitSize (undefined::Int) of 32 -> "nvptx-nvidia-cuda" 64 -> "nvptx64-nvidia-cuda" _ -> $internalError "ptxTargetTriple" "I don't know what architecture I am" -- | Bracket creation and destruction of the NVVM TargetMachine. -- withPTXTargetMachine :: CUDA.DeviceProperties -> (TargetMachine -> IO a) -> IO a withPTXTargetMachine dev go = let CUDA.Compute m n = CUDA.computeCapability dev isa = CPUFeature (ptxISAVersion m n) sm = fromString (printf "sm_%d%d" m n) in withTargetOptions $ \options -> do withTargetMachine ptxTarget ptxTargetTriple sm (Map.singleton isa True) -- CPU features options -- target options R.Default -- relocation model CM.Default -- code model CGO.Default -- optimisation level go -- Some libdevice functions require at least ptx40, even though devices at -- that compute capability also accept older ISA versions. -- -- https://github.com/llvm-mirror/llvm/blob/master/lib/Target/NVPTX/NVPTX.td#L72 -- ptxISAVersion :: Int -> Int -> ByteString ptxISAVersion 2 _ = "ptx40" ptxISAVersion 3 7 = "ptx41" ptxISAVersion 3 _ = "ptx40" ptxISAVersion 5 0 = "ptx40" ptxISAVersion 5 2 = "ptx41" ptxISAVersion 5 3 = "ptx42" ptxISAVersion 6 _ = "ptx50" ptxISAVersion 7 _ = "ptx60" ptxISAVersion _ _ = "ptx40" -- | The NVPTX target for this host. -- -- The top-level 'unsafePerformIO' is so that 'initializeAllTargets' is run once -- per program execution (although that might not be necessary?) -- {-# NOINLINE ptxTarget #-} ptxTarget :: LLVM.Target ptxTarget = unsafePerformIO $ do initializeAllTargets fst `fmap` lookupTarget Nothing ptxTargetTriple