{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Data.Array.Accelerate.LLVM.PTX.Compile (
module Data.Array.Accelerate.LLVM.Compile,
ExecutableR(..), Kernel(..), ObjectCode,
) where
import LLVM.AST hiding ( Module )
import qualified LLVM.AST as AST
import qualified LLVM.AST.Name as LLVM
import qualified LLVM.Analysis as LLVM
import qualified LLVM.Context as LLVM
import qualified LLVM.Module as LLVM
import qualified LLVM.PassManager as LLVM
import Data.Array.Accelerate.Error ( internalError )
import Data.Array.Accelerate.Lifetime
import Data.Array.Accelerate.Trafo ( DelayedOpenAcc )
import Data.Array.Accelerate.LLVM.CodeGen
import Data.Array.Accelerate.LLVM.CodeGen.Environment ( Gamma )
import Data.Array.Accelerate.LLVM.CodeGen.Module ( Module(..) )
import Data.Array.Accelerate.LLVM.Compile
import Data.Array.Accelerate.LLVM.State
#ifdef ACCELERATE_USE_NVVM
import Data.Array.Accelerate.LLVM.Util
#endif
import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch
import Data.Array.Accelerate.LLVM.PTX.CodeGen
import Data.Array.Accelerate.LLVM.PTX.Compile.Link
import Data.Array.Accelerate.LLVM.PTX.Context
import Data.Array.Accelerate.LLVM.PTX.Foreign ( )
import Data.Array.Accelerate.LLVM.PTX.Target
import qualified Data.Array.Accelerate.LLVM.PTX.Debug as Debug
import qualified Foreign.CUDA.Analysis as CUDA
import qualified Foreign.CUDA.Driver as CUDA
#ifdef ACCELERATE_USE_NVVM
import qualified Foreign.NVVM as NVVM
#endif
import Control.Monad.Except
import Control.Monad.State
import Data.ByteString ( ByteString )
import Data.List ( intercalate )
import Text.Printf ( printf )
import qualified Data.ByteString.Char8 as B
import qualified Data.Map as Map
import Prelude as P
instance Compile PTX where
data ExecutableR PTX = PTXR { ptxKernel :: ![Kernel]
, ptxModule :: {-# UNPACK #-} !ObjectCode
}
compileForTarget = compileForPTX
data Kernel = Kernel {
kernelFun :: {-# UNPACK #-} !CUDA.Fun
, kernelOccupancy :: {-# UNPACK #-} !CUDA.Occupancy
, kernelSharedMemBytes :: {-# UNPACK #-} !Int
, kernelThreadBlockSize :: {-# UNPACK #-} !Int
, kernelThreadBlocks :: (Int -> Int)
, kernelName :: String
}
type ObjectCode = Lifetime CUDA.Module
compileForPTX
:: DelayedOpenAcc aenv a
-> Gamma aenv
-> LLVM PTX (ExecutableR PTX)
compileForPTX acc aenv = do
target <- gets llvmTarget
let
Module ast md = llvmOfOpenAcc target acc aenv
dev = ptxDeviceProperties target
liftIO . LLVM.withContext $ \ctx -> do
ptx <- compileModule dev ctx ast
funs <- sequence [ linkFunction ptx f x | (LLVM.Name f, KM_PTX x) <- Map.toList md ]
ptx' <- newLifetime ptx
addFinalizer ptx' $ do
Debug.traceIO Debug.dump_gc
$ printf "gc: unload module: %s"
$ intercalate "," (P.map kernelName funs)
withContext (ptxContext target) (CUDA.unload ptx)
return $! PTXR funs ptx'
compileModule :: CUDA.DeviceProperties -> LLVM.Context -> AST.Module -> IO CUDA.Module
compileModule dev ctx ast =
let name = moduleName ast in
#ifdef ACCELERATE_USE_NVVM
withLibdeviceNVVM dev ctx ast (compileModuleNVVM dev name)
#else
withLibdeviceNVPTX dev ctx ast (compileModuleNVPTX dev name)
#endif
#ifdef ACCELERATE_USE_NVVM
compileModuleNVVM :: CUDA.DeviceProperties -> String -> [(String, ByteString)] -> LLVM.Module -> IO CUDA.Module
compileModuleNVVM dev name libdevice mdl = do
_debug <- Debug.queryFlag Debug.debug_cc
let arch = CUDA.computeCapability dev
verbose = if _debug then [ NVVM.GenerateDebugInfo ] else []
flags = NVVM.Target arch : verbose
header = case bitSize (undefined::Int) of
32 -> "target triple = \"nvptx-nvidia-cuda\"\ntarget datalayout = \"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\""
64 -> "target triple = \"nvptx64-nvidia-cuda\"\ntarget datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\""
_ -> $internalError "compileModuleNVVM" "I don't know what architecture I am"
Debug.when Debug.dump_cc $ do
Debug.when Debug.verbose $ do
ll <- LLVM.moduleLLVMAssembly mdl
Debug.traceIO Debug.verbose ll
bc <- LLVM.moduleBitcode mdl
ptx <- NVVM.compileModules (("",header) : (name,bc) : libdevice) flags
unless (B.null (NVVM.compileLog ptx)) $ do
Debug.traceIO Debug.dump_cc $ "llvm: " ++ B.unpack (NVVM.compileLog ptx)
linkPTX name (NVVM.compileResult ptx)
#else
compileModuleNVPTX :: CUDA.DeviceProperties -> String -> LLVM.Module -> IO CUDA.Module
compileModuleNVPTX dev name mdl =
withPTXTargetMachine dev $ \nvptx -> do
let pss = LLVM.defaultCuratedPassSetSpec { LLVM.optLevel = Just 3 }
runError e = either ($internalError "compileModuleNVPTX") id `fmap` runExceptT e
LLVM.withPassManager pss $ \pm -> do
#ifdef ACCELERATE_INTERNAL_CHECKS
runError $ LLVM.verify mdl
#endif
b1 <- LLVM.runPassManager pm mdl
Debug.when Debug.dump_cc $ do
Debug.traceIO Debug.dump_cc $ printf "llvm: optimisation did work? %s" (show b1)
Debug.traceIO Debug.verbose =<< LLVM.moduleLLVMAssembly mdl
ptx <- runError (LLVM.moduleTargetAssembly nvptx mdl)
linkPTX name (B.pack ptx)
#endif
linkPTX :: String -> ByteString -> IO CUDA.Module
linkPTX name ptx = do
_verbose <- Debug.queryFlag Debug.verbose
_debug <- Debug.queryFlag Debug.debug_cc
let v = if _verbose then [ CUDA.Verbose ] else []
d = if _debug then [ CUDA.GenerateDebugInfo, CUDA.GenerateLineInfo ] else []
flags = concat [v,d]
Debug.when (Debug.dump_asm) $
Debug.traceIO Debug.verbose (B.unpack ptx)
jit <- CUDA.loadDataEx ptx flags
Debug.traceIO Debug.dump_asm $
printf "ptx: compiled entry function \"%s\" in %s\n%s"
name
(Debug.showFFloatSIBase (Just 2) 1000 (CUDA.jitTime jit / 1000) "s")
(B.unpack (CUDA.jitInfoLog jit))
return $! CUDA.jitModule jit
linkFunction
:: CUDA.Module
-> String
-> LaunchConfig
-> IO Kernel
linkFunction mdl name configure = do
f <- CUDA.getFun mdl name
regs <- CUDA.requires f CUDA.NumRegs
ssmem <- CUDA.requires f CUDA.SharedSizeBytes
cmem <- CUDA.requires f CUDA.ConstSizeBytes
lmem <- CUDA.requires f CUDA.LocalSizeBytes
maxt <- CUDA.requires f CUDA.MaxKernelThreadsPerBlock
let
(occ, cta, grid, dsmem) = configure maxt regs ssmem
msg1, msg2 :: String
msg1 = printf "kernel function '%s' used %d registers, %d bytes smem, %d bytes lmem, %d bytes cmem"
name regs (ssmem + dsmem) lmem cmem
msg2 = printf "multiprocessor occupancy %.1f %% : %d threads over %d warps in %d blocks"
(CUDA.occupancy100 occ)
(CUDA.activeThreads occ)
(CUDA.activeWarps occ)
(CUDA.activeThreadBlocks occ)
Debug.traceIO Debug.dump_cc (printf "cc: %s\n ... %s" msg1 msg2)
return $ Kernel f occ dsmem cta grid name