{-# LANGUAGE CPP #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Data.Array.Accelerate.LLVM.PTX.Compile (
module Data.Array.Accelerate.LLVM.Compile,
ObjectR(..),
) where
import qualified LLVM.AST as AST
import qualified LLVM.AST.Name as LLVM
import qualified LLVM.Context as LLVM
import qualified LLVM.Module as LLVM
import qualified LLVM.PassManager as LLVM
import qualified LLVM.Target as LLVM
import qualified LLVM.Internal.Module as LLVM.Internal
import qualified LLVM.Internal.FFI.LLVMCTypes as LLVM.Internal.FFI
import qualified LLVM.Analysis as LLVM
import Data.Array.Accelerate.Error ( internalError )
import Data.Array.Accelerate.Trafo ( DelayedOpenAcc )
import Data.Array.Accelerate.LLVM.CodeGen
import Data.Array.Accelerate.LLVM.CodeGen.Environment ( Gamma )
import Data.Array.Accelerate.LLVM.CodeGen.Module ( Module(..) )
import Data.Array.Accelerate.LLVM.Compile
import Data.Array.Accelerate.LLVM.State
import Data.Array.Accelerate.LLVM.Util
import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch
import Data.Array.Accelerate.LLVM.PTX.CodeGen
import Data.Array.Accelerate.LLVM.PTX.Compile.Cache
import Data.Array.Accelerate.LLVM.PTX.Compile.Libdevice
import Data.Array.Accelerate.LLVM.PTX.Foreign ( )
import Data.Array.Accelerate.LLVM.PTX.Target
import qualified Data.Array.Accelerate.LLVM.PTX.Debug as Debug
import Foreign.CUDA.Path
import qualified Foreign.CUDA.Analysis as CUDA
import qualified Foreign.NVVM as NVVM
import Control.DeepSeq
import Control.Exception
import Control.Monad.Except
import Control.Monad.State
import Data.ByteString ( ByteString )
import Data.ByteString.Short ( ShortByteString )
import Data.Maybe
import Data.Word
import Foreign.ForeignPtr
import Foreign.Ptr
import Foreign.Storable
import System.Directory
import System.Exit
import System.FilePath
import System.IO
import System.IO.Unsafe
import System.Process
import System.Process.Extra
import Text.Printf ( printf )
import qualified Data.Map as Map
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Short.Char8 as S8
import Prelude as P
instance Compile PTX where
data ObjectR PTX = ObjectR { objId :: {-# UNPACK #-} !UID
, ptxConfig :: ![(ShortByteString, LaunchConfig)]
, objData :: ByteString
}
compileForTarget = compile
compile :: DelayedOpenAcc aenv a -> Gamma aenv -> LLVM PTX (ObjectR PTX)
compile acc aenv = do
target <- gets llvmTarget
(uid, cacheFile) <- cacheOfOpenAcc acc
let Module ast md = llvmOfOpenAcc target uid acc aenv
dev = ptxDeviceProperties target
config = [ (f,x) | (LLVM.Name f, KM_PTX x) <- Map.toList md ]
cubin <- liftIO . unsafeInterleaveIO $ do
exists <- doesFileExist cacheFile
recomp <- if Debug.debuggingIsEnabled then Debug.getFlag Debug.force_recomp else return False
if exists && not recomp
then do
Debug.traceIO Debug.dump_cc (printf "cc: found cached object code %s" (show uid))
B.readFile cacheFile
else
LLVM.withContext $ \ctx -> do
ptx <- compilePTX dev ctx ast
cubin <- compileCUBIN dev cacheFile ptx
return cubin
return $! ObjectR uid config cubin
compilePTX :: CUDA.DeviceProperties -> LLVM.Context -> AST.Module -> IO ByteString
compilePTX dev ctx ast = do
#ifdef ACCELERATE_USE_NVVM
ptx <- withLibdeviceNVVM dev ctx ast (compileModuleNVVM dev (AST.moduleName ast))
#else
ptx <- withLibdeviceNVPTX dev ctx ast (compileModuleNVPTX dev)
#endif
Debug.when Debug.dump_asm $ Debug.traceIO Debug.verbose (B8.unpack ptx)
return ptx
compileCUBIN :: CUDA.DeviceProperties -> FilePath -> ByteString -> IO ByteString
compileCUBIN dev sass ptx = do
_verbose <- if Debug.debuggingIsEnabled then Debug.getFlag Debug.verbose else return False
_debug <- if Debug.debuggingIsEnabled then Debug.getFlag Debug.debug else return False
let verboseFlag = if _verbose then [ "-v" ] else []
debugFlag = if _debug then [ "-g", "-lineinfo" ] else []
arch = printf "-arch=sm_%d%d" m n
CUDA.Compute m n = CUDA.computeCapability dev
flags = "-" : "-o" : sass : arch : verboseFlag ++ debugFlag
cp = (proc (cudaBinPath </> "ptxas") flags)
{ std_in = CreatePipe
, std_out = NoStream
, std_err = CreatePipe
}
withCreateProcess cp $ \(Just inh) Nothing (Just errh) ph -> do
info <- hGetContents errh
withForkWait (evaluate (rnf info)) $ \waitErr -> do
ignoreSIGPIPE $ B.hPut inh ptx
ignoreSIGPIPE $ hClose inh
waitErr
hClose errh
ex <- waitForProcess ph
case ex of
ExitFailure r -> $internalError "compile" (printf "ptxas %s (exit %d)\n%s" (unwords flags) r info)
ExitSuccess -> return ()
when _verbose $
unless (null info) $
Debug.traceIO Debug.dump_cc (printf "ptx: compiled entry function(s)\n%s" info)
B.readFile sass
compileModuleNVVM :: CUDA.DeviceProperties -> ShortByteString -> [(String, ByteString)] -> LLVM.Module -> IO ByteString
compileModuleNVVM dev name libdevice mdl = do
_debug <- if Debug.debuggingIsEnabled then Debug.getFlag Debug.debug else return False
let arch = CUDA.computeCapability dev
verbose = if _debug then [ NVVM.GenerateDebugInfo ] else []
flags = NVVM.Target arch : verbose
header = case bitSize (undefined::Int) of
32 -> "target triple = \"nvptx-nvidia-cuda\"\ntarget datalayout = \"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\""
64 -> "target triple = \"nvptx64-nvidia-cuda\"\ntarget datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\""
_ -> $internalError "compileModuleNVVM" "I don't know what architecture I am"
Debug.when Debug.dump_cc $ do
Debug.when Debug.verbose $ do
ll <- LLVM.moduleLLVMAssembly mdl
Debug.traceIO Debug.verbose (B8.unpack ll)
bc <- LLVM.moduleBitcode mdl
#if MIN_VERSION_nvvm(0,9,0)
ptx <- NVVM.compileModules (("",header) : (name,bc) : fmap (\(n,b) -> (S8.pack n, b)) libdevice) flags
#else
ptx <- NVVM.compileModules (("",header) : (S8.unpack name,bc) : libdevice) flags
#endif
unless (B.null (NVVM.compileLog ptx)) $ do
Debug.traceIO Debug.dump_cc $ "llvm: " ++ B8.unpack (NVVM.compileLog ptx)
return (NVVM.compileResult ptx)
compileModuleNVPTX :: CUDA.DeviceProperties -> LLVM.Module -> IO ByteString
compileModuleNVPTX dev mdl =
withPTXTargetMachine dev $ \nvptx -> do
when Debug.internalChecksAreEnabled $ LLVM.verify mdl
let pss = LLVM.defaultCuratedPassSetSpec { LLVM.optLevel = Just 3 }
LLVM.withPassManager pss $ \pm -> do
b1 <- LLVM.runPassManager pm mdl
Debug.when Debug.dump_cc $ do
Debug.traceIO Debug.dump_cc $ printf "llvm: optimisation did work? %s" (show b1)
Debug.traceIO Debug.verbose . B8.unpack =<< LLVM.moduleLLVMAssembly mdl
moduleTargetAssembly nvptx mdl
moduleTargetAssembly :: LLVM.TargetMachine -> LLVM.Module -> IO ByteString
moduleTargetAssembly tm m = unsafe0 =<< LLVM.Internal.emitToByteString LLVM.Internal.FFI.codeGenFileTypeAssembly tm m
where
unsafe0 :: ByteString -> IO ByteString
unsafe0 bs@(B.PS fp s l) =
liftIO . withForeignPtr fp $ \p -> do
let p' :: Ptr Word8
p' = p `plusPtr` (s+l-1)
x <- peek p'
case x of
0 -> return bs
_ | B.isSpaceWord8 x -> poke p' 0 >> return bs
_ -> return (B.snoc bs 0)