{-# LANGUAGE CPP               #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards   #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.PTX.Compile
-- Copyright   : [2014..2020] The Accelerate Team
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell <trevor.mcdonell@gmail.com>
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.PTX.Compile (

  module Data.Array.Accelerate.LLVM.Compile,
  ObjectR(..),

) where

import Data.Array.Accelerate.AST                                    ( PreOpenAcc )
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Trafo.Delayed

import Data.Array.Accelerate.LLVM.CodeGen
import Data.Array.Accelerate.LLVM.CodeGen.Environment               ( Gamma )
import Data.Array.Accelerate.LLVM.CodeGen.Module                    ( Module(..) )
import Data.Array.Accelerate.LLVM.Compile
import Data.Array.Accelerate.LLVM.Extra
import Data.Array.Accelerate.LLVM.State

import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch
import Data.Array.Accelerate.LLVM.PTX.CodeGen
import Data.Array.Accelerate.LLVM.PTX.Compile.Cache
import Data.Array.Accelerate.LLVM.PTX.Compile.Libdevice
import Data.Array.Accelerate.LLVM.PTX.Foreign                       ( )
import Data.Array.Accelerate.LLVM.PTX.Target
import qualified Data.Array.Accelerate.LLVM.PTX.Debug               as Debug

import Foreign.CUDA.Path
import qualified Foreign.CUDA.Analysis                              as CUDA
import qualified Foreign.NVVM                                       as NVVM

import qualified LLVM.AST                                           as AST
import qualified LLVM.AST.Name                                      as LLVM
import qualified LLVM.Context                                       as LLVM
import qualified LLVM.Module                                        as LLVM
import qualified LLVM.PassManager                                   as LLVM
import qualified LLVM.Target                                        as LLVM
import qualified LLVM.Internal.Module                               as LLVM.Internal
import qualified LLVM.Internal.FFI.LLVMCTypes                       as LLVM.Internal.FFI
import qualified LLVM.Analysis                                      as LLVM

import Control.DeepSeq
import Control.Exception
import Control.Monad.Except
import Control.Monad.State
import Data.ByteString                                              ( ByteString )
import Data.ByteString.Short                                        ( ShortByteString )
import Data.Maybe
import Data.Word
import Foreign.ForeignPtr
import Foreign.Ptr
import Foreign.Storable
import System.Directory
import System.Exit
import System.FilePath
import System.IO
import System.IO.Unsafe
import System.Process
import System.Process.Extra
import Text.Printf                                                  ( printf )
import qualified Data.Map                                           as Map
import qualified Data.ByteString                                    as B
import qualified Data.ByteString.Char8                              as B8
import qualified Data.ByteString.Internal                           as B
import Prelude                                                      as P


instance Compile PTX where
  data ObjectR PTX = ObjectR { ObjectR PTX -> UID
objId     :: {-# UNPACK #-} !UID
                             , ObjectR PTX -> [(ShortByteString, LaunchConfig)]
ptxConfig :: ![(ShortByteString, LaunchConfig)]
                             , ObjectR PTX -> ByteString
objData   :: {- LAZY -} ByteString
                             }
  compileForTarget :: PreOpenAcc DelayedOpenAcc aenv a
-> Gamma aenv -> LLVM PTX (ObjectR PTX)
compileForTarget = PreOpenAcc DelayedOpenAcc aenv a
-> Gamma aenv -> LLVM PTX (ObjectR PTX)
forall aenv a.
HasCallStack =>
PreOpenAcc DelayedOpenAcc aenv a
-> Gamma aenv -> LLVM PTX (ObjectR PTX)
compile


-- | Compile an Accelerate expression to object code.
--
-- This generates the target code together with a list of each kernel function
-- defined in the module paired with its occupancy information.
--
compile :: HasCallStack => PreOpenAcc DelayedOpenAcc aenv a -> Gamma aenv -> LLVM PTX (ObjectR PTX)
compile :: PreOpenAcc DelayedOpenAcc aenv a
-> Gamma aenv -> LLVM PTX (ObjectR PTX)
compile PreOpenAcc DelayedOpenAcc aenv a
pacc Gamma aenv
aenv = do

  -- Generate code for this Acc operation
  --
  DeviceProperties
dev               <- (PTX -> DeviceProperties) -> LLVM PTX DeviceProperties
forall s (m :: * -> *) a. MonadState s m => (s -> a) -> m a
gets PTX -> DeviceProperties
ptxDeviceProperties
  (UID
uid, FilePath
cacheFile)  <- PreOpenAcc DelayedOpenAcc aenv a -> LLVM PTX (UID, FilePath)
forall arch aenv a.
Persistent arch =>
PreOpenAcc DelayedOpenAcc aenv a -> LLVM arch (UID, FilePath)
cacheOfPreOpenAcc PreOpenAcc DelayedOpenAcc aenv a
pacc
  Module Module
ast Map Name (KernelMetadata PTX)
md     <- UID
-> PreOpenAcc DelayedOpenAcc aenv a
-> Gamma aenv
-> LLVM PTX (Module PTX aenv a)
forall arch aenv arrs.
(HasCallStack, Target arch, Skeleton arch, Intrinsic arch,
 Foreign arch) =>
UID
-> PreOpenAcc DelayedOpenAcc aenv arrs
-> Gamma aenv
-> LLVM arch (Module arch aenv arrs)
llvmOfPreOpenAcc UID
uid PreOpenAcc DelayedOpenAcc aenv a
pacc Gamma aenv
aenv
  let config :: [(ShortByteString, LaunchConfig)]
config        = [ (ShortByteString
f,LaunchConfig
x) | (LLVM.Name ShortByteString
f, KM_PTX x) <- Map Name (KernelMetadata PTX) -> [(Name, KernelMetadata PTX)]
forall k a. Map k a -> [(k, a)]
Map.toList Map Name (KernelMetadata PTX)
md ]

  -- Lower the generated LLVM into a CUBIN object code.
  --
  -- The 'objData' field is lazily evaluated since the object code might have
  -- already been loaded into the current context from a different function, in
  -- which case it will be found by the linker cache.
  --
  ByteString
cubin <- IO ByteString -> LLVM PTX ByteString
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO ByteString -> LLVM PTX ByteString)
-> (IO ByteString -> IO ByteString)
-> IO ByteString
-> LLVM PTX ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. IO ByteString -> IO ByteString
forall a. IO a -> IO a
unsafeInterleaveIO (IO ByteString -> LLVM PTX ByteString)
-> IO ByteString -> LLVM PTX ByteString
forall a b. (a -> b) -> a -> b
$ do
    Bool
exists <- FilePath -> IO Bool
doesFileExist FilePath
cacheFile
    Bool
recomp <- if Bool
Debug.debuggingIsEnabled then Flag -> IO Bool
Debug.getFlag Flag
Debug.force_recomp else Bool -> IO Bool
forall (m :: * -> *) a. Monad m => a -> m a
return Bool
False
    if Bool
exists Bool -> Bool -> Bool
&& Bool -> Bool
not Bool
recomp
      then do
        Flag -> FilePath -> IO ()
Debug.traceIO Flag
Debug.dump_cc (FilePath -> FilePath -> FilePath
forall r. PrintfType r => FilePath -> r
printf FilePath
"cc: found cached object code %s" (UID -> FilePath
forall a. Show a => a -> FilePath
show UID
uid))
        FilePath -> IO ByteString
B.readFile FilePath
cacheFile

      else
        (Context -> IO ByteString) -> IO ByteString
forall a. (Context -> IO a) -> IO a
LLVM.withContext ((Context -> IO ByteString) -> IO ByteString)
-> (Context -> IO ByteString) -> IO ByteString
forall a b. (a -> b) -> a -> b
$ \Context
ctx -> do
          ByteString
ptx   <- DeviceProperties -> Context -> Module -> IO ByteString
compilePTX DeviceProperties
dev Context
ctx Module
ast
          ByteString
cubin <- HasCallStack =>
DeviceProperties -> FilePath -> ByteString -> IO ByteString
DeviceProperties -> FilePath -> ByteString -> IO ByteString
compileCUBIN DeviceProperties
dev FilePath
cacheFile ByteString
ptx
          ByteString -> IO ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
cubin

  ObjectR PTX -> LLVM PTX (ObjectR PTX)
forall (m :: * -> *) a. Monad m => a -> m a
return (ObjectR PTX -> LLVM PTX (ObjectR PTX))
-> ObjectR PTX -> LLVM PTX (ObjectR PTX)
forall a b. (a -> b) -> a -> b
$! UID
-> [(ShortByteString, LaunchConfig)] -> ByteString -> ObjectR PTX
ObjectR UID
uid [(ShortByteString, LaunchConfig)]
config ByteString
cubin


-- | Compile the LLVM module to PTX assembly. This is done either by the
-- closed-source libNVVM library, or via the standard NVPTX backend (which is
-- the default).
--
compilePTX :: CUDA.DeviceProperties -> LLVM.Context -> AST.Module -> IO ByteString
compilePTX :: DeviceProperties -> Context -> Module -> IO ByteString
compilePTX DeviceProperties
dev Context
ctx Module
ast = do
#ifdef ACCELERATE_USE_NVVM
  ptx <- withLibdeviceNVVM  dev ctx ast (_compileModuleNVVM dev (AST.moduleName ast))
#else
  ByteString
ptx <- DeviceProperties
-> Context -> Module -> (Module -> IO ByteString) -> IO ByteString
forall a.
DeviceProperties -> Context -> Module -> (Module -> IO a) -> IO a
withLibdeviceNVPTX DeviceProperties
dev Context
ctx Module
ast (DeviceProperties -> Module -> IO ByteString
_compileModuleNVPTX DeviceProperties
dev)
#endif
  Flag -> IO () -> IO ()
forall (m :: * -> *). MonadIO m => Flag -> m () -> m ()
Debug.when Flag
Debug.dump_asm (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ Flag -> FilePath -> IO ()
Debug.traceIO Flag
Debug.verbose (ByteString -> FilePath
B8.unpack ByteString
ptx)
  ByteString -> IO ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
ptx


-- | Compile the given PTX assembly to a CUBIN file (SASS object code). The
-- compiled code will be stored at the given FilePath.
--
compileCUBIN :: HasCallStack => CUDA.DeviceProperties -> FilePath -> ByteString -> IO ByteString
compileCUBIN :: DeviceProperties -> FilePath -> ByteString -> IO ByteString
compileCUBIN DeviceProperties
dev FilePath
sass ByteString
ptx = do
  Bool
_verbose  <- if Bool
Debug.debuggingIsEnabled then Flag -> IO Bool
Debug.getFlag Flag
Debug.verbose else Bool -> IO Bool
forall (m :: * -> *) a. Monad m => a -> m a
return Bool
False
  Bool
_debug    <- if Bool
Debug.debuggingIsEnabled then Flag -> IO Bool
Debug.getFlag Flag
Debug.debug   else Bool -> IO Bool
forall (m :: * -> *) a. Monad m => a -> m a
return Bool
False
  --
  let verboseFlag :: [FilePath]
verboseFlag       = if Bool
_verbose then [ FilePath
"-v" ]              else []
      debugFlag :: [FilePath]
debugFlag         = if Bool
_debug   then [ FilePath
"-g", FilePath
"-lineinfo" ] else []
      arch :: FilePath
arch              = FilePath -> Int -> Int -> FilePath
forall r. PrintfType r => FilePath -> r
printf FilePath
"-arch=sm_%d%d" Int
m Int
n
      CUDA.Compute Int
m Int
n  = DeviceProperties -> Compute
CUDA.computeCapability DeviceProperties
dev
      flags :: [FilePath]
flags             = FilePath
"-" FilePath -> [FilePath] -> [FilePath]
forall a. a -> [a] -> [a]
: FilePath
"-o" FilePath -> [FilePath] -> [FilePath]
forall a. a -> [a] -> [a]
: FilePath
sass FilePath -> [FilePath] -> [FilePath]
forall a. a -> [a] -> [a]
: FilePath
arch FilePath -> [FilePath] -> [FilePath]
forall a. a -> [a] -> [a]
: [FilePath]
verboseFlag [FilePath] -> [FilePath] -> [FilePath]
forall a. [a] -> [a] -> [a]
++ [FilePath]
debugFlag
      --
      cp :: CreateProcess
cp = (FilePath -> [FilePath] -> CreateProcess
proc (FilePath
cudaBinPath FilePath -> FilePath -> FilePath
</> FilePath
"ptxas") [FilePath]
flags)
            { std_in :: StdStream
std_in  = StdStream
CreatePipe
            , std_out :: StdStream
std_out = StdStream
NoStream
            , std_err :: StdStream
std_err = StdStream
CreatePipe
            }

  -- Invoke the 'ptxas' executable to compile the generated PTX into SASS (GPU
  -- object code). The output is written directly to the final cache location.
  --
  CreateProcess
-> (Maybe Handle
    -> Maybe Handle -> Maybe Handle -> ProcessHandle -> IO ())
-> IO ()
forall a.
CreateProcess
-> (Maybe Handle
    -> Maybe Handle -> Maybe Handle -> ProcessHandle -> IO a)
-> IO a
withCreateProcess CreateProcess
cp ((Maybe Handle
  -> Maybe Handle -> Maybe Handle -> ProcessHandle -> IO ())
 -> IO ())
-> (Maybe Handle
    -> Maybe Handle -> Maybe Handle -> ProcessHandle -> IO ())
-> IO ()
forall a b. (a -> b) -> a -> b
$ \(Just Handle
inh) Maybe Handle
Nothing (Just Handle
errh) ProcessHandle
ph -> do

    -- fork off a thread to start consuming stderr
    FilePath
info <- Handle -> IO FilePath
hGetContents Handle
errh
    IO () -> (IO () -> IO ()) -> IO ()
forall a. IO () -> (IO () -> IO a) -> IO a
withForkWait (() -> IO ()
forall a. a -> IO a
evaluate (FilePath -> ()
forall a. NFData a => a -> ()
rnf FilePath
info)) ((IO () -> IO ()) -> IO ()) -> (IO () -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \IO ()
waitErr -> do

      -- write the PTX to the input handle
      -- closing the handle performs an implicit flush, thus may trigger SIGPIPE
      IO () -> IO ()
ignoreSIGPIPE (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ Handle -> ByteString -> IO ()
B.hPut Handle
inh ByteString
ptx
      IO () -> IO ()
ignoreSIGPIPE (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ Handle -> IO ()
hClose Handle
inh

      -- wait on the output
      IO ()
waitErr
      Handle -> IO ()
hClose Handle
errh

    -- wait on the process
    ExitCode
ex <- ProcessHandle -> IO ExitCode
waitForProcess ProcessHandle
ph
    case ExitCode
ex of
      ExitFailure Int
r -> FilePath -> IO ()
forall a. HasCallStack => FilePath -> a
internalError (FilePath -> FilePath -> Int -> FilePath -> FilePath
forall r. PrintfType r => FilePath -> r
printf FilePath
"ptxas %s (exit %d)\n%s" ([FilePath] -> FilePath
unwords [FilePath]
flags) Int
r FilePath
info)
      ExitCode
ExitSuccess   -> () -> IO ()
forall (m :: * -> *) a. Monad m => a -> m a
return ()

    Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
_verbose (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$
      Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (FilePath -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null FilePath
info) (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$
        Flag -> FilePath -> IO ()
Debug.traceIO Flag
Debug.dump_cc (FilePath -> FilePath -> FilePath
forall r. PrintfType r => FilePath -> r
printf FilePath
"ptx: compiled entry function(s)\n%s" FilePath
info)

  -- Read back the results
  FilePath -> IO ByteString
B.readFile FilePath
sass


-- Compile and optimise the module to PTX using the (closed source) NVVM
-- library. This _may_ produce faster object code than the LLVM NVPTX compiler.
--
_compileModuleNVVM :: HasCallStack => CUDA.DeviceProperties -> ShortByteString -> [(ShortByteString, ByteString)] -> LLVM.Module -> IO ByteString
_compileModuleNVVM :: DeviceProperties
-> ShortByteString
-> [(ShortByteString, ByteString)]
-> Module
-> IO ByteString
_compileModuleNVVM DeviceProperties
dev ShortByteString
name [(ShortByteString, ByteString)]
libdevice Module
mdl = do
  Bool
_debug <- if Bool
Debug.debuggingIsEnabled then Flag -> IO Bool
Debug.getFlag Flag
Debug.debug else Bool -> IO Bool
forall (m :: * -> *) a. Monad m => a -> m a
return Bool
False
  --
  let arch :: Compute
arch    = DeviceProperties -> Compute
CUDA.computeCapability DeviceProperties
dev
      verbose :: [CompileOption]
verbose = if Bool
_debug then [ CompileOption
NVVM.GenerateDebugInfo ] else []
      flags :: [CompileOption]
flags   = Compute -> CompileOption
NVVM.Target Compute
arch CompileOption -> [CompileOption] -> [CompileOption]
forall a. a -> [a] -> [a]
: [CompileOption]
verbose

      -- Note: [NVVM and target datalayout]
      --
      -- The NVVM library does not correctly parse the target datalayout field,
      -- instead doing a (very dodgy) string compare against exactly two
      -- expected values. This means that it is sensitive to, e.g. the ordering
      -- of the fields, and changes to the representation in each LLVM release.
      --
      -- We get around this by only specifying the data layout in a separate
      -- (otherwise empty) module that we additionally link against.
      --
      header :: ByteString
header  = case Int -> Word32
forall a. (HasCallStack, Bits a) => a -> Word32
bitSize (Int
forall a. HasCallStack => a
undefined::Int) of
                  Word32
32 -> ByteString
"target triple = \"nvptx-nvidia-cuda\"\ntarget datalayout = \"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\""
                  Word32
64 -> ByteString
"target triple = \"nvptx64-nvidia-cuda\"\ntarget datalayout = \"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64\""
                  Word32
_  -> FilePath -> ByteString
forall a. HasCallStack => FilePath -> a
internalError FilePath
"I don't know what architecture I am"

  Flag -> IO () -> IO ()
forall (m :: * -> *). MonadIO m => Flag -> m () -> m ()
Debug.when Flag
Debug.dump_cc   (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ do
    Flag -> IO () -> IO ()
forall (m :: * -> *). MonadIO m => Flag -> m () -> m ()
Debug.when Flag
Debug.verbose (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ do
      ByteString
ll <- Module -> IO ByteString
LLVM.moduleLLVMAssembly Module
mdl -- TLM: unfortunate to do the lowering twice in debug mode
      Flag -> FilePath -> IO ()
Debug.traceIO Flag
Debug.verbose (ByteString -> FilePath
B8.unpack ByteString
ll)

  -- Lower the generated module to bitcode, then compile and link together with
  -- the shim header and libdevice library (if necessary)
  ByteString
bc  <- Module -> IO ByteString
LLVM.moduleBitcode Module
mdl
  Result
ptx <- [(ShortByteString, ByteString)] -> [CompileOption] -> IO Result
NVVM.compileModules ((ShortByteString
"",ByteString
header) (ShortByteString, ByteString)
-> [(ShortByteString, ByteString)]
-> [(ShortByteString, ByteString)]
forall a. a -> [a] -> [a]
: (ShortByteString
name,ByteString
bc) (ShortByteString, ByteString)
-> [(ShortByteString, ByteString)]
-> [(ShortByteString, ByteString)]
forall a. a -> [a] -> [a]
: [(ShortByteString, ByteString)]
libdevice) [CompileOption]
flags

  Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless (ByteString -> Bool
B.null (Result -> ByteString
NVVM.compileLog Result
ptx)) (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ do
    Flag -> FilePath -> IO ()
Debug.traceIO Flag
Debug.dump_cc (FilePath -> IO ()) -> FilePath -> IO ()
forall a b. (a -> b) -> a -> b
$ FilePath
"llvm: " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ ByteString -> FilePath
B8.unpack (Result -> ByteString
NVVM.compileLog Result
ptx)

  -- Return the generated binary code
  ByteString -> IO ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return (Result -> ByteString
NVVM.compileResult Result
ptx)


-- Compiling with the NVPTX backend uses LLVM-3.3 and above
--
_compileModuleNVPTX :: CUDA.DeviceProperties -> LLVM.Module -> IO ByteString
_compileModuleNVPTX :: DeviceProperties -> Module -> IO ByteString
_compileModuleNVPTX DeviceProperties
dev Module
mdl =
  DeviceProperties
-> (TargetMachine -> IO ByteString) -> IO ByteString
forall a.
HasCallStack =>
DeviceProperties -> (TargetMachine -> IO a) -> IO a
withPTXTargetMachine DeviceProperties
dev ((TargetMachine -> IO ByteString) -> IO ByteString)
-> (TargetMachine -> IO ByteString) -> IO ByteString
forall a b. (a -> b) -> a -> b
$ \TargetMachine
nvptx -> do

    Bool -> IO () -> IO ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
when Bool
Debug.internalChecksAreEnabled (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ Module -> IO ()
LLVM.verify Module
mdl

    -- Run the standard optimisation pass
    --
    let pss :: PassSetSpec
pss = PassSetSpec
LLVM.defaultCuratedPassSetSpec { optLevel :: Maybe Word
LLVM.optLevel = Word -> Maybe Word
forall a. a -> Maybe a
Just Word
3 }
    PassSetSpec -> (PassManager -> IO ByteString) -> IO ByteString
forall a. PassSetSpec -> (PassManager -> IO a) -> IO a
LLVM.withPassManager PassSetSpec
pss ((PassManager -> IO ByteString) -> IO ByteString)
-> (PassManager -> IO ByteString) -> IO ByteString
forall a b. (a -> b) -> a -> b
$ \PassManager
pm -> do

      Bool
b1 <- PassManager -> Module -> IO Bool
LLVM.runPassManager PassManager
pm Module
mdl

      -- debug printout
      Flag -> IO () -> IO ()
forall (m :: * -> *). MonadIO m => Flag -> m () -> m ()
Debug.when Flag
Debug.dump_cc (IO () -> IO ()) -> IO () -> IO ()
forall a b. (a -> b) -> a -> b
$ do
        Flag -> FilePath -> IO ()
Debug.traceIO Flag
Debug.dump_cc (FilePath -> IO ()) -> FilePath -> IO ()
forall a b. (a -> b) -> a -> b
$ FilePath -> FilePath -> FilePath
forall r. PrintfType r => FilePath -> r
printf FilePath
"llvm: optimisation did work? %s" (Bool -> FilePath
forall a. Show a => a -> FilePath
show Bool
b1)
        Flag -> FilePath -> IO ()
Debug.traceIO Flag
Debug.verbose (FilePath -> IO ())
-> (ByteString -> FilePath) -> ByteString -> IO ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> FilePath
B8.unpack (ByteString -> IO ()) -> IO ByteString -> IO ()
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< Module -> IO ByteString
LLVM.moduleLLVMAssembly Module
mdl

      -- Lower the LLVM module into target assembly (PTX)
      TargetMachine -> Module -> IO ByteString
moduleTargetAssembly TargetMachine
nvptx Module
mdl


-- | Produce target specific assembly as a 'ByteString'.
--
moduleTargetAssembly :: LLVM.TargetMachine -> LLVM.Module -> IO ByteString
moduleTargetAssembly :: TargetMachine -> Module -> IO ByteString
moduleTargetAssembly TargetMachine
tm Module
m = ByteString -> IO ByteString
unsafe0 (ByteString -> IO ByteString) -> IO ByteString -> IO ByteString
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< CodeGenFileType -> TargetMachine -> Module -> IO ByteString
LLVM.Internal.emitToByteString CodeGenFileType
LLVM.Internal.FFI.codeGenFileTypeAssembly TargetMachine
tm Module
m
  where
    -- Ensure that the ByteString is NULL-terminated, so that it can be passed
    -- directly to C. This will unsafely mutate the underlying ForeignPtr if the
    -- string is not NULL-terminated but the last character is a whitespace
    -- character (there are usually a few blank lines at the end).
    --
    unsafe0 :: ByteString -> IO ByteString
    unsafe0 :: ByteString -> IO ByteString
unsafe0 bs :: ByteString
bs@(B.PS ForeignPtr Word8
fp Int
s Int
l) =
      IO ByteString -> IO ByteString
forall (m :: * -> *) a. MonadIO m => IO a -> m a
liftIO (IO ByteString -> IO ByteString)
-> ((Ptr Word8 -> IO ByteString) -> IO ByteString)
-> (Ptr Word8 -> IO ByteString)
-> IO ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ForeignPtr Word8 -> (Ptr Word8 -> IO ByteString) -> IO ByteString
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
fp ((Ptr Word8 -> IO ByteString) -> IO ByteString)
-> (Ptr Word8 -> IO ByteString) -> IO ByteString
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
p -> do
        let p' :: Ptr Word8
            p' :: Ptr Word8
p' = Ptr Word8
p Ptr Word8 -> Int -> Ptr Word8
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` (Int
sInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
lInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1)
        --
        Word8
x <- Ptr Word8 -> IO Word8
forall a. Storable a => Ptr a -> IO a
peek Ptr Word8
p'
        case Word8
x of
          Word8
0                    -> ByteString -> IO ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
bs
          Word8
_ | Word8 -> Bool
B.isSpaceWord8 Word8
x -> Ptr Word8 -> Word8 -> IO ()
forall a. Storable a => Ptr a -> a -> IO ()
poke Ptr Word8
p' Word8
0 IO () -> IO ByteString -> IO ByteString
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ByteString -> IO ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return ByteString
bs
          Word8
_                    -> ByteString -> IO ByteString
forall (m :: * -> *) a. Monad m => a -> m a
return (ByteString -> Word8 -> ByteString
B.snoc ByteString
bs Word8
0)