{-# LANGUAGE CPP                 #-}
{-# LANGUAGE GADTs               #-}
{-# LANGUAGE PatternGuards       #-}
{-# LANGUAGE RecordWildCards     #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
{-# LANGUAGE TupleSections       #-}
-- |
-- Module      : Data.Array.Accelerate.CUDA.Compile
-- Copyright   : [2008..2014] Manuel M T Chakravarty, Gabriele Keller
--               [2009..2014] Trevor L. McDonell
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.CUDA.Compile (

  -- * generate and compile kernels to realise a computation
  compileAcc, compileAfun,

) where

-- friends
import Data.Array.Accelerate.Error
import Data.Array.Accelerate.Lifetime
import Data.Array.Accelerate.Trafo
import Data.Array.Accelerate.CUDA.AST
import Data.Array.Accelerate.CUDA.State
import Data.Array.Accelerate.CUDA.Context
import Data.Array.Accelerate.CUDA.CodeGen
import Data.Array.Accelerate.CUDA.Array.Sugar
import Data.Array.Accelerate.CUDA.Analysis.Launch
import Data.Array.Accelerate.CUDA.Foreign.Import                ( canExecuteAcc, canExecuteExp )
import Data.Array.Accelerate.CUDA.Persistent                    as KT
import qualified Data.Array.Accelerate.FullList                 as FL
import qualified Data.Array.Accelerate.CUDA.Debug               as D

-- libraries
import Numeric
import Control.Applicative                                      hiding ( Const )
import Control.Exception
import Control.Monad
import Control.Monad.Reader                                     ( asks )
import Control.Monad.State                                      ( gets )
import Control.Monad.Trans                                      ( liftIO, MonadIO )
import Control.Concurrent
import Crypto.Hash.MD5                                          ( hashlazy )
import Data.List                                                ( intercalate )
import Data.Bits
import Data.Maybe
import Data.Monoid
import System.Directory
import System.Exit                                              ( ExitCode(..) )
import System.FilePath
import System.IO
import System.IO.Error
import System.IO.Unsafe
import System.Process
import Text.PrettyPrint.Mainland                                ( ppr, renderCompact, displayLazyText )
import qualified Data.ByteString                                as B
import qualified Data.Text.Lazy                                 as T
import qualified Data.Text.Lazy.IO                              as T
import qualified Data.Text.Lazy.Encoding                        as T
import qualified Control.Concurrent.MSem                        as Q
import qualified Foreign.CUDA.Driver                            as CUDA
import qualified Foreign.CUDA.Analysis                          as CUDA
import Prelude                                                  hiding ( exp, scanl, scanr )

import GHC.Conc                                                 ( getNumProcessors )

#ifdef ACCELERATE_DEBUG
import System.Time
#endif

-- Multiplatform support for dealing with external process spawning
#if   defined(UNIX)
import System.Posix.Process
#elif defined(WIN32)
import System.Win32.Process                                     hiding ( ProcessHandle )
#else
#error "I don't know what operating system I am"
#endif

import Paths_accelerate_cuda                                    ( getDataDir )


-- | Initiate code generation, compilation, and data transfer for an array
-- expression. The returned array computation is annotated so to be suitable for
-- execution in the CUDA environment. This includes:
--
--   * list of array variables embedded within scalar expressions
--
--   * kernel object(s) required to executed the kernel
--
compileAcc :: DelayedAcc a -> CIO (ExecAcc a)
compileAcc = compileOpenAcc

compileAfun :: DelayedAfun f -> CIO (ExecAfun f)
compileAfun = compileOpenAfun


compileOpenAfun :: DelayedOpenAfun aenv f -> CIO (PreOpenAfun ExecOpenAcc aenv f)
compileOpenAfun (Alam l)  = Alam  <$> compileOpenAfun l
compileOpenAfun (Abody b) = Abody <$> compileOpenAcc b


compileOpenAcc :: DelayedOpenAcc aenv a -> CIO (ExecOpenAcc aenv a)
compileOpenAcc = traverseAcc
  where
    -- Traverse an open array expression in depth-first order. The top-level
    -- function traverseAcc is intended for manifest arrays that we will
    -- generate CUDA code for. Array valued subterms, which might be manifest or
    -- delayed, are handled separately.
    --
    -- The applicative combinators are used to gloss over that we are passing
    -- around the AST nodes together with a set of free variable indices that
    -- are merged at every step.
    --
    traverseAcc :: forall aenv arrs. DelayedOpenAcc aenv arrs -> CIO (ExecOpenAcc aenv arrs)
    traverseAcc Delayed{} = $internalError "compileOpenAcc" "unexpected delayed array"
    traverseAcc topAcc@(Manifest pacc) =
      case pacc of
        -- Environment and control flow
        Avar ix                 -> node $ pure (Avar ix)
        Alet a b                -> node . pure =<< Alet         <$> traverseAcc a <*> traverseAcc b
        Apply f a               -> node =<< liftA2 Apply        <$> travAF f <*> travA a
        Awhile p f a            -> node =<< liftA3 Awhile       <$> travAF p <*> travAF f <*> travA a
        Acond p t e             -> node =<< liftA3 Acond        <$> travE  p <*> travA  t <*> travA e
        Atuple tup              -> node =<< liftA Atuple        <$> travAtup tup
        Aprj ix tup             -> node =<< liftA (Aprj ix)     <$> travA    tup

        -- Foreign
        Aforeign ff afun a      -> foreignA ff afun a

        -- Array injection
        Unit e                  -> node =<< liftA  Unit         <$> travE e
        Use arrs                -> use (arrays (undefined::arrs)) arrs >> node (pure $ Use arrs)

        -- Index space transforms
        Reshape s a             -> node =<< liftA2 Reshape              <$> travE s <*> travA a
        Replicate slix e a      -> exec =<< liftA2 (Replicate slix)     <$> travE e <*> travA a
        Slice slix a e          -> exec =<< liftA2 (Slice slix)         <$> travA a <*> travE e
        Backpermute e f a       -> exec =<< liftA3 Backpermute          <$> travE e <*> travF f <*> travA a

        -- Producers
        Generate e f            -> exec =<< liftA2 Generate             <$> travE e <*> travF f
        Map f a                 -> exec =<< liftA2 Map                  <$> travF f <*> travA a
        ZipWith f a b           -> exec =<< liftA3 ZipWith              <$> travF f <*> travA a <*> travA b
        Transform e p f a       -> exec =<< liftA4 Transform            <$> travE e <*> travF p <*> travF f <*> travA a

        -- Consumers
        Fold f z a              -> exec =<< liftA3 Fold                 <$> travF f <*> travE z <*> travA a
        Fold1 f a               -> exec =<< liftA2 Fold1                <$> travF f <*> travA a
        FoldSeg f e a s         -> exec =<< liftA4 FoldSeg              <$> travF f <*> travE e <*> travA a <*> travA s
        Fold1Seg f a s          -> exec =<< liftA3 Fold1Seg             <$> travF f <*> travA a <*> travA s
        Scanl f e a             -> exec =<< liftA3 Scanl                <$> travF f <*> travE e <*> travA a
        Scanl' f e a            -> exec =<< liftA3 Scanl'               <$> travF f <*> travE e <*> travA a
        Scanl1 f a              -> exec =<< liftA2 Scanl1               <$> travF f <*> travA a
        Scanr f e a             -> exec =<< liftA3 Scanr                <$> travF f <*> travE e <*> travA a
        Scanr' f e a            -> exec =<< liftA3 Scanr'               <$> travF f <*> travE e <*> travA a
        Scanr1 f a              -> exec =<< liftA2 Scanr1               <$> travF f <*> travA a
        Permute f d g a         -> exec =<< liftA4 Permute              <$> travF f <*> travA d <*> travF g <*> travA a
        Stencil f b a           -> exec =<< liftA2 (flip Stencil b)     <$> travF f <*> travA a
        Stencil2 f b1 a1 b2 a2  -> exec =<< liftA3 stencil2             <$> travF f <*> travA a1 <*> travA a2
          where stencil2 f' a1' a2' = Stencil2 f' b1 a1' b2 a2'

        -- Loops
        -- Collect l               -> ExecSeq <$> compileOpenSeq l

      where
        use :: ArraysR a -> a -> CIO ()
        use ArraysRunit         ()       = return ()
        use ArraysRarray        arr      = useArrayAsync arr Nothing
        use (ArraysRpair r1 r2) (a1, a2) = use r1 a1 >> use r2 a2

        exec :: (Free aenv, PreOpenAcc ExecOpenAcc aenv arrs) -> CIO (ExecOpenAcc aenv arrs)
        exec (aenv, eacc) = do
          let gamma = makeEnvMap aenv
          kernel <- build topAcc gamma
          return $! ExecAcc (fullOfList kernel) gamma eacc

        node :: (Free aenv', PreOpenAcc ExecOpenAcc aenv' arrs') -> CIO (ExecOpenAcc aenv' arrs')
        node = fmap snd . wrap

        wrap :: (Free aenv', PreOpenAcc ExecOpenAcc aenv' arrs') -> CIO (Free aenv', ExecOpenAcc aenv' arrs')
        wrap = return . liftA (ExecAcc noKernel mempty)

        travA :: DelayedOpenAcc aenv a -> CIO (Free aenv, ExecOpenAcc aenv a)
        travA acc = case acc of
          Manifest{}    -> pure                    <$> traverseAcc acc
          Delayed{..}   -> liftA2 (const EmbedAcc) <$> travF indexD <*> travE extentD

        travAF :: DelayedOpenAfun aenv f -> CIO (Free aenv, PreOpenAfun ExecOpenAcc aenv f)
        travAF afun = pure <$> compileOpenAfun afun

        travAtup :: Atuple (DelayedOpenAcc aenv) a -> CIO (Free aenv, Atuple (ExecOpenAcc aenv) a)
        travAtup NilAtup        = return (pure NilAtup)
        travAtup (SnocAtup t a) = liftA2 SnocAtup <$> travAtup t <*> travA a

        travE :: DelayedOpenExp env aenv e
              -> CIO (Free aenv, PreOpenExp ExecOpenAcc env aenv e)
        travE = compileOpenExp

        travF :: DelayedOpenFun env aenv t -> CIO (Free aenv, PreOpenFun ExecOpenAcc env aenv t)
        travF (Body b)  = liftA Body <$> travE b
        travF (Lam  f)  = liftA Lam  <$> travF f

        noKernel :: FL.FullList () (AccKernel a)
        noKernel =  FL.FL () ($internalError "compile" "no kernel module for this node") FL.Nil

        fullOfList :: [a] -> FL.FullList () a
        fullOfList []       = $internalError "fullList" "empty list"
        fullOfList [x]      = FL.singleton () x
        fullOfList (x:xs)   = FL.cons () x (fullOfList xs)

        -- If the foreign function targets this backend, drop the remaining
        -- alternatives from the AST. Similarly, we drop the foreign node if it
        -- does not target this backend.
        --
        foreignA :: (Arrays as, Arrays bs, Foreign asm)
                 => asm         (as -> bs)
                 -> DelayedAfun (as -> bs)
                 -> DelayedOpenAcc aenv as
                 -> CIO (ExecOpenAcc aenv bs)
        foreignA ff afun a =
          case canExecuteAcc ff of
            Nothing -> traverseAcc $ Manifest (Apply (weaken absurd afun) a)
            Just{}  -> node =<< liftA (Aforeign ff err) <$> travA a
            where
              absurd :: Idx () t -> Idx env t
              absurd = absurd
              err    = $internalError "Aforeign" "failed to recover foreign function a second time"

-- Traverse a scalar expression
--
compileOpenExp
    :: DelayedOpenExp env aenv e
    -> CIO (Free aenv, PreOpenExp ExecOpenAcc env aenv e)
compileOpenExp topExp =
  case topExp of
    Var ix                  -> return $ pure (Var ix)
    Const c                 -> return $ pure (Const c)
    PrimConst c             -> return $ pure (PrimConst c)
    IndexAny                -> return $ pure IndexAny
    IndexNil                -> return $ pure IndexNil
    Foreign ff f x          -> foreignE ff f x
    --
    Let a b                 -> liftA2 Let                   <$> travE a <*> travE b
    IndexCons t h           -> liftA2 IndexCons             <$> travE t <*> travE h
    IndexHead h             -> liftA  IndexHead             <$> travE h
    IndexTail t             -> liftA  IndexTail             <$> travE t
    IndexSlice slix x s     -> liftA2 (IndexSlice slix)     <$> travE x <*> travE s
    IndexFull slix x s      -> liftA2 (IndexFull slix)      <$> travE x <*> travE s
    ToIndex s i             -> liftA2 ToIndex               <$> travE s <*> travE i
    FromIndex s i           -> liftA2 FromIndex             <$> travE s <*> travE i
    Tuple t                 -> liftA  Tuple                 <$> travT t
    Prj ix e                -> liftA  (Prj ix)              <$> travE e
    Cond p t e              -> liftA3 Cond                  <$> travE p <*> travE t <*> travE e
    While p f x             -> liftA3 While                 <$> travF p <*> travF f <*> travE x
    PrimApp f e             -> liftA  (PrimApp f)           <$> travE e
    Index a e               -> liftA2 Index                 <$> travA a <*> travE e
    LinearIndex a e         -> liftA2 LinearIndex           <$> travA a <*> travE e
    Shape a                 -> liftA  Shape                 <$> travA a
    ShapeSize e             -> liftA  ShapeSize             <$> travE e
    Intersect x y           -> liftA2 Intersect             <$> travE x <*> travE y
    Union x y               -> liftA2 Union                 <$> travE x <*> travE y

  where
    travA :: (Shape sh, Elt e)
          => DelayedOpenAcc aenv (Array sh e)
          -> CIO (Free aenv, ExecOpenAcc aenv (Array sh e))
    travA a = do
      a'    <- compileOpenAcc a
      return $ (bind a', a')

    travT :: Tuple (DelayedOpenExp env aenv) t
          -> CIO (Free aenv, Tuple (PreOpenExp ExecOpenAcc env aenv) t)
    travT NilTup        = return (pure NilTup)
    travT (SnocTup t e) = liftA2 SnocTup <$> travT t <*> travE e

    travE :: DelayedOpenExp env aenv e
          -> CIO (Free aenv, PreOpenExp ExecOpenAcc env aenv e)
    travE = compileOpenExp

    travF :: DelayedOpenFun env aenv t -> CIO (Free aenv, PreOpenFun ExecOpenAcc env aenv t)
    travF (Body b)  = liftA Body <$> travE b
    travF (Lam  f)  = liftA Lam  <$> travF f

    foreignE :: (Elt a, Elt b, Foreign asm)
             => asm           (a -> b)
             -> DelayedFun () (a -> b)
             -> DelayedOpenExp env aenv a
             -> CIO (Free aenv, PreOpenExp ExecOpenAcc env aenv b)
    foreignE ff f x = case canExecuteExp ff of
      -- If it's a foreign function that we can generate code from, just
      -- leave it alone. As the pure function is closed, the array
      -- environment needs to be replaced with one of the right type.
      --
      Just _        -> liftA2 (Foreign ff) <$> pure <$> snd <$> travF f <*> travE x

      -- If the foreign function is not intended for this backend, this node
      -- needs to be replaced by a pure accelerate node giving the same
      -- result. Due to the lack of an 'apply' node in the scalar language,
      -- this is done by substitution.
      --
      Nothing       -> travE (apply f x)
        where
          -- Twiddle the environment variables
          --
          apply :: DelayedFun () (a -> b) -> DelayedOpenExp env aenv a -> DelayedOpenExp env aenv b
          apply (Lam (Body b)) e    = Let e $ weaken wAcc $ weakenE wExp b
          apply _ _                 = error "This was a triumph."

          -- As the expression we want to weaken is closed with respect to the array
          -- environment, the index manipulation function becomes a dummy argument.
          --
          wAcc :: Idx () t -> Idx aenv t
          wAcc _                    = error "I'm making a note here:"

          wExp :: Idx ((),a) t -> Idx (env,a) t
          wExp ZeroIdx              = ZeroIdx
          wExp _                    = error "HUGE SUCCESS"

    bind :: (Shape sh, Elt e) => ExecOpenAcc aenv (Array sh e) -> Free aenv
    bind (ExecAcc _ _ (Avar ix)) = freevar ix
    bind _                       = $internalError "bind" "expected array variable"

{--
compileSeq :: DelayedSeq a -> CIO (ExecSeq a)
compileSeq (DelayedSeq aenv s) = ExecS <$> compileExtend aenv <*> compileOpenSeq s
  where
    compileExtend :: Extend DelayedOpenAcc aenv aenv' -> CIO (Extend ExecOpenAcc aenv aenv')
    compileExtend BaseEnv       = return BaseEnv
    compileExtend (PushEnv e a) = PushEnv <$> compileExtend e <*> compileOpenAcc a

compileOpenSeq
    :: forall aenv lenv arrs'.
       PreOpenSeq DelayedOpenAcc aenv lenv arrs'
    -> CIO (ExecOpenSeq aenv lenv arrs')
compileOpenSeq l =
  case l of
    Producer   p l' -> ExecP <$> compileP p <*> compileOpenSeq l'
    Consumer   c    -> ExecC <$> compileC c
    Reify ix        -> return $ ExecR ix Nothing
  where
    compileP :: forall a. Producer DelayedOpenAcc aenv lenv a -> CIO (ExecP aenv lenv a)
    compileP p =
      case p of
        ToSeq slix (_ :: proxy slix) acc -> do
          case acc of
            -- In the case of converting an array that has not already been copied
            -- to device memory, we are smart and treat it specially.
            Manifest (Use a) -> return $ ExecUseLazy slix (toArr a) ([] :: [slix])
            _   -> do
              (free1, acc') <- travA acc
              let gamma = makeEnvMap free1
              dev <- asks deviceProperties
              -- The array computation passed to 'toSeq' needs to be treated
              -- specially. We don't want the entire array to be made manifest
              -- if we can help it. In the event it is a delayed array, we make
              -- the subarrays manifest one at a time and feed them to the 'Seq'
              -- computation.
              --
              -- For the purposes of device configuration and launching, this can
              -- be seen to work like 'Slice', even though in reality it
              -- resembles a delayed 'Slice'.
              let acc'' = Manifest (Slice slix acc (Const (zeroSlice slix) :: DelayedExp aenv slix))

              kernel <- build1 acc'' (codegenToSeq slix dev acc gamma)
              return $ ExecToSeq slix acc' kernel gamma ([] :: [slix])
        StreamIn xs -> return $ ExecStreamIn xs
        MapSeq f x -> do
          f' <- compileOpenAfun f
          return $ ExecMap f' x
        ZipWithSeq f x y -> do
          f' <- compileOpenAfun f
          return $ ExecZipWith f' x y
        ScanSeq f a0 x ->  do
          (_, a0') <- travE a0
          (_, f')  <- travF f
          return $ ExecScanSeq f' a0' x Nothing
        ChunkedMapSeq{} -> error "TODO: @fmma needs to finish this..."

    compileC :: forall a. Consumer DelayedOpenAcc aenv lenv a -> CIO (ExecC aenv lenv a)
    compileC c =
      case c of
        FoldSeq f a0 x -> do
          (_, a0') <- travE a0
          (_, f')  <- travF f
          return $ ExecFoldSeq f' a0' x Nothing
        FoldSeqFlatten f acc x -> do
          acc' <- compileOpenAcc acc
          f' <- compileOpenAfun f
          return $ ExecFoldSeqFlatten f' acc' x Nothing
        Stuple t -> ExecStuple <$> compileCT t

    compileCT :: forall t. Atuple (Consumer DelayedOpenAcc aenv lenv) t -> CIO (Atuple (ExecC aenv lenv) t)
    compileCT NilAtup        = return NilAtup
    compileCT (SnocAtup t c) = SnocAtup <$> compileCT t <*> compileC c

    travA :: DelayedOpenAcc aenv a -> CIO (Free aenv, ExecOpenAcc aenv a)
    travA acc = case acc of
      Manifest{}    -> pure                    <$> compileOpenAcc acc
      Delayed{..}   -> liftA2 (const EmbedAcc) <$> travF indexD <*> travE extentD

    travE :: DelayedOpenExp env aenv e
          -> CIO (Free aenv, PreOpenExp ExecOpenAcc env aenv e)
    travE = compileOpenExp

    travF :: DelayedOpenFun env aenv t -> CIO (Free aenv, PreOpenFun ExecOpenAcc env aenv t)
    travF (Body b)  = liftA Body <$> travE b
    travF (Lam  f)  = liftA Lam  <$> travF f

    zeroSlice :: SliceIndex slix sl co sh -> slix
    zeroSlice SliceNil = ()
    zeroSlice (SliceFixed sl) = (zeroSlice sl, 0)
    zeroSlice (SliceAll sl)   = (zeroSlice sl, ())
--}


-- Applicative
-- -----------
--
liftA4 :: Applicative f => (a -> b -> c -> d -> e) -> f a -> f b -> f c -> f d -> f e
liftA4 f a b c d = f <$> a <*> b <*> c <*> d


-- Compilation
-- -----------

-- Generate, compile, and link code to evaluate an array computation. We use
-- 'unsafePerformIO' here to leverage laziness, so that the 'link' function
-- evaluates and blocks on the external compiler only once the compiled object
-- is truly needed.
--
build :: DelayedOpenAcc aenv a -> Gamma aenv -> CIO [AccKernel a]
build acc aenv = do
  dev   <- asks deviceProperties
  mapM (build1 acc) (codegenAcc dev acc aenv)

build1 :: DelayedOpenAcc aenv a -> CUTranslSkel aenv a -> CIO (AccKernel a)
build1 acc code = do
  context       <- asks activeContext
  let dev       =  deviceProperties context
  table         <- gets kernelTable
  (entry,key)   <- compile table dev code
  let (cta,blocks,smem) = launchConfig acc dev occ
      (mdl,fun,occ)     = unsafePerformIO $ do
        m <- link context table key
        f <- withLifetime m $ flip CUDA.getFun entry
        l <- CUDA.requires f CUDA.MaxKernelThreadsPerBlock
        o <- determineOccupancy acc dev f l
        D.when D.dump_cc (stats entry f o)
        return (m,f,o)
  --
  return $ AccKernel entry fun mdl occ cta smem blocks
  where
    stats name fn occ = do
      regs      <- CUDA.requires fn CUDA.NumRegs
      smem      <- CUDA.requires fn CUDA.SharedSizeBytes
      cmem      <- CUDA.requires fn CUDA.ConstSizeBytes
      lmem      <- CUDA.requires fn CUDA.LocalSizeBytes
      let msg1  = "entry function '" ++ name ++ "' used "
                  ++ shows regs " registers, "  ++ shows smem " bytes smem, "
                  ++ shows lmem " bytes lmem, " ++ shows cmem " bytes cmem"
          msg2  = "multiprocessor occupancy " ++ showFFloat (Just 1) (CUDA.occupancy100 occ) "% : "
                  ++ shows (CUDA.activeThreads occ)      " threads over "
                  ++ shows (CUDA.activeWarps occ)        " warps in "
                  ++ shows (CUDA.activeThreadBlocks occ) " blocks"
      --
      -- make sure kernel/stats are printed together. Use 'intercalate' rather
      -- than 'unlines' to avoid a trailing newline.
      --
      message   $ intercalate "\n      ... " [msg1, msg2]


-- Link a compiled binary and update the associated kernel entry in the hash
-- table. This may entail waiting for the external compilation process to
-- complete. If successful, the temporary files are removed.
--
link :: Context -> KernelTable -> KernelKey -> IO (Lifetime CUDA.Module)
link context table key =
  let intErr    = $internalError "link" "missing kernel entry"
      ctx       = deviceContext context
      weak_ctx  = weakContext context
  in do
    entry       <- fromMaybe intErr `fmap` KT.lookup context table key
    case entry of
      CompileProcess cufile done -> do
        -- Wait for the compiler to finish and load the binary object into the
        -- current context.
        --
        -- A forked thread will fill the MVar once the external compilation
        -- process completes, but only the main thread executes kernels. Hence,
        -- only one thread will ever attempt to take the MVar in order to link
        -- the binary object.
        --
        message "waiting for nvcc..."
        let cubin       =  replaceExtension cufile ".cubin"
        ()              <- takeMVar done
        bin             <- B.readFile cubin
        mdl             <- CUDA.loadData bin
        lmdl            <- newLifetime mdl
        addFinalizer lmdl (module_finalizer weak_ctx key lmdl)

        -- Update hash tables and stash the binary object into the persistent
        -- cache
        --
        KT.insert table key $! KernelObject bin (FL.singleton ctx lmdl)
        KT.persist table cubin key

        -- Remove temporary build products.
        -- If compiling kernels with debugging symbols, leave the source files
        -- in place so that they can be referenced by 'cuda-gdb'.
        --
        D.unless D.debug_cc $ do
          removeFile      cufile
          removeDirectory (dropFileName cufile)
            `catchIOError` \_ -> return ()      -- directory not empty

        return lmdl

      -- If we get a real object back, then this will already be in the
      -- persistent cache, since either it was just read in from there, or we
      -- had to generate new code and the link step above has added it.
      --
      KernelObject bin active
        | Just lmdl <- FL.lookup ctx active     -> return lmdl
        | otherwise                             -> do
            message "re-linking module for current context"
            mdl                 <- CUDA.loadData bin
            lmdl                <- newLifetime mdl
            addFinalizer lmdl (module_finalizer weak_ctx key lmdl)
            KT.insert table key $! KernelObject bin (FL.cons ctx lmdl active)
            return lmdl


-- Generate and compile code for a single open array expression
--
compile
    :: KernelTable
    -> CUDA.DeviceProperties
    -> CUTranslSkel aenv a
    -> CIO (String, KernelKey)
compile table dev cunit = do
  context       <- asks activeContext
  exists        <- isJust `fmap` liftIO (KT.lookup context table key)
  unless exists $ do
    message     $  unlines [ show key, T.unpack code ]
    nvcc        <- fromMaybe (error "nvcc: command not found") <$> liftIO (findExecutable "nvcc")
    (file,hdl)  <- openTemporaryFile "dragon.cu"   -- rawr!
    flags       <- compileFlags file
    done        <- liftIO $ do
      T.hPutStr hdl code        `finally`     hClose hdl
      enqueueProcess nvcc flags `onException` removeFile file
    --
    liftIO $ KT.insert table key (CompileProcess file done)
  --
  return (entry, key)
  where
    entry       = show cunit
    key         = (CUDA.computeCapability dev, hashlazy (T.encodeUtf8 code) )
    code        = displayLazyText . renderCompact $ ppr cunit


-- Determine the appropriate command line flags to pass to the compiler process.
-- This is dependent on the host architecture and device capabilities.
--
compileFlags :: FilePath -> CIO [String]
compileFlags cufile = do
  CUDA.Compute m n      <- CUDA.computeCapability `fmap` asks deviceProperties
  ddir                  <- liftIO getDataDir
  warnings              <- liftIO $ (&&) <$> D.queryFlag D.dump_cc <*> D.queryFlag D.verbose
  debug                 <- liftIO $ D.queryFlag D.debug_cc
  return                $  filter (not . null) $
    [ "-I", ddir </> "cubits"
    , "-arch=sm_" ++ show m ++ show n
    , "-cubin"
--    , "--restrict"    -- requires nvcc >= 5.0
--    , "--maxrregcount", "32"
    , "-o", cufile `replaceExtension` "cubin"
    , if warnings then ""   else "--disable-warnings"
    , if debug    then ""   else "-DNDEBUG"
    , if debug    then "-G" else "-O3"
    , machine
    , cufile ]
  where
    machine     = case finiteBitSize (undefined :: Int) of
                    32  -> "-m32"
                    64  -> "-m64"
                    _   -> $internalError "compileFlags" "unknown 'Int' size"


-- Open a unique file in the temporary directory used for compilation
-- by-products. The directory will be created if it does not exist.
--
openTemporaryFile :: String -> CIO (FilePath, Handle)
openTemporaryFile template = liftIO $ do
  pid <- getProcessID
  dir <- (</>) <$> getTemporaryDirectory <*> pure ("accelerate-cuda-" ++ show pid)
  createDirectoryIfMissing True dir
  openTempFile dir template

#if defined(WIN32)
-- TLM: On windows, how do we get either the ProcessID or ProcessHandle of the
--      current process? For new, just use a dummy value (the sound of
--      disappearing down a rabbit hole...)
--
getProcessID :: IO ProcessId
getProcessID = return 0xaaaa
#endif


-- Worker pool
-- -----------

{-# NOINLINE workers #-}
workers :: Q.MSem Int
workers = unsafePerformIO $ Q.new =<< getNumProcessors

-- Queue a system process to be executed and return an MVar flag that will be
-- filled once the process completes. The task will only begin once there is a
-- worker available from the pool. This ensures we don't run out of process
-- handles or flood the IO bus, degrading performance.
--
enqueueProcess :: FilePath -> [String] -> IO (MVar ())
enqueueProcess nvcc flags = do
  mvar  <- newEmptyMVar
  _     <- forkIO $ do

    -- Wait for a worker to become available
    (ccT, queueT) <- time $ Q.with workers $ do

      -- Initiate the external process...
      ccBegin           <- getTime
      (_,_,_,pid)       <- createProcess (proc nvcc flags)

      -- ... and wait for it to complete
      waitFor pid
        -- If compilation fails for some reason, fill the MVar by re-throwing
        -- the exception. This prevents the host thread from waiting
        -- indefinitely, which then requires the program to be killed manually.
        `catch` \(e :: SomeException) -> do putMVar mvar (throw e)
      ccEnd             <- getTime

      return (diffTime ccBegin ccEnd)
    --
    let msg2  = nvcc ++ " " ++ unwords flags
        msg1  = "queue: " ++ D.showFFloatSIBase (Just 3) 1000 queueT "s, "
           ++ "execute: " ++ D.showFFloatSIBase (Just 3) 1000 ccT    "s"

    message $ intercalate "\n     ... " [msg1, msg2]

    -- Signal to the host thread that the compiled result is available
    putMVar mvar ()
  --
  return mvar


-- Wait for a (compilation) process to finish
--
waitFor :: ProcessHandle -> IO ()
waitFor pid = do
  status <- waitForProcess pid
  case status of
    ExitSuccess   -> return ()
    ExitFailure c -> error $ "nvcc terminated abnormally (" ++ show c ++ ")"


-- Debug
-- -----

-- Get the current wall clock time in picoseconds since the epoch
--
{-# INLINE getTime #-}
getTime :: IO Integer
#ifdef ACCELERATE_DEBUG
getTime = do
  TOD sec pico  <- getClockTime
  return        $! pico + sec * 1000000000000
#else
getTime = return 0
#endif

-- Return the difference between the first and second (later) time in seconds
--
{-# INLINE diffTime #-}
diffTime :: Integer -> Integer -> Double
diffTime t1 t2 = fromIntegral (t2 - t1) * 1E-12

-- Return the number of seconds of wall-clock time it took to execute the given
-- action. Makes sure to `deepseq` or otherwise fully evaluate the action before
-- returning from the task, otherwise there is a good chance you'll just pass a
-- suspension out and the elapsed time will be zero.
--
time :: IO a -> IO (a, Double)
{-# NOINLINE time #-}
time p = do
  start <- getTime
  res   <- p
  end   <- getTime
  return $ (res, diffTime start end)


{-# INLINE message #-}
message :: MonadIO m => String -> m ()
message msg = liftIO $ D.traceIO D.dump_cc ("cc: " ++ msg)