{-# LANGUAGE CPP, GADTs, TupleSections, ScopedTypeVariables #-}
-- |
-- Module      : Data.Array.Accelerate.CUDA.Compile
-- Copyright   : [2008..2010] Manuel M T Chakravarty, Gabriele Keller, Sean Lee
--               [2009..2012] Manuel M T Chakravarty, Gabriele Keller, Trevor L. McDonell
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
-- Stability   : experimental
-- Portability : non-partable (GHC extensions)
--

module Data.Array.Accelerate.CUDA.Compile (

  -- * generate and compile kernels to realise a computation
  compileAcc, compileAfun1

) where

#include "accelerate.h"

-- friends
import Data.Array.Accelerate.Type
import Data.Array.Accelerate.Tuple

import Data.Array.Accelerate.CUDA.AST
import Data.Array.Accelerate.CUDA.State
import Data.Array.Accelerate.CUDA.CodeGen
import Data.Array.Accelerate.CUDA.Array.Sugar
import Data.Array.Accelerate.CUDA.Analysis.Launch
import Data.Array.Accelerate.CUDA.FullList              as FL
import Data.Array.Accelerate.CUDA.Persistent            as KT
import qualified Data.Array.Accelerate.CUDA.Debug       as D

-- libraries
import Numeric
import Prelude                                          hiding ( exp, catch )
import Control.Applicative                              hiding ( Const )
import Blaze.ByteString.Builder
import Blaze.ByteString.Builder.Char8
import Control.Exception
import Control.Monad
import Control.Monad.Trans
import Crypto.Hash.MD5                                  ( hashlazy )
import Data.Label.PureM
import Data.List
import Data.Maybe
import Data.Monoid
import System.Directory
import System.Exit                                      ( ExitCode(..) )
import System.FilePath
import System.IO
import System.IO.Unsafe
import System.Process
import Text.PrettyPrint.Mainland                        ( RDoc(..), ppr, renderCompact )
import Data.ByteString.Internal                         ( w2c )
import qualified Data.HashSet                           as Set
import qualified Data.ByteString                        as B
import qualified Data.ByteString.Lazy                   as L
import qualified Foreign.CUDA.Driver                    as CUDA
import qualified Foreign.CUDA.Analysis                  as CUDA

#ifdef VERSION_unix
import System.Posix.Process
#else
import System.Win32.Process
#endif

import Paths_accelerate_cuda                            ( getDataDir )


-- | Initiate code generation, compilation, and data transfer for an array
-- expression. The returned array computation is annotated so to be suitable for
-- execution in the CUDA environment. This includes:
--
--   * list of array variables embedded within scalar expressions
--   * kernel object(s) required to executed the kernel
--
compileAcc :: Acc a -> CIO (ExecAcc a)
compileAcc acc = prepareAcc acc


compileAfun1 :: Afun (a -> b) -> CIO (ExecAfun (a -> b))
compileAfun1 (Alam (Abody b)) = Alam . Abody <$> prepareAcc b
compileAfun1 _                =
  error "Hope (noun): something that happens to facts when the world refuses to agree"


prepareAcc :: OpenAcc aenv a -> CIO (ExecOpenAcc aenv a)
prepareAcc rootAcc = traverseAcc rootAcc
  where
    -- Traverse an open array expression in depth-first order
    --
    -- The applicative combinators are used to gloss over that we are passing
    -- around the AST nodes together with a set of free variable indices that
    -- are merged at every step.
    --
    traverseAcc :: forall aenv a. OpenAcc aenv a -> CIO (ExecOpenAcc aenv a)
    traverseAcc acc@(OpenAcc pacc) = do

      let exec :: (AccBindings aenv, PreOpenAcc ExecOpenAcc aenv a) -> CIO (ExecOpenAcc aenv a)
          exec (var, eacc) = do
            kernel      <- build acc var
            return      $  ExecAcc (FL.singleton () kernel) var eacc

          node :: (AccBindings aenv, PreOpenAcc ExecOpenAcc aenv a) -> CIO (ExecOpenAcc aenv a)
          node (_, eacc) = return $ ExecAcc noKernel mempty eacc

      case pacc of
        --
        -- Environment manipulations
        --
        Avar ix                 -> node $ pure (Avar ix)

        --
        -- Let bindings
        --
        Alet a b                -> node . pure =<< Alet         <$> traverseAcc a  <*> traverseAcc b
        Apply f a               -> node . pure =<< Apply        <$> compileAfun1 f <*> traverseAcc a
        Acond p t e             -> node =<< liftA3 Acond        <$> travE p <*> travA t <*> travA e

        --
        -- Tuples
        --
        Atuple tup              -> node =<< liftA Atuple        <$> travAtup tup
        Aprj ix tup             -> node =<< liftA (Aprj ix)     <$> travA    tup

        --
        -- Array injection
        --
        Use arrs                -> use (arrays (undefined::a)) arrs >> node (pure $ Use arrs)
          where
            use :: ArraysR a' -> a' -> CIO ()
            use ArraysRunit         ()       = return ()
            use ArraysRarray        arr      = useArray arr
            use (ArraysRpair r1 r2) (a1, a2) = use r1 a1 >> use r2 a2

        --
        -- Computation nodes
        --
        Reshape s a             -> node =<< liftA2 Reshape              <$> travE s <*> travA a
        Unit e                  -> node =<< liftA  Unit                 <$> travE e
        Generate e f            -> exec =<< liftA2 Generate             <$> travE e <*> travF f
        Replicate slix e a      -> exec =<< liftA2 (Replicate slix)     <$> travE e <*> travA a
        Index slix a e          -> exec =<< liftA2 (Index slix)         <$> travA a <*> travE e
        Map f a                 -> exec =<< liftA2 Map                  <$> travF f <*> travA a
        ZipWith f a b           -> exec =<< liftA3 ZipWith              <$> travF f <*> travA a <*> travA b
        Fold f z a              -> exec =<< liftA3 Fold                 <$> travF f <*> travE z <*> travA a
        Fold1 f a               -> exec =<< liftA2 Fold1                <$> travF f <*> travA a
        FoldSeg f e a s         -> exec =<< liftA4 FoldSeg              <$> travF f <*> travE e <*> travA a <*> travA (segments s)
        Fold1Seg f a s          -> exec =<< liftA3 Fold1Seg             <$> travF f <*> travA a <*> travA (segments s)
        Permute f a g b         -> exec =<< liftA4 Permute              <$> travF f <*> travA a <*> travF g <*> travA b
        Backpermute e f a       -> exec =<< liftA3 Backpermute          <$> travE e <*> travF f <*> travA a
        Stencil f b a           -> exec =<< liftA2 (flip Stencil b)     <$> travF f <*> travA a
        Stencil2 f b1 a1 b2 a2  -> exec =<< liftA3 stencil2             <$> travF f <*> travA a1 <*> travA a2
          where stencil2 f' a1' a2' = Stencil2 f' b1 a1' b2 a2'

        -- TODO: write helper functions to clean these up
        Scanl f e a -> do
          ExecAcc (FL _ scan _) var eacc  <- exec =<< liftA3 Scanl <$> travF f <*> travE e <*> travA a
          add           <- build (OpenAcc (Fold1 f mat)) var
          return        $  ExecAcc (cons () add $ FL.singleton () scan) var eacc

        Scanl' f e a -> do
          ExecAcc (FL _ scan _) var eacc  <- exec =<< liftA3 Scanl' <$> travF f <*> travE e <*> travA a
          add           <- build (OpenAcc (Fold1 f mat)) var
          return        $  ExecAcc (cons () (retag add) $ FL.singleton () scan) var eacc

        Scanl1 f a -> do
          ExecAcc (FL _ scan1 _) var eacc <- exec =<< liftA2 Scanl1 <$> travF f <*> travA a
          add           <- build (OpenAcc (Fold1 f mat)) var
          return        $  ExecAcc (cons () add $ FL.singleton () scan1) var eacc

        Scanr f e a -> do
          ExecAcc (FL _ scan _) var eacc  <- exec =<< liftA3 Scanr <$> travF f <*> travE e <*> travA a
          add           <- build (OpenAcc (Fold1 f mat)) var
          return        $  ExecAcc (cons () add $ FL.singleton () scan) var eacc

        Scanr' f e a -> do
          ExecAcc (FL _ scan _) var eacc  <- exec =<< liftA3 Scanr' <$> travF f <*> travE e <*> travA a
          add           <- build (OpenAcc (Fold1 f mat)) var
          return        $  ExecAcc (cons () (retag add) $ FL.singleton () scan) var eacc

        Scanr1 f a -> do
          ExecAcc (FL _ scan1 _) var eacc <- exec =<< liftA2 Scanr1 <$> travF f <*> travA a
          add           <- build (OpenAcc (Fold1 f mat)) var
          return        $  ExecAcc (cons () add $ FL.singleton () scan1) var eacc

      where
        travA :: OpenAcc aenv' a' -> CIO (AccBindings aenv', ExecOpenAcc aenv' a')
        travA a = pure <$> traverseAcc a

        travAtup :: Atuple (OpenAcc aenv') a' -> CIO (AccBindings aenv', Atuple (ExecOpenAcc aenv') a')
        travAtup NilAtup        = return (pure NilAtup)
        travAtup (SnocAtup t a) = liftA2 SnocAtup <$> travAtup t <*> travA a

        travF :: OpenFun env aenv t -> CIO (AccBindings aenv, PreOpenFun ExecOpenAcc env aenv t)
        travF (Body b)  = liftA Body <$> travE b
        travF (Lam  f)  = liftA Lam  <$> travF f

        segments :: forall i. (Elt i, IsIntegral i)
                 => OpenAcc aenv (Segments i) -> OpenAcc aenv (Segments i)
        segments = OpenAcc . Scanl plus (Const (fromElt (0::i)))

        plus :: (Elt i, IsIntegral i) => PreOpenFun OpenAcc () aenv (i -> i -> i)
        plus = Lam (Lam (Body (PrimAdd numType
                              `PrimApp`
                              Tuple (NilTup `SnocTup` Var (SuccIdx ZeroIdx)
                                            `SnocTup` Var ZeroIdx))))

        mat :: Elt e => OpenAcc aenv (Array DIM2 e)
        mat = OpenAcc $ Use ((), Array (((),0),0) undefined)

        noKernel :: FullList () (AccKernel a)
        noKernel =  FL () (INTERNAL_ERROR(error) "compile" "no kernel module for this node") Nil

    -- Traverse a scalar expression
    --
    travE :: OpenExp env aenv e
          -> CIO (AccBindings aenv, PreOpenExp ExecOpenAcc env aenv e)
    travE exp =
      case exp of
        Var ix                  -> return $ pure (Var ix)
        Const c                 -> return $ pure (Const c)
        PrimConst c             -> return $ pure (PrimConst c)
        IndexAny                -> return $ pure IndexAny
        IndexNil                -> return $ pure IndexNil
        --
        Let a b                 -> liftA2 Let           <$> travE a <*> travE b
        IndexCons t h           -> liftA2 IndexCons     <$> travE t <*> travE h
        IndexHead h             -> liftA  IndexHead     <$> travE h
        IndexTail t             -> liftA  IndexTail     <$> travE t
        Tuple t                 -> liftA  Tuple         <$> travT t
        Prj ix e                -> liftA  (Prj ix)      <$> travE e
        Cond p t e              -> liftA3 Cond          <$> travE p <*> travE t <*> travE e
        PrimApp f e             -> liftA  (PrimApp f)   <$> travE e
        IndexScalar a e         -> liftA2 IndexScalar   <$> travA a <*> travE e
        Shape a                 -> liftA  Shape         <$> travA a
        ShapeSize e             -> liftA  ShapeSize     <$> travE e
      where
        travA :: (Shape sh, Elt e)
              => OpenAcc aenv (Array sh e) -> CIO (AccBindings aenv, ExecOpenAcc aenv (Array sh e))
        travA a = do
          a'    <- traverseAcc a
          return $ (bind a', a')

        travT :: Tuple (OpenExp env aenv) t
              -> CIO (AccBindings aenv, Tuple (PreOpenExp ExecOpenAcc env aenv) t)
        travT NilTup        = return (pure NilTup)
        travT (SnocTup t e) = liftA2 SnocTup <$> travT t <*> travE e

        bind :: (Shape sh, Elt e) => ExecOpenAcc aenv (Array sh e) -> AccBindings aenv
        bind (ExecAcc _ _ (Avar ix)) = AccBindings ( Set.singleton (ArrayVar ix) )
        bind _                       = INTERNAL_ERROR(error) "bind" "expected array variable"


-- Applicative
-- -----------
--
liftA4 :: Applicative f => (a -> b -> c -> d -> e) -> f a -> f b -> f c -> f d -> f e
liftA4 f a b c d = f <$> a <*> b <*> c <*> d


-- Compilation
-- -----------

-- Generate, compile, and link code to evaluate an array computation. We use
-- 'unsafePerformIO' here to leverage laziness, so that the 'link' function
-- evaluates and blocks on the external compiler only once the compiled object
-- is truly needed.
--
build :: OpenAcc aenv a -> AccBindings aenv -> CIO (AccKernel a)
build acc fvar = do
  dev           <- gets deviceProps
  table         <- gets kernelTable
  (entry,key)   <- compile table dev acc fvar
  let (mdl,fun,occ) = unsafePerformIO $ do
        m <- link table key
        f <- CUDA.getFun m entry
        l <- CUDA.requires f CUDA.MaxKernelThreadsPerBlock
        o <- determineOccupancy acc dev f l
        D.when D.dump_cc (stats entry f o)
        return (m,f,o)
  --
  return $ Kernel entry mdl fun occ (launchConfig acc dev occ)
  where
    stats name fn occ = do
      regs      <- CUDA.requires fn CUDA.NumRegs
      smem      <- CUDA.requires fn CUDA.SharedSizeBytes
      cmem      <- CUDA.requires fn CUDA.ConstSizeBytes
      lmem      <- CUDA.requires fn CUDA.LocalSizeBytes
      let msg1  = "entry function '" ++ name ++ "' used "
                  ++ shows regs " registers, "  ++ shows smem " bytes smem, "
                  ++ shows lmem " bytes lmem, " ++ shows cmem " bytes cmem"
          msg2  = "multiprocessor occupancy " ++ showFFloat (Just 1) (CUDA.occupancy100 occ) "% : "
                  ++ shows (CUDA.activeThreads occ)      " threads over "
                  ++ shows (CUDA.activeWarps occ)        " warps in "
                  ++ shows (CUDA.activeThreadBlocks occ) " blocks"
      --
      -- make sure kernel/stats are printed together
      --
      message   $ intercalate "\n" [msg1, "     ... " ++ msg2]


-- Link a compiled binary and update the associated kernel entry in the hash
-- table. This may entail waiting for the external compilation process to
-- complete. If successful, the temporary files are removed.
--
link :: KernelTable -> KernelKey -> IO CUDA.Module
link table key =
  let intErr = INTERNAL_ERROR(error) "link" "missing kernel entry"
  in do
    ctx         <- CUDA.get
    entry       <- fromMaybe intErr `fmap` KT.lookup table key
    case entry of
      CompileProcess cufile pid -> do
        -- Wait for the compiler to finish and load the binary object into the
        -- current context
        --
        message "waiting for nvcc..."
        waitFor pid
        let cubin       =  replaceExtension cufile ".cubin"
        bin             <- B.readFile cubin
        mdl             <- CUDA.loadData bin

        -- Update hash tables and stash the binary object into the persistent
        -- cache
        --
        KT.insert table key $! KernelObject bin (FL.singleton ctx mdl)
        KT.persist cubin key

        -- Remove temporary build products
        --
        removeFile      cufile
        removeDirectory (dropFileName cufile)
          `catch` \(_ :: IOError) -> return ()          -- directory not empty

        return mdl

      -- If we get a real object back, then this will already be in the
      -- persistent cache, since either it was just read in from there, or we
      -- had to generate new code and the link step above has added it.
      --
      KernelObject bin active
        | Just mdl <- FL.lookup ctx active      -> return mdl
        | otherwise                             -> do
            message "re-linking module for current context"
            mdl                 <- CUDA.loadData bin
            KT.insert table key $! KernelObject bin (FL.cons ctx mdl active)
            return mdl


-- Generate and compile code for a single open array expression
--
compile :: KernelTable
        -> CUDA.DeviceProperties
        -> OpenAcc aenv a
        -> AccBindings aenv
        -> CIO (String, KernelKey)
compile table dev acc fvar = do
  exists        <- isJust `fmap` liftIO (KT.lookup table key)
  unless exists $ do
    message     $  unlines [ show key, map w2c (L.unpack code) ]
    nvcc        <- fromMaybe (error "nvcc: command not found") <$> liftIO (findExecutable "nvcc")
    (file,hdl)  <- openTemporaryFile "dragon.cu"   -- rawr!
    flags       <- compileFlags file
    (_,_,_,pid) <- liftIO $ do
      L.hPut hdl code                 `finally`     hClose hdl
      createProcess (proc nvcc flags) `onException` removeFile file
    --
    liftIO $ KT.insert table key (CompileProcess file pid)
  --
  return (entry, key)
  where
    cunit       = codegenAcc dev acc fvar
    entry       = show cunit
    key         = (CUDA.computeCapability dev, hashlazy code)
    code        = toLazyByteString
                . layout . renderCompact $ ppr cunit
    --
    layout (RText _ s next)     = fromString s  `mappend` layout next
    layout (RChar c   next)     = fromChar c    `mappend` layout next
    layout (RLine _   next)     = fromChar '\n' `mappend` layout next   -- no indenting
    layout (RPos _    next)     = layout next                           -- no line markers
    layout REmpty               = mempty                                -- done


-- Wait for the compilation process to finish
--
waitFor :: ProcessHandle -> IO ()
waitFor pid = do
  status <- waitForProcess pid
  case status of
    ExitSuccess   -> return ()
    ExitFailure c -> error $ "nvcc terminated abnormally (" ++ show c ++ ")"


-- Determine the appropriate command line flags to pass to the compiler process.
-- This is dependent on the host architecture and device capabilities.
--
compileFlags :: FilePath -> CIO [String]
compileFlags cufile = do
  arch <- CUDA.computeCapability `fmap` gets deviceProps
  ddir <- liftIO getDataDir
  return $ filter (not . null) $
    [ "-I", ddir </> "cubits"
    , "--compiler-options", "-fno-strict-aliasing"
    , "-arch=sm_" ++ show (round (arch * 10) :: Int)
    , "-cubin"
    , "-o", cufile `replaceExtension` "cubin"
    , if D.mode D.dump_cc  then ""   else "--disable-warnings"
    , if D.mode D.debug_cc then "-G" else "-O3"
    , machine
    , cufile ]
  where
#if SIZEOF_HSINT == 4
    machine     = "-m32"
#elif SIZEOF_HSINT == 8
    machine     = "-m64"
#endif


-- Open a unique file in the temporary directory used for compilation
-- by-products. The directory will be created if it does not exist.
--
openTemporaryFile :: String -> CIO (FilePath, Handle)
openTemporaryFile template = liftIO $ do
  pid <- getProcessID
  dir <- (</>) <$> getTemporaryDirectory <*> pure ("accelerate-cuda-" ++ show pid)
  createDirectoryIfMissing True dir
  openTempFile dir template

#ifndef VERSION_unix
getProcessID :: ProcessHandle -> IO ProcessId
getProcessID = getProcessId
#endif

-- Debug
-- -----

{-# INLINE message #-}
message :: MonadIO m => String -> m ()
message msg = trace msg $ return ()

{-# INLINE trace #-}
trace :: MonadIO m => String -> m a -> m a
trace msg next = D.message D.dump_cc ("cc: " ++ msg) >> next