{-# LANGUAGE CPP #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# OPTIONS_GHC -Wall #-}
{-# OPTIONS_HADDOCK ignore-exports, prune #-}

-- | Do not use this module directly.  Use a /SCHEDULER/ module (such
-- as 'Control.Monad.Par.Meta.AccSMP') instead.
--
-- This module provides a component ('Resource') for assembling schedulers, as
-- well as a `Control.Monad.Par.Accelerate.ParAccelerate` instance.

module Control.Monad.Par.Meta.Resources.Accelerate
  (
    mkResource,
    runAccWith,
    spawnAccWith,
    unsafeHybridWith
  ) where

import Control.Concurrent
import Control.Exception.Base (evaluate)
import Control.Monad
import Control.Monad.IO.Class

import Data.Array.Accelerate (Acc, Arrays)
#ifdef ACCELERATE_CUDA_BACKEND
#warning "meta-par-accelerate: Loading REAL, LIVE CUDA BACKEND..."
import qualified Data.Array.Accelerate.CUDA as Run
#else
import qualified Data.Array.Accelerate.Interpreter as Run
#endif

import Data.Concurrent.Deque.Class (ConcQueue, WSDeque)
import Data.Concurrent.Deque.Reference as R

import System.IO.Unsafe
import Text.Printf

import qualified Control.Monad.Par.Accelerate as AC
-- import qualified Control.Monad.Par.OffChip as OC

import Control.Monad.Par.Meta
-- import Control.Monad.Par.Class (new, put_)

dbg :: Bool
#ifdef DEBUG
dbg = True
#else
dbg = False
#endif

--------------------------------------------------------------------------------
-- * The `Resource` itself:

-- | A mix-in component for assembling schedulers with an Accelerate capability.
mkResource :: Resource
mkResource = Resource defaultInit defaultSteal

-- * /Internal/ Definitions
--------------------------------------------------------------------------------
-- Global structures for communicating between Par threads and GPU
-- daemon threads

{-# NOINLINE gpuOnlyQueue #-}
-- | The GPU-only queue is pushed to by 'Par' workers on the right, and
-- popped by the GPU daemon on the left.  No backstealing is possible
-- from this queue.
gpuOnlyQueue :: WSDeque (IO ())
gpuOnlyQueue = unsafePerformIO R.newQ

{-# NOINLINE gpuBackstealQueue #-}
-- | The backsteal queue is pushed to by 'Par' workers on the right, and
-- popped by the GPU daemon and 'Par' workers on the left.
gpuBackstealQueue :: ConcQueue (Par (), IO ())
gpuBackstealQueue = unsafePerformIO R.newQ

{-# NOINLINE resultQueue #-}
-- | The result queue is pushed to by the GPU daemon, and popped by the
-- 'Par' workers, meaning the 'WSDeque' is appropriate.
resultQueue :: WSDeque (Par ())
resultQueue = unsafePerformIO R.newQ

--------------------------------------------------------------------------------

-- | See documentation for `Control.Monad.Par.Accelerate.spawnAcc`.
spawnAccWith :: (Arrays a) => (Acc a -> a) -> Acc a -> Par (IVar a)
spawnAccWith runner comp = do
    when dbg $ liftIO $ printf "spawning Accelerate computation\n"
    iv <- new
    let wrappedComp = do
          when dbg $ printf "running Accelerate computation:\n%s\n" (show comp)
          -- Force the GPU computation here, on the daemon thread:
          ans <- evaluate $ runner comp
          R.pushL resultQueue $ do
            when dbg $ liftIO $ printf "Accelerate computation finished\n"
            put_ iv ans
    liftIO $ R.pushR gpuOnlyQueue wrappedComp
    return iv
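-- A hypothetical usage sketch for 'spawnAccWith' (the function @dotp@, the
-- qualified import @A@, and the choice of 'Run.run' as the runner are
-- illustrative assumptions, not part of this module):
--
-- > import qualified Data.Array.Accelerate as A
-- >
-- > -- Spawn a dot product on the GPU backend, then block on its IVar.
-- > dotp :: A.Vector Float -> A.Vector Float -> Par Float
-- > dotp xs ys = do
-- >   iv <- spawnAccWith Run.run $
-- >           A.fold (+) 0 (A.zipWith (*) (A.use xs) (A.use ys))
-- >   arr <- get iv
-- >   return (A.indexArray arr A.Z)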
-- | Run an Accelerate computation and wait for its result.  In the
-- context of a `Par` computation this can result in better
-- performance than using an Accelerate-provided `run` function
-- directly, because this version enables the CPU work scheduler to do
-- other work while waiting for the GPU computation to complete.
--
-- Moreover, when configured with a high-performance /CPU/ Accelerate backend,
-- in the future this routine could enable automatic CPU/GPU work partitioning.
--
-- A more generic version of this function is 'Control.Monad.Par.OffChip.runOffChip'.
runAccWith :: (Arrays a) => (Acc a -> a) -> Acc a -> Par a
runAccWith runner comp = spawnAccWith runner comp >>= get

-- | See documentation for `Control.Monad.Par.Accelerate.unsafeHybrid`.
unsafeHybridWith :: Arrays b => (Acc b -> b) -> (b -> a) -> (Par a, Acc b) -> Par (IVar a)
unsafeHybridWith runner convert (parComp, accComp) = do
    when dbg $ liftIO $ printf "spawning Accelerate computation\n"
    iv <- new
    let wrappedParComp :: Par ()
        wrappedParComp = do
          when dbg $ liftIO $ printf "running backstolen computation\n"
          put_ iv =<< parComp

        wrappedAccComp :: IO ()
        wrappedAccComp = do
          when dbg $ printf "running Accelerate computation\n"
          -- ans <- convert $ Acc.run accComp
          -- Force the GPU computation here, on the daemon thread:
          ans <- evaluate $ convert $ runner accComp
          R.pushL resultQueue $ do
            when dbg $ liftIO $ printf "Accelerate computation finished\n"
            put_ iv ans
    liftIO $ R.pushR gpuBackstealQueue (wrappedParComp, wrappedAccComp)
    return iv

--------------------------------------------------------------------------------

-- | Loop for the GPU daemon; repeatedly takes work off 'gpuOnlyQueue'
-- (or, failing that, 'gpuBackstealQueue') and runs it.
gpuDaemon :: IO ()
gpuDaemon = do
  mwork <- R.tryPopL gpuOnlyQueue
  case mwork of
    Just work -> work
    Nothing   -> do
      mwork2 <- R.tryPopL gpuBackstealQueue
      case mwork2 of
        Just (_, work) -> work
        Nothing        -> return ()
  gpuDaemon

-- | Startup action: fork the GPU daemon thread.
defaultInit :: Startup
defaultInit = St ia
  where ia _ _ = do
          void $ forkIO gpuDaemon

-- | Work search: prefer finished GPU results from 'resultQueue'; otherwise
-- backsteal the CPU version of a hybrid computation from 'gpuBackstealQueue'.
defaultSteal :: WorkSearch
defaultSteal = WS sa
  where sa _ _ = do
          mfinished <- R.tryPopR resultQueue
          case mfinished of
            finished@(Just _) -> return finished
            Nothing -> fmap fst `fmap` R.tryPopL gpuBackstealQueue

--------------------------------------------------------------------------------
-- Generic instance for Meta.Par; needs to be newtype-derived for specific schedulers.

instance AC.ParAccelerate IVar Par where
  getDefaultAccImpl = return Run.run -- TEMP! FIXME - don't ever use interpreter in the future.
  runAccWith        = runAccWith
  spawnAccWith      = spawnAccWith
  unsafeHybridWith  = unsafeHybridWith
--  compileAcc = error "Accelerate resource -- compileAcc not implemented yet"

-- instance OC.ParOffChip Acc IVar Par where
--   type OffChipConstraint a = Arrays a
--   runOffChip   = runAcc
--   spawnOffChip = spawnAcc
--   unsafeHybrid = unsafeHybrid
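-- A rough sketch of how a /SCHEDULER/ module (e.g. 'Control.Monad.Par.Meta.AccSMP')
-- might combine this resource with a CPU resource.  The @SharedMemory@ module
-- name, its @mkResource@ signature, and the 'mappend' combination of 'Resource'
-- values are assumptions based on meta-par conventions, not definitions made
-- here:
--
-- > import Data.Monoid (mappend)
-- > import qualified Control.Monad.Par.Meta.Resources.SharedMemory as CPU
-- > import qualified Control.Monad.Par.Meta.Resources.Accelerate   as GPU
-- >
-- > resource :: Resource
-- > resource = CPU.mkResource `mappend` GPU.mkResource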