{-# LANGUAGE BangPatterns  #-}
{-# LANGUAGE CPP           #-}
{-# LANGUAGE TypeOperators #-}
-- |
-- Module:      : Data.Array.Accelerate.Examples.Internal.Backend
-- Copyright    : [2014] Trevor L. McDonell
-- License      : BSD3
--
-- Maintainer   : Trevor L. McDonell <tmcdonell@cse.unsw.edu.au>
-- Stability    : experimental
-- Portability  : non-portable (GHC extensions)
--
-- This file is all that should need editing in order to add support for new
-- backends to the accelerate-examples package.
--

module Data.Array.Accelerate.Examples.Internal.Backend
  where

import Prelude                                          as P
import Data.Label
import System.Console.GetOpt

import Data.Array.Accelerate
import qualified Data.Array.Accelerate                  as A
import qualified Data.Array.Accelerate.Interpreter      as Interp
#ifdef ACCELERATE_LLVM_NATIVE_BACKEND
import qualified Data.Array.Accelerate.LLVM.Native      as CPU
#endif
#ifdef ACCELERATE_LLVM_PTX_BACKEND
import qualified Data.Array.Accelerate.LLVM.PTX         as PTX
#endif
#ifdef ACCELERATE_CUDA_BACKEND
import qualified Data.Array.Accelerate.CUDA             as CUDA
#endif
#ifdef ACCELERATE_CILK_BACKEND
import qualified Data.Array.Accelerate.Cilk             as Cilk
#endif


-- | Execute Accelerate expressions
--
{-# INLINE run #-}
run :: Arrays a => Backend -> Acc a -> a
run Interpreter = Interp.run
#ifdef ACCELERATE_LLVM_NATIVE_BACKEND
run CPU         = CPU.run
#endif
#ifdef ACCELERATE_LLVM_PTX_BACKEND
run PTX         = PTX.run
#endif
#ifdef ACCELERATE_CUDA_BACKEND
run CUDA        = CUDA.run
#endif
#ifdef ACCELERATE_CILK_BACKEND
run Cilk        = Cilk.run
#endif


{-# INLINE run1 #-}
run1 :: (Arrays a, Arrays b) => Backend -> (Acc a -> Acc b) -> a -> b
run1 Interpreter f = Interp.run1 f
#ifdef ACCELERATE_LLVM_NATIVE_BACKEND
run1 CPU         f = CPU.run1 f
#endif
#ifdef ACCELERATE_LLVM_PTX_BACKEND
run1 PTX         f = PTX.run1 f
#endif
#ifdef ACCELERATE_CUDA_BACKEND
run1 CUDA        f = CUDA.run1 f
#endif
#ifdef ACCELERATE_CILK_BACKEND
run1 Cilk        f = Cilk.run . f . use
#endif

{-# INLINE run2 #-}
run2 :: (Arrays a, Arrays b, Arrays c) => Backend -> (Acc a -> Acc b -> Acc c) -> a -> b -> c
run2 backend f x y = go (x,y)
  where
    !go = run1 backend (A.uncurry f)

{-# INLINE run3 #-}
run3 :: (Arrays a, Arrays b, Arrays c, Arrays d) => Backend -> (Acc a -> Acc b -> Acc c -> Acc d) -> a -> b -> c -> d
run3 backend f x y z = go (x,y,z)
  where
    !go = run1 backend (\t -> let (a,b,c) = unlift t in f a b c)


{-# INLINE run4 #-}
run4 :: (Arrays a, Arrays b, Arrays c, Arrays d, Arrays e) => Backend -> (Acc a -> Acc b -> Acc c -> Acc d -> Acc e) -> a -> b -> c -> d -> e
run4 backend f x y z w = go (x,y,z,w)
  where
    !go = run1 backend (\t -> let (a,b,c,d) = unlift t in f a b c d)


-- | The set of backends available to execute the program.
--
data Backend = Interpreter
#ifdef ACCELERATE_LLVM_NATIVE_BACKEND
             | CPU
#endif
#ifdef ACCELERATE_LLVM_PTX_BACKEND
             | PTX
#endif
#ifdef ACCELERATE_CUDA_BACKEND
             | CUDA
#endif
#ifdef ACCELERATE_LLVM_MULTIDEV_BACKEND
             | Multi
#endif
#ifdef ACCELERATE_CILK_BACKEND
             | Cilk
#endif
  deriving (P.Eq, P.Enum, P.Bounded)


-- The choice of show instance is important because this will be used to
-- generate the command line flag.
--
instance Show Backend where
  show Interpreter      = "interpreter"
#ifdef ACCELERATE_LLVM_NATIVE_BACKEND
  show CPU              = "llvm-cpu"
#endif
#ifdef ACCELERATE_LLVM_PTX_BACKEND
  show PTX              = "llvm-ptx"
#endif
#ifdef ACCELERATE_CUDA_BACKEND
  show CUDA             = "cuda"
#endif
#ifdef ACCELERATE_LLVM_MULTIDEV_BACKEND
  show Multi            = "llvm-multi"
#endif
#ifdef ACCELERATE_CILK_BACKEND
  show Cilk             = "cilk"
#endif


-- The default backend to use. Use one of the accelerated backends whenever
-- available.
--
defaultBackend :: Backend
defaultBackend =
  case maxBound of
    Interpreter -> Interpreter
    _           -> succ Interpreter


-- The set of available backnds. This will be used for both the command line
-- options as well as the fancy header generation.
--
availableBackends :: (options :-> Backend) -> [OptDescr (options -> options)]
availableBackends optBackend =
  [ Option  [] [show Interpreter]
            (NoArg (set optBackend Interpreter))
            "reference implementation (sequential)"

#ifdef ACCELERATE_LLVM_NATIVE_BACKEND
  , Option  [] [show CPU]
            (NoArg (set optBackend CPU))
            "LLVM based implementation for multicore CPUs (parallel)"
#endif
#ifdef ACCELERATE_LLVM_PTX_BACKEND
  , Option  [] [show PTX]
            (NoArg (set optBackend PTX))
            "LLVM based implementation for NVIDIA GPUs (parallel)"
#endif
#ifdef ACCELERATE_CUDA_BACKEND
  , Option  [] [show CUDA]
            (NoArg (set optBackend CUDA))
            "CUDA based implementation for NVIDIA GPUs (parallel)"
#endif
#ifdef ACCELERATE_LLVM_MULTIDEV_BACKEND
  , Option  [] [show Multi]
            (NoArg (set optBackend Multi))
            "LLVM based multi-device implementation using CPUs and GPUs (parallel)"
#endif
#ifdef ACCELERATE_CILK_BACKEND
  , Option  [] [show Cilk]
            (NoArg (set optBackend Cilk))
            "Cilk based implementation for multicore CPUs (parallel)"
#endif
  ]


-- By default, how many instances of the backend can be run concurrently?
--
-- If this returns `Nothing` then we make no changes to the configuration and
-- inherit the RTS options. If this is `Just n`, then by default only `n`
-- threads will be used to run tests at once. This option can still be
-- overridden via the command line flags, it just sets a default.
--
-- e.g.
--   * A multicore CPU backend may specify `Just 1`, so that only one instance
--     runs at once, and it inherits all threads specified via `+RTS -Nn`
--
--   * A thread-safe accelerator backend can specify `Nothing`, while a
--     non-thread-safe backend (perhaps it requires exclusive access to the
--     accelerator board) should specify `Just 1`.
--
-- Both the LLVM-CPU and LLVM-PTX backends are safe to run concurrently given
-- the same execution context. Although this results in over-subscription,
-- particularly for the CPU backend, it still improves performance because the
-- majority of the time is spent in the reference implementation / checking
-- results, so running multiple tests concurrently is still useful.
--
concurrentBackends :: Backend -> Maybe Int
concurrentBackends Interpreter  = Nothing
#ifdef ACCELERATE_LLVM_NATIVE_BACKEND
concurrentBackends CPU          = Nothing
#endif
#ifdef ACCELERATE_LLVM_PTX_BACKEND
concurrentBackends PTX          = Nothing
#endif
#ifdef ACCELERATE_CUDA_BACKEND
concurrentBackends CUDA         = Just 1      -- not thread safe
#endif
#ifdef ACCELERATE_CILK_BACKEND
concurrentBackends Cilk         = Just 1      -- not thread safe
#endif