{-# LANGUAGE CPP #-}
-- |
-- Module      : Data.Array.Accelerate.LLVM.Native.State
-- Copyright   : [2014..2017] Trevor L. McDonell
--               [2014..2014] Vinod Grover (NVIDIA Corporation)
-- License     : BSD3
--
-- Maintainer  : Trevor L. McDonell
-- Stability   : experimental
-- Portability : non-portable (GHC extensions)
--

module Data.Array.Accelerate.LLVM.Native.State (

  evalNative,
  createTarget, defaultTarget,
  Strategy, balancedParIO, unbalancedParIO,

) where

-- accelerate
import Control.Parallel.Meta
import Control.Parallel.Meta.Worker
import qualified Control.Parallel.Meta.Trans.LBS        as LBS
import qualified Control.Parallel.Meta.Resource.SMP     as SMP
import qualified Control.Parallel.Meta.Resource.Single  as Single
import qualified Control.Parallel.Meta.Resource.Backoff as Backoff

import Data.Array.Accelerate.LLVM.State
import Data.Array.Accelerate.LLVM.Native.Target
import qualified Data.Array.Accelerate.LLVM.Native.Link.Cache as LC
import qualified Data.Array.Accelerate.LLVM.Native.Debug      as Debug

-- library
import Data.ByteString.Short.Char8 ( ShortByteString, unpack )
import Data.Maybe
import Data.Monoid
import System.Environment
import System.IO.Unsafe
import Text.Printf
import Text.Read

import GHC.Conc


-- | Execute a computation in the Native backend
--
evalNative :: Native -> LLVM Native a -> IO a
evalNative = evalLLVM


-- | Create a Native execution target by spawning a worker thread on each of the
-- given capabilities, and using the given strategy to load balance the workers
-- when executing parallel operations.
--
createTarget
    :: [Int]              -- ^ CPU IDs to launch worker threads on
    -> Strategy           -- ^ Strategy to balance parallel workloads
    -> IO Native
createTarget caps parallelIO = do
  let size = length caps
  gang   <- forkGangOn caps
  linker <- LC.new
  -- NOTE(review): the final 'size > 1' field presumably flags whether the
  -- target runs in parallel — confirm against the 'Native' constructor.
  return $! Native size linker (sequentialIO gang) (parallelIO gang) (size > 1)


-- | The strategy for balancing work amongst the available worker threads.
--
type Strategy = Gang -> Executable


-- | Execute an operation sequentially on a single thread
--
sequentialIO :: Strategy
sequentialIO gang =
  Executable $ \name _ppt range fill ->
    timed name $ runSeqIO gang range fill


-- | Execute a computation without load balancing. Each thread computes an
-- equally sized chunk of the input. No work stealing occurs.
--
unbalancedParIO :: Strategy
unbalancedParIO gang =
  Executable $ \name _ppt range fill ->
    timed name $ runParIO Single.mkResource gang range fill


-- | Execute a computation where threads use work stealing (based on lazy
-- splitting of work stealing queues and exponential backoff) in order to
-- automatically balance the workload amongst themselves.
--
balancedParIO
    :: Int                -- ^ number of steal attempts before backing off
    -> Strategy
balancedParIO retries gang =
  Executable $ \name ppt range fill ->
    -- TLM: A suitable PPT should be chosen when invoking the continuation in
    --      order to balance scheduler overhead with fine-grained function calls
    --
    let resource = LBS.mkResource ppt (SMP.mkResource retries <> Backoff.mkResource)
    in  timed name $ runParIO resource gang range fill


-- Top-level mutable state
-- -----------------------
--
-- It is important to keep some information alive for the entire run of the
-- program, not just a single execution. These tokens use 'unsafePerformIO' to
-- ensure they are executed only once, and reused for subsequent invocations.
--

-- | Initialise the gang of threads that will be used to execute computations.
-- This spawns one worker for each available processor, or as specified by the
-- value of the environment variable @ACCELERATE_LLVM_NATIVE_THREADS@.
--
-- This globally shared thread gang is auto-initialised on startup and shared by
-- all computations (unless the user chooses to 'run' with a different gang).
--
-- In a data parallel setting, it does not help to have multiple gangs running
-- at the same time.
-- This is because a single data parallel computation should
-- already be able to keep all threads busy. If we had multiple gangs running at
-- the same time, then the system as a whole would run slower as the gangs
-- contend for cache and thrash the scheduler.
--
{-# NOINLINE defaultTarget #-}
defaultTarget :: Native
defaultTarget = unsafePerformIO $ do
  nproc <- getNumProcessors
  ncaps <- getNumCapabilities
  menv  <- (readMaybe =<<) <$> lookupEnv "ACCELERATE_LLVM_NATIVE_THREADS"

  -- Fall back to the processor count when the environment variable is unset
  -- or does not parse as an Int ('readMaybe' yields Nothing in both cases).
  let nthreads = fromMaybe nproc menv

  -- Update the number of capabilities, but never set it lower than it already
  -- is. This target will spawn a worker on each processor (as returned by
  -- 'getNumProcessors', which includes SMT (hyperthreading) cores), but the
  -- user may have requested more capabilities than this to handle, for example,
  -- concurrent output.
  --
  setNumCapabilities (max ncaps nthreads)

  Debug.traceIO Debug.dump_gc (printf "gc: initialise native target with %d worker threads" nthreads)
  case nthreads of
    1 -> createTarget [0] sequentialIO
    n -> createTarget [0 .. n-1] (balancedParIO n)


-- Debugging
-- ---------

{-# INLINE timed #-}
timed :: ShortByteString -> IO a -> IO a
timed name f = Debug.timed Debug.dump_exec (elapsed name) f

{-# INLINE elapsed #-}
elapsed :: ShortByteString -> Double -> Double -> String
elapsed name x y = printf "exec: %s %s" (unpack name) (Debug.elapsedP x y)