module Futhark.CodeGen.ImpGen.Multicore.Base
  ( extractAllocations,
    compileThreadResult,
    Locks (..),
    HostEnv (..),
    AtomicBinOp,
    MulticoreGen,
    decideScheduling,
    decideScheduling',
    groupResultArrays,
    renameSegBinOp,
    freeParams,
    renameHistOpLambda,
    atomicUpdateLocking,
    AtomicUpdate (..),
    DoAtomicUpdate,
    Locking (..),
    getSpace,
    getLoopBounds,
    getIterationDomain,
    getReturnParams,
    segOpString,
    ChunkLoopVectorization (..),
    generateChunkLoop,
    generateUniformizeLoop,
    extractVectorLane,
    inISPC,
    toParam,
    sLoopNestVectorized,
  )
where

import Control.Monad
import Data.Bifunctor
import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.Multicore qualified as Imp
import Futhark.CodeGen.ImpGen
import Futhark.Error
import Futhark.IR.MCMem
import Futhark.MonadFreshNames
import Futhark.Transform.Rename
import Prelude hiding (quot, rem)

-- | Is there an atomic t'BinOp' corresponding to this t'BinOp'?
type AtomicBinOp =
  BinOp ->
  Maybe (VName -> VName -> Imp.Count Imp.Elements (Imp.TExp Int32) -> Imp.Exp -> Imp.AtomicOp)

-- | Information about the locks available for accumulators.
data Locks = Locks
  { Locks -> VName
locksArray :: VName,
    Locks -> Int
locksCount :: Int
  }

data HostEnv = HostEnv
  { HostEnv -> AtomicBinOp
hostAtomics :: AtomicBinOp,
    HostEnv -> Map VName Locks
hostLocks :: M.Map VName Locks
  }

type MulticoreGen = ImpM MCMem HostEnv Imp.Multicore

segOpString :: SegOp () MCMem -> MulticoreGen String
segOpString :: SegOp () MCMem -> MulticoreGen [Char]
segOpString SegMap {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
"segmap"
segOpString SegRed {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
"segred"
segOpString SegScan {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
"segscan"
segOpString SegHist {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [Char]
"seghist"

arrParam :: VName -> MulticoreGen Imp.Param
arrParam :: VName -> MulticoreGen Param
arrParam VName
arr = do
  VarEntry MCMem
name_entry <- forall {k} (rep :: k) r op. VName -> ImpM rep r op (VarEntry rep)
lookupVar VName
arr
  case VarEntry MCMem
name_entry of
    ArrayVar Maybe (Exp MCMem)
_ (ArrayEntry (MemLoc VName
mem [SubExp]
_ IxFun (TExp Int64)
_) PrimType
_) ->
      forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ VName -> Space -> Param
Imp.MemParam VName
mem Space
DefaultSpace
    VarEntry MCMem
_ -> forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"arrParam: could not handle array " forall a. [a] -> [a] -> [a]
++ forall a. Show a => a -> [Char]
show VName
arr

toParam :: VName -> TypeBase shape u -> MulticoreGen [Imp.Param]
toParam :: forall shape u. VName -> TypeBase shape u -> MulticoreGen [Param]
toParam VName
name (Prim PrimType
pt) = forall (f :: * -> *) a. Applicative f => a -> f a
pure [VName -> PrimType -> Param
Imp.ScalarParam VName
name PrimType
pt]
toParam VName
name (Mem Space
space) = forall (f :: * -> *) a. Applicative f => a -> f a
pure [VName -> Space -> Param
Imp.MemParam VName
name Space
space]
toParam VName
name Array {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> VName -> MulticoreGen Param
arrParam VName
name
toParam VName
_name Acc {} = forall (f :: * -> *) a. Applicative f => a -> f a
pure [] -- FIXME?  Are we sure this works?

getSpace :: SegOp () MCMem -> SegSpace
getSpace :: SegOp () MCMem -> SegSpace
getSpace (SegHist ()
_ SegSpace
space [HistOp MCMem]
_ [Type]
_ KernelBody MCMem
_) = SegSpace
space
getSpace (SegRed ()
_ SegSpace
space [SegBinOp MCMem]
_ [Type]
_ KernelBody MCMem
_) = SegSpace
space
getSpace (SegScan ()
_ SegSpace
space [SegBinOp MCMem]
_ [Type]
_ KernelBody MCMem
_) = SegSpace
space
getSpace (SegMap ()
_ SegSpace
space [Type]
_ KernelBody MCMem
_) = SegSpace
space

getLoopBounds :: MulticoreGen (Imp.TExp Int64, Imp.TExp Int64)
getLoopBounds :: MulticoreGen (TExp Int64, TExp Int64)
getLoopBounds = do
  TV Int64
start <- forall {k1} {k2} (rep :: k1) r op (t :: k2).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"start" PrimType
int64
  TV Int64
end <- forall {k1} {k2} (rep :: k1) r op (t :: k2).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"end" PrimType
int64
  forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> VName -> Multicore
Imp.GetLoopBounds (forall {k} (t :: k). TV t -> VName
tvVar TV Int64
start) (forall {k} (t :: k). TV t -> VName
tvVar TV Int64
end)
  forall (f :: * -> *) a. Applicative f => a -> f a
pure (forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
start, forall {k} (t :: k). TV t -> TExp t
tvExp TV Int64
end)

getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (Imp.TExp Int64)
getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (TExp Int64)
getIterationDomain SegMap {} SegSpace
space = do
  let ns :: [SubExp]
ns = forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> b
snd forall a b. (a -> b) -> a -> b
$ SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space
      ns_64 :: [TExp Int64]
ns_64 = forall a b. (a -> b) -> [a] -> [b]
map SubExp -> TExp Int64
pe64 [SubExp]
ns
  forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
product [TExp Int64]
ns_64
getIterationDomain SegOp () MCMem
_ SegSpace
space = do
  let ns :: [SubExp]
ns = forall a b. (a -> b) -> [a] -> [b]
map forall a b. (a, b) -> b
snd forall a b. (a -> b) -> a -> b
$ SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space
      ns_64 :: [TExp Int64]
ns_64 = forall a b. (a -> b) -> [a] -> [b]
map SubExp -> TExp Int64
pe64 [SubExp]
ns
  case SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space of
    [(VName, SubExp)
_] -> forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
product [TExp Int64]
ns_64
    -- A segmented SegOp is over the segments
    -- so we drop the last dimension, which is
    -- executed sequentially
    [(VName, SubExp)]
_ -> forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
product forall a b. (a -> b) -> a -> b
$ forall a. [a] -> [a]
init [TExp Int64]
ns_64

-- When the SegRed's return value is a scalar
-- we perform a call by value-result in the segop function
getReturnParams :: Pat LetDecMem -> SegOp () MCMem -> MulticoreGen [Imp.Param]
getReturnParams :: Pat LetDecMem -> SegOp () MCMem -> MulticoreGen [Param]
getReturnParams Pat LetDecMem
pat SegRed {} =
  -- It's a good idea to make sure any prim values are initialised, as
  -- we will load them (redundantly) in the task code, and
  -- uninitialised values are UB.
  forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (forall dec. Pat dec -> [PatElem dec]
patElems Pat LetDecMem
pat) forall a b. (a -> b) -> a -> b
$ \PatElem LetDecMem
pe -> do
    case forall dec. Typed dec => PatElem dec -> Type
patElemType PatElem LetDecMem
pe of
      Prim PrimType
pt -> forall dec. PatElem dec -> VName
patElemName PatElem LetDecMem
pe forall {k} (rep :: k) r op. VName -> Exp -> ImpM rep r op ()
<~~ forall v. PrimValue -> PrimExp v
ValueExp (PrimType -> PrimValue
blankPrimValue PrimType
pt)
      Type
_ -> forall (f :: * -> *) a. Applicative f => a -> f a
pure ()
    forall shape u. VName -> TypeBase shape u -> MulticoreGen [Param]
toParam (forall dec. PatElem dec -> VName
patElemName PatElem LetDecMem
pe) (forall dec. Typed dec => PatElem dec -> Type
patElemType PatElem LetDecMem
pe)
getReturnParams Pat LetDecMem
_ SegOp () MCMem
_ = forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a. Monoid a => a
mempty

renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp [SegBinOp MCMem]
segbinops =
  forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [SegBinOp MCMem]
segbinops forall a b. (a -> b) -> a -> b
$ \(SegBinOp Commutativity
comm Lambda MCMem
lam [SubExp]
ne Shape
shape) -> do
    Lambda MCMem
lam' <- forall {k} (rep :: k) (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda MCMem
lam
    forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall {k} (rep :: k).
Commutativity -> Lambda rep -> [SubExp] -> Shape -> SegBinOp rep
SegBinOp Commutativity
comm Lambda MCMem
lam' [SubExp]
ne Shape
shape

compileThreadResult ::
  SegSpace ->
  PatElem LetDecMem ->
  KernelResult ->
  MulticoreGen ()
compileThreadResult :: SegSpace
-> PatElem LetDecMem
-> KernelResult
-> ImpM MCMem HostEnv Multicore ()
compileThreadResult SegSpace
space PatElem LetDecMem
pe (Returns ResultManifest
_ Certs
_ SubExp
what) = do
  let is :: [TExp Int64]
is = forall a b. (a -> b) -> [a] -> [b]
map (forall a. a -> TPrimExp Int64 a
Imp.le64 forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a, b) -> a
fst) forall a b. (a -> b) -> a -> b
$ SegSpace -> [(VName, SubExp)]
unSegSpace SegSpace
space
  forall {k} (rep :: k) r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (forall dec. PatElem dec -> VName
patElemName PatElem LetDecMem
pe) [TExp Int64]
is SubExp
what []
compileThreadResult SegSpace
_ PatElem LetDecMem
_ WriteReturns {} =
  forall a. [Char] -> a
compilerBugS [Char]
"compileThreadResult: WriteReturns unhandled."
compileThreadResult SegSpace
_ PatElem LetDecMem
_ TileReturns {} =
  forall a. [Char] -> a
compilerBugS [Char]
"compileThreadResult: TileReturns unhandled."
compileThreadResult SegSpace
_ PatElem LetDecMem
_ RegTileReturns {} =
  forall a. [Char] -> a
compilerBugS [Char]
"compileThreadResult: RegTileReturns unhandled."

freeParams :: FreeIn a => a -> MulticoreGen [Imp.Param]
freeParams :: forall a. FreeIn a => a -> MulticoreGen [Param]
freeParams a
code = do
  let free :: [VName]
free = Names -> [VName]
namesToList forall a b. (a -> b) -> a -> b
$ forall a. FreeIn a => a -> Names
freeIn a
code
  [Type]
ts <- forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM forall {k} (rep :: k) (m :: * -> *).
HasScope rep m =>
VName -> m Type
lookupType [VName]
free
  forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m [c]
zipWithM forall shape u. VName -> TypeBase shape u -> MulticoreGen [Param]
toParam [VName]
free [Type]
ts

-- | Arrays for storing group results shared between threads
groupResultArrays ::
  String ->
  SubExp ->
  [SegBinOp MCMem] ->
  MulticoreGen [[VName]]
groupResultArrays :: [Char] -> SubExp -> [SegBinOp MCMem] -> MulticoreGen [[VName]]
groupResultArrays [Char]
s SubExp
num_threads [SegBinOp MCMem]
reds =
  forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [SegBinOp MCMem]
reds forall a b. (a -> b) -> a -> b
$ \(SegBinOp Commutativity
_ Lambda MCMem
lam [SubExp]
_ Shape
shape) ->
    forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM (forall {k} (rep :: k). Lambda rep -> [Type]
lambdaReturnType Lambda MCMem
lam) forall a b. (a -> b) -> a -> b
$ \Type
t -> do
      let full_shape :: Shape
full_shape = forall d. [d] -> ShapeBase d
Shape [SubExp
num_threads] forall a. Semigroup a => a -> a -> a
<> Shape
shape forall a. Semigroup a => a -> a -> a
<> forall shape u. ArrayShape shape => TypeBase shape u -> shape
arrayShape Type
t
      forall {k} (rep :: k) r op.
[Char] -> PrimType -> Shape -> Space -> ImpM rep r op VName
sAllocArray [Char]
s (forall shape u. TypeBase shape u -> PrimType
elemType Type
t) Shape
full_shape Space
DefaultSpace

isLoadBalanced :: Imp.MCCode -> Bool
isLoadBalanced :: Code Multicore -> Bool
isLoadBalanced (Code Multicore
a Imp.:>>: Code Multicore
b) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a Bool -> Bool -> Bool
&& Code Multicore -> Bool
isLoadBalanced Code Multicore
b
isLoadBalanced (Imp.For VName
_ Exp
_ Code Multicore
a) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced (Imp.If TExp Bool
_ Code Multicore
a Code Multicore
b) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a Bool -> Bool -> Bool
&& Code Multicore -> Bool
isLoadBalanced Code Multicore
b
isLoadBalanced (Imp.Comment Text
_ Code Multicore
a) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced Imp.While {} = Bool
False
isLoadBalanced (Imp.Op (Imp.ParLoop [Char]
_ Code Multicore
code [Param]
_)) = Code Multicore -> Bool
isLoadBalanced Code Multicore
code
isLoadBalanced (Imp.Op (Imp.ForEachActive VName
_ Code Multicore
a)) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced (Imp.Op (Imp.ForEach VName
_ Exp
_ Exp
_ Code Multicore
a)) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced (Imp.Op (Imp.ISPCKernel Code Multicore
a [Param]
_)) = Code Multicore -> Bool
isLoadBalanced Code Multicore
a
isLoadBalanced Code Multicore
_ = Bool
True

segBinOpComm' :: [SegBinOp rep] -> Commutativity
segBinOpComm' :: forall {k} (rep :: k). [SegBinOp rep] -> Commutativity
segBinOpComm' = forall a. Monoid a => [a] -> a
mconcat forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall a b. (a -> b) -> [a] -> [b]
map forall {k} (rep :: k). SegBinOp rep -> Commutativity
segBinOpComm

decideScheduling' :: SegOp () rep -> Imp.MCCode -> Imp.Scheduling
decideScheduling' :: forall {k} (rep :: k). SegOp () rep -> Code Multicore -> Scheduling
decideScheduling' SegHist {} Code Multicore
_ = Scheduling
Imp.Static
decideScheduling' SegScan {} Code Multicore
_ = Scheduling
Imp.Static
decideScheduling' (SegRed ()
_ SegSpace
_ [SegBinOp rep]
reds [Type]
_ KernelBody rep
_) Code Multicore
code =
  case forall {k} (rep :: k). [SegBinOp rep] -> Commutativity
segBinOpComm' [SegBinOp rep]
reds of
    Commutativity
Commutative -> Code Multicore -> Scheduling
decideScheduling Code Multicore
code
    Commutativity
Noncommutative -> Scheduling
Imp.Static
decideScheduling' SegMap {} Code Multicore
code = Code Multicore -> Scheduling
decideScheduling Code Multicore
code

decideScheduling :: Imp.MCCode -> Imp.Scheduling
decideScheduling :: Code Multicore -> Scheduling
decideScheduling Code Multicore
code =
  if Code Multicore -> Bool
isLoadBalanced Code Multicore
code
    then Scheduling
Imp.Static
    else Scheduling
Imp.Dynamic

-- | Try to extract invariant allocations.  If we assume that the
-- given 'Imp.MCCode' is the body of a 'SegOp', then it is always safe
-- to move the immediate allocations to the prebody.
extractAllocations :: Imp.MCCode -> (Imp.MCCode, Imp.MCCode)
extractAllocations :: Code Multicore -> (Code Multicore, Code Multicore)
extractAllocations Code Multicore
segop_code = forall {a}. Code Multicore -> (Code a, Code Multicore)
f Code Multicore
segop_code
  where
    declared :: Names
declared = forall a. Code a -> Names
Imp.declaredIn Code Multicore
segop_code
    f :: Code Multicore -> (Code a, Code Multicore)
f (Imp.DeclareMem VName
name Space
space) =
      -- Hoisting declarations out is always safe.
      (forall a. VName -> Space -> Code a
Imp.DeclareMem VName
name Space
space, forall a. Monoid a => a
mempty)
    f (Imp.Allocate VName
name Count Bytes (TExp Int64)
size Space
space)
      | Bool -> Bool
not forall a b. (a -> b) -> a -> b
$ forall a. FreeIn a => a -> Names
freeIn Count Bytes (TExp Int64)
size Names -> Names -> Bool
`namesIntersect` Names
declared =
          (forall a. VName -> Count Bytes (TExp Int64) -> Space -> Code a
Imp.Allocate VName
name Count Bytes (TExp Int64)
size Space
space, forall a. Monoid a => a
mempty)
    f (Code Multicore
x Imp.:>>: Code Multicore
y) = Code Multicore -> (Code a, Code Multicore)
f Code Multicore
x forall a. Semigroup a => a -> a -> a
<> Code Multicore -> (Code a, Code Multicore)
f Code Multicore
y
    f (Imp.While TExp Bool
cond Code Multicore
body) =
      (forall a. Monoid a => a
mempty, forall a. TExp Bool -> Code a -> Code a
Imp.While TExp Bool
cond Code Multicore
body)
    f (Imp.For VName
i Exp
bound Code Multicore
body) =
      (forall a. Monoid a => a
mempty, forall a. VName -> Exp -> Code a -> Code a
Imp.For VName
i Exp
bound Code Multicore
body)
    f (Imp.Comment Text
s Code Multicore
code) =
      forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second (forall a. Text -> Code a -> Code a
Imp.Comment Text
s) (Code Multicore -> (Code a, Code Multicore)
f Code Multicore
code)
    f Imp.Free {} =
      forall a. Monoid a => a
mempty
    f (Imp.If TExp Bool
cond Code Multicore
tcode Code Multicore
fcode) =
      let (Code a
ta, Code Multicore
tcode') = Code Multicore -> (Code a, Code Multicore)
f Code Multicore
tcode
          (Code a
fa, Code Multicore
fcode') = Code Multicore -> (Code a, Code Multicore)
f Code Multicore
fcode
       in (Code a
ta forall a. Semigroup a => a -> a -> a
<> Code a
fa, forall a. TExp Bool -> Code a -> Code a -> Code a
Imp.If TExp Bool
cond Code Multicore
tcode' Code Multicore
fcode')
    f (Imp.Op (Imp.ParLoop [Char]
s Code Multicore
body [Param]
free)) =
      let (Code Multicore
body_allocs, Code Multicore
body') = Code Multicore -> (Code Multicore, Code Multicore)
extractAllocations Code Multicore
body
          (Code a
free_allocs, Code Multicore
here_allocs) = Code Multicore -> (Code a, Code Multicore)
f Code Multicore
body_allocs
          free' :: [Param]
free' =
            forall a. (a -> Bool) -> [a] -> [a]
filter
              ( (VName -> Names -> Bool
`notNameIn` forall a. Code a -> Names
Imp.declaredIn Code Multicore
body_allocs) forall b c a. (b -> c) -> (a -> b) -> a -> c
. Param -> VName
Imp.paramName
              )
              [Param]
free
       in ( Code a
free_allocs,
            Code Multicore
here_allocs forall a. Semigroup a => a -> a -> a
<> forall a. a -> Code a
Imp.Op ([Char] -> Code Multicore -> [Param] -> Multicore
Imp.ParLoop [Char]
s Code Multicore
body' [Param]
free')
          )
    f Code Multicore
code =
      (forall a. Monoid a => a
mempty, Code Multicore
code)

-- | Indicates whether to vectorize a chunk loop or keep it sequential.
-- We use this to allow falling back to sequential chunk loops in cases
-- we don't care about trying to vectorize.
data ChunkLoopVectorization = Vectorized | Scalar

-- | Emit code for the chunk loop, given an action that generates code
-- for a single iteration.
--
-- The action is called with the (symbolic) index of the current
-- iteration.
generateChunkLoop ::
  String ->
  ChunkLoopVectorization ->
  (Imp.TExp Int64 -> MulticoreGen ()) ->
  MulticoreGen ()
generateChunkLoop :: [Char]
-> ChunkLoopVectorization
-> (TExp Int64 -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
generateChunkLoop [Char]
desc ChunkLoopVectorization
Scalar TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m = do
  (TExp Int64
start, TExp Int64
end) <- MulticoreGen (TExp Int64, TExp Int64)
getLoopBounds
  TExp Int64
n <- forall {k1} {k2} (t :: k1) (rep :: k2) r op.
[Char] -> TExp t -> ImpM rep r op (TExp t)
dPrimVE [Char]
"n" forall a b. (a -> b) -> a -> b
$ TExp Int64
end forall a. Num a => a -> a -> a
- TExp Int64
start
  VName
i <- forall (m :: * -> *). MonadFreshNames m => [Char] -> m VName
newVName ([Char]
desc forall a. Semigroup a => a -> a -> a
<> [Char]
"_i")
  (Code Multicore
body_allocs, Code Multicore
body) <- forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Code Multicore -> (Code Multicore, Code Multicore)
extractAllocations forall a b. (a -> b) -> a -> b
$
    forall {k} (rep :: k) r op.
ImpM rep r op () -> ImpM rep r op (Code op)
collect forall a b. (a -> b) -> a -> b
$ do
      forall {k} (rep :: k) r op. VName -> IntType -> ImpM rep r op ()
addLoopVar VName
i IntType
Int64
      TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m forall a b. (a -> b) -> a -> b
$ TExp Int64
start forall a. Num a => a -> a -> a
+ forall a. a -> TPrimExp Int64 a
Imp.le64 VName
i
  forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit Code Multicore
body_allocs
  -- Emit either foreach or normal for loop
  let bound :: Exp
bound = forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp Int64
n
  forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. VName -> Exp -> Code a -> Code a
Imp.For VName
i Exp
bound Code Multicore
body
generateChunkLoop [Char]
desc ChunkLoopVectorization
Vectorized TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m = do
  (TExp Int64
start, TExp Int64
end) <- MulticoreGen (TExp Int64, TExp Int64)
getLoopBounds
  TExp Int64
n <- forall {k1} {k2} (t :: k1) (rep :: k2) r op.
[Char] -> TExp t -> ImpM rep r op (TExp t)
dPrimVE [Char]
"n" forall a b. (a -> b) -> a -> b
$ TExp Int64
end forall a. Num a => a -> a -> a
- TExp Int64
start
  VName
i <- forall (m :: * -> *). MonadFreshNames m => [Char] -> m VName
newVName ([Char]
desc forall a. Semigroup a => a -> a -> a
<> [Char]
"_i")
  (Code Multicore
body_allocs, Code Multicore
body) <- forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Code Multicore -> (Code Multicore, Code Multicore)
extractAllocations forall a b. (a -> b) -> a -> b
$
    forall {k} (rep :: k) r op.
ImpM rep r op () -> ImpM rep r op (Code op)
collect forall a b. (a -> b) -> a -> b
$ do
      forall {k} (rep :: k) r op. VName -> IntType -> ImpM rep r op ()
addLoopVar VName
i IntType
Int64
      TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m forall a b. (a -> b) -> a -> b
$ forall a. a -> TPrimExp Int64 a
Imp.le64 VName
i
  forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit Code Multicore
body_allocs
  -- Emit either foreach or normal for loop
  let from :: Exp
from = forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp Int64
start
  let bound :: Exp
bound = forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped (TExp Int64
start forall a. Num a => a -> a -> a
+ TExp Int64
n)
  forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Exp -> Exp -> Code Multicore -> Multicore
Imp.ForEach VName
i Exp
from Exp
bound Code Multicore
body

-- | Emit code for a sequential loop over each vector lane, given
-- and action that generates code for a single iteration. The action
-- is called with the symbolic index of the current iteration.
generateUniformizeLoop :: (Imp.TExp Int64 -> MulticoreGen ()) -> MulticoreGen ()
generateUniformizeLoop :: (TExp Int64 -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
generateUniformizeLoop TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m = do
  VName
i <- forall (m :: * -> *). MonadFreshNames m => [Char] -> m VName
newVName [Char]
"uni_i"
  Code Multicore
body <- forall {k} (rep :: k) r op.
ImpM rep r op () -> ImpM rep r op (Code op)
collect forall a b. (a -> b) -> a -> b
$ do
    forall {k} (rep :: k) r op. VName -> IntType -> ImpM rep r op ()
addLoopVar VName
i IntType
Int64
    TExp Int64 -> ImpM MCMem HostEnv Multicore ()
m forall a b. (a -> b) -> a -> b
$ forall a. a -> TPrimExp Int64 a
Imp.le64 VName
i
  forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Code Multicore -> Multicore
Imp.ForEachActive VName
i Code Multicore
body

-- | Given a piece of code, if that code performs an assignment, turn
-- that assignment into an extraction of element from a vector on the
-- right hand side, using a passed index for the extraction. Other code
-- is left as is.
extractVectorLane :: Imp.TExp Int64 -> MulticoreGen Imp.MCCode -> MulticoreGen ()
extractVectorLane :: TExp Int64
-> ImpM MCMem HostEnv Multicore (Code Multicore)
-> ImpM MCMem HostEnv Multicore ()
extractVectorLane TExp Int64
j ImpM MCMem HostEnv Multicore (Code Multicore)
code = do
  let ut_exp :: Exp
ut_exp = forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp Int64
j
  Code Multicore
code' <- ImpM MCMem HostEnv Multicore (Code Multicore)
code
  case Code Multicore
code' of
    Imp.SetScalar VName
vname Exp
e -> do
      Type
typ <- forall {k} (rep :: k) (m :: * -> *).
HasScope rep m =>
VName -> m Type
lookupType VName
vname
      case Type
typ of
        -- ISPC v1.17 does not support extract on f16 yet..
        -- Thus we do this stupid conversion to f32
        Prim (FloatType FloatType
Float16) -> do
          TV Any
tv <- forall {k1} {k2} (rep :: k1) r op (t :: k2).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"hack_extract_f16" (FloatType -> PrimType
FloatType FloatType
Float32)
          forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. VName -> Exp -> Code a
Imp.SetScalar (forall {k} (t :: k). TV t -> VName
tvVar TV Any
tv) Exp
e
          forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Exp -> Exp -> Multicore
Imp.ExtractLane VName
vname (forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped forall a b. (a -> b) -> a -> b
$ forall {k} (t :: k). TV t -> TExp t
tvExp TV Any
tv) Exp
ut_exp
        Type
_ -> forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Exp -> Exp -> Multicore
Imp.ExtractLane VName
vname Exp
e Exp
ut_exp
    Code Multicore
_ ->
      forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit Code Multicore
code'

-- | Given an action that may generate some code, put that code
-- into an ISPC kernel.
inISPC :: MulticoreGen () -> MulticoreGen ()
inISPC :: ImpM MCMem HostEnv Multicore () -> ImpM MCMem HostEnv Multicore ()
inISPC ImpM MCMem HostEnv Multicore ()
code = do
  Code Multicore
code' <- forall {k} (rep :: k) r op.
ImpM rep r op () -> ImpM rep r op (Code op)
collect ImpM MCMem HostEnv Multicore ()
code
  [Param]
free <- forall a. FreeIn a => a -> MulticoreGen [Param]
freeParams Code Multicore
code'
  forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ Code Multicore -> [Param] -> Multicore
Imp.ISPCKernel Code Multicore
code' [Param]
free

-------------------------------
------- SegRed helpers  -------
-------------------------------
sForVectorized' :: VName -> Imp.Exp -> MulticoreGen () -> MulticoreGen ()
sForVectorized' :: VName
-> Exp
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
sForVectorized' VName
i Exp
bound ImpM MCMem HostEnv Multicore ()
body = do
  let it :: IntType
it = case forall v. PrimExp v -> PrimType
primExpType Exp
bound of
        IntType IntType
bound_t -> IntType
bound_t
        PrimType
t -> forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"sFor': bound " forall a. [a] -> [a] -> [a]
++ forall a. Pretty a => a -> [Char]
prettyString Exp
bound forall a. [a] -> [a] -> [a]
++ [Char]
" is of type " forall a. [a] -> [a] -> [a]
++ forall a. Pretty a => a -> [Char]
prettyString PrimType
t
  forall {k} (rep :: k) r op. VName -> IntType -> ImpM rep r op ()
addLoopVar VName
i IntType
it
  Code Multicore
body' <- forall {k} (rep :: k) r op.
ImpM rep r op () -> ImpM rep r op (Code op)
collect ImpM MCMem HostEnv Multicore ()
body
  forall {k} op (rep :: k) r. Code op -> ImpM rep r op ()
emit forall a b. (a -> b) -> a -> b
$ forall a. a -> Code a
Imp.Op forall a b. (a -> b) -> a -> b
$ VName -> Exp -> Exp -> Code Multicore -> Multicore
Imp.ForEach VName
i (forall v. PrimValue -> PrimExp v
Imp.ValueExp forall a b. (a -> b) -> a -> b
$ PrimType -> PrimValue
blankPrimValue forall a b. (a -> b) -> a -> b
$ IntType -> PrimType
Imp.IntType IntType
Imp.Int64) Exp
bound Code Multicore
body'

sForVectorized :: String -> Imp.TExp t -> (Imp.TExp t -> MulticoreGen ()) -> MulticoreGen ()
sForVectorized :: forall {k} (t :: k).
[Char]
-> TExp t
-> (TExp t -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sForVectorized [Char]
i TExp t
bound TExp t -> ImpM MCMem HostEnv Multicore ()
body = do
  VName
i' <- forall (m :: * -> *). MonadFreshNames m => [Char] -> m VName
newVName [Char]
i
  VName
-> Exp
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
sForVectorized' VName
i' (forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp t
bound) forall a b. (a -> b) -> a -> b
$
    TExp t -> ImpM MCMem HostEnv Multicore ()
body forall a b. (a -> b) -> a -> b
$
      forall {k} (t :: k) v. PrimExp v -> TPrimExp t v
TPrimExp forall a b. (a -> b) -> a -> b
$
        VName -> PrimType -> Exp
Imp.var VName
i' forall a b. (a -> b) -> a -> b
$
          forall v. PrimExp v -> PrimType
primExpType forall a b. (a -> b) -> a -> b
$
            forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped TExp t
bound

-- | Like sLoopNest, but puts a vectorized loop at the innermost layer.
sLoopNestVectorized ::
  Shape ->
  ([Imp.TExp Int64] -> MulticoreGen ()) ->
  MulticoreGen ()
sLoopNestVectorized :: Shape
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNestVectorized = [TExp Int64]
-> [SubExp]
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNest' [] forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall d. ShapeBase d -> [d]
shapeDims
  where
    sLoopNest' :: [TExp Int64]
-> [SubExp]
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNest' [TExp Int64]
is [] [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f = [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f forall a b. (a -> b) -> a -> b
$ forall a. [a] -> [a]
reverse [TExp Int64]
is
    sLoopNest' [TExp Int64]
is [SubExp
d] [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f =
      forall {k} (t :: k).
[Char]
-> TExp t
-> (TExp t -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sForVectorized [Char]
"nest_i" (SubExp -> TExp Int64
pe64 SubExp
d) forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> [TExp Int64]
-> [SubExp]
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNest' (TExp Int64
i forall a. a -> [a] -> [a]
: [TExp Int64]
is) [] [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f
    sLoopNest' [TExp Int64]
is (SubExp
d : [SubExp]
ds) [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f =
      forall {k1} {k2} (t :: k1) (rep :: k2) r op.
[Char]
-> TExp t -> (TExp t -> ImpM rep r op ()) -> ImpM rep r op ()
sFor [Char]
"nest_i" (SubExp -> TExp Int64
pe64 SubExp
d) forall a b. (a -> b) -> a -> b
$ \TExp Int64
i -> [TExp Int64]
-> [SubExp]
-> ([TExp Int64] -> ImpM MCMem HostEnv Multicore ())
-> ImpM MCMem HostEnv Multicore ()
sLoopNest' (TExp Int64
i forall a. a -> [a] -> [a]
: [TExp Int64]
is) [SubExp]
ds [TExp Int64] -> ImpM MCMem HostEnv Multicore ()
f

-------------------------------
------- SegHist helpers -------
-------------------------------
renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda [HistOp MCMem]
hist_ops =
  forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
t a -> (a -> m b) -> m (t b)
forM [HistOp MCMem]
hist_ops forall a b. (a -> b) -> a -> b
$ \(HistOp Shape
w SubExp
rf [VName]
dest [SubExp]
neutral Shape
shape Lambda MCMem
lam) -> do
    Lambda MCMem
lam' <- forall {k} (rep :: k) (m :: * -> *).
(Renameable rep, MonadFreshNames m) =>
Lambda rep -> m (Lambda rep)
renameLambda Lambda MCMem
lam
    forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall {k} (rep :: k).
Shape
-> SubExp
-> [VName]
-> [SubExp]
-> Shape
-> Lambda rep
-> HistOp rep
HistOp Shape
w SubExp
rf [VName]
dest [SubExp]
neutral Shape
shape Lambda MCMem
lam'

-- | Locking strategy used for an atomic update.
data Locking = Locking
  { -- | Array containing the lock.
    Locking -> VName
lockingArray :: VName,
    -- | Value for us to consider the lock free.
    Locking -> TExp Int32
lockingIsUnlocked :: Imp.TExp Int32,
    -- | What to write when we lock it.
    Locking -> TExp Int32
lockingToLock :: Imp.TExp Int32,
    -- | What to write when we unlock it.
    Locking -> TExp Int32
lockingToUnlock :: Imp.TExp Int32,
    -- | A transformation from the logical lock index to the
    -- physical position in the array.  This can also be used
    -- to make the lock array smaller.
    Locking -> [TExp Int64] -> [TExp Int64]
lockingMapping :: [Imp.TExp Int64] -> [Imp.TExp Int64]
  }

-- | A function for generating code for an atomic update.  Assumes
-- that the bucket is in-bounds.
type DoAtomicUpdate rep r =
  [VName] -> [Imp.TExp Int64] -> MulticoreGen ()

-- | The mechanism that will be used for performing the atomic update.
-- Approximates how efficient it will be.  Ordered from most to least
-- efficient.
data AtomicUpdate rep r
  = AtomicPrim (DoAtomicUpdate rep r)
  | -- | Can be done by efficient swaps.
    AtomicCAS (DoAtomicUpdate rep r)
  | -- | Requires explicit locking.
    AtomicLocking (Locking -> DoAtomicUpdate rep r)

atomicUpdateLocking ::
  AtomicBinOp ->
  Lambda MCMem ->
  AtomicUpdate MCMem ()
atomicUpdateLocking :: AtomicBinOp -> Lambda MCMem -> AtomicUpdate MCMem ()
atomicUpdateLocking AtomicBinOp
atomicBinOp Lambda MCMem
lam
  | Just [(BinOp, PrimType, VName, VName)]
ops_and_ts <- forall {k} (rep :: k).
ASTRep rep =>
Lambda rep -> Maybe [(BinOp, PrimType, VName, VName)]
lamIsBinOp Lambda MCMem
lam,
    forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (\(BinOp
_, PrimType
t, VName
_, VName
_) -> Int -> Bool
supportedPrims forall a b. (a -> b) -> a -> b
$ PrimType -> Int
primBitSize PrimType
t) [(BinOp, PrimType, VName, VName)]
ops_and_ts =
      forall {k} {k} {t :: * -> *} {b} {c} {d} {rep :: k} {r :: k}.
Foldable t =>
t (BinOp, b, c, d) -> DoAtomicUpdate rep r -> AtomicUpdate rep r
primOrCas [(BinOp, PrimType, VName, VName)]
ops_and_ts forall a b. (a -> b) -> a -> b
$ \[VName]
arrs [TExp Int64]
bucket ->
        -- If the operator is a vectorised binary operator on 32-bit values,
        -- we can use a particularly efficient implementation. If the
        -- operator has an atomic implementation we use that, otherwise it
        -- is still a binary operator which can be implemented by atomic
        -- compare-and-swap if 32 bits.
        forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ (forall a b. [a] -> [b] -> [(a, b)]
zip [VName]
arrs [(BinOp, PrimType, VName, VName)]
ops_and_ts) forall a b. (a -> b) -> a -> b
$ \(VName
a, (BinOp
op, PrimType
t, VName
x, VName
y)) -> do
          -- Common variables.
          TV Any
old <- forall {k1} {k2} (rep :: k1) r op (t :: k2).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"old" PrimType
t

          (VName
arr', Space
_a_space, Count Elements (TExp Int64)
bucket_offset) <- forall {k} (rep :: k) r op.
VName
-> [TExp Int64]
-> ImpM rep r op (VName, Space, Count Elements (TExp Int64))
fullyIndexArray VName
a [TExp Int64]
bucket

          case VName
-> VName
-> Count Elements (TExp Int32)
-> BinOp
-> Maybe (Exp -> Multicore)
opHasAtomicSupport (forall {k} (t :: k). TV t -> VName
tvVar TV Any
old) VName
arr' (forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Count Elements (TExp Int64)
bucket_offset) BinOp
op of
            Just Exp -> Multicore
f -> forall {k} op (rep :: k) r. op -> ImpM rep r op ()
sOp forall a b. (a -> b) -> a -> b
$ Exp -> Multicore
f forall a b. (a -> b) -> a -> b
$ VName -> PrimType -> Exp
Imp.var VName
y PrimType
t
            Maybe (Exp -> Multicore)
Nothing ->
              PrimType
-> VName
-> VName
-> [TExp Int64]
-> VName
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
atomicUpdateCAS PrimType
t VName
a (forall {k} (t :: k). TV t -> VName
tvVar TV Any
old) [TExp Int64]
bucket VName
x forall a b. (a -> b) -> a -> b
$
                VName
x forall {k} (rep :: k) r op. VName -> Exp -> ImpM rep r op ()
<~~ forall v. BinOp -> PrimExp v -> PrimExp v -> PrimExp v
Imp.BinOpExp BinOp
op (VName -> PrimType -> Exp
Imp.var VName
x PrimType
t) (VName -> PrimType -> Exp
Imp.var VName
y PrimType
t)
  where
    opHasAtomicSupport :: VName
-> VName
-> Count Elements (TExp Int32)
-> BinOp
-> Maybe (Exp -> Multicore)
opHasAtomicSupport VName
old VName
arr' Count Elements (TExp Int32)
bucket' BinOp
bop = do
      let atomic :: (VName -> VName -> Count Elements (TExp Int32) -> a -> AtomicOp)
-> a -> Multicore
atomic VName -> VName -> Count Elements (TExp Int32) -> a -> AtomicOp
f = AtomicOp -> Multicore
Imp.Atomic forall b c a. (b -> c) -> (a -> b) -> a -> c
. VName -> VName -> Count Elements (TExp Int32) -> a -> AtomicOp
f VName
old VName
arr' Count Elements (TExp Int32)
bucket'
      forall {a}.
(VName -> VName -> Count Elements (TExp Int32) -> a -> AtomicOp)
-> a -> Multicore
atomic forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> AtomicBinOp
atomicBinOp BinOp
bop

    primOrCas :: t (BinOp, b, c, d) -> DoAtomicUpdate rep r -> AtomicUpdate rep r
primOrCas t (BinOp, b, c, d)
ops
      | forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all forall {b} {c} {d}. (BinOp, b, c, d) -> Bool
isPrim t (BinOp, b, c, d)
ops = forall {k} {k} (rep :: k) (r :: k).
DoAtomicUpdate rep r -> AtomicUpdate rep r
AtomicPrim
      | Bool
otherwise = forall {k} {k} (rep :: k) (r :: k).
DoAtomicUpdate rep r -> AtomicUpdate rep r
AtomicCAS

    isPrim :: (BinOp, b, c, d) -> Bool
isPrim (BinOp
op, b
_, c
_, d
_) = forall a. Maybe a -> Bool
isJust forall a b. (a -> b) -> a -> b
$ AtomicBinOp
atomicBinOp BinOp
op
atomicUpdateLocking AtomicBinOp
_ Lambda MCMem
op
  | [Prim PrimType
t] <- forall {k} (rep :: k). Lambda rep -> [Type]
lambdaReturnType Lambda MCMem
op,
    [LParam MCMem
xp, LParam MCMem
_] <- forall {k} (rep :: k). Lambda rep -> [LParam rep]
lambdaParams Lambda MCMem
op,
    Int -> Bool
supportedPrims (PrimType -> Int
primBitSize PrimType
t) = forall {k} {k} (rep :: k) (r :: k).
DoAtomicUpdate rep r -> AtomicUpdate rep r
AtomicCAS forall a b. (a -> b) -> a -> b
$ \[VName
arr] [TExp Int64]
bucket -> do
      TV Any
old <- forall {k1} {k2} (rep :: k1) r op (t :: k2).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"old" PrimType
t
      PrimType
-> VName
-> VName
-> [TExp Int64]
-> VName
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
atomicUpdateCAS PrimType
t VName
arr (forall {k} (t :: k). TV t -> VName
tvVar TV Any
old) [TExp Int64]
bucket (forall dec. Param dec -> VName
paramName LParam MCMem
xp) forall a b. (a -> b) -> a -> b
$
        forall {k} dec (rep :: k) r op.
[Param dec] -> Body rep -> ImpM rep r op ()
compileBody' [LParam MCMem
xp] forall a b. (a -> b) -> a -> b
$
          forall {k} (rep :: k). Lambda rep -> Body rep
lambdaBody Lambda MCMem
op
atomicUpdateLocking AtomicBinOp
_ Lambda MCMem
op = forall {k} {k} (rep :: k) (r :: k).
(Locking -> DoAtomicUpdate rep r) -> AtomicUpdate rep r
AtomicLocking forall a b. (a -> b) -> a -> b
$ \Locking
locking [VName]
arrs [TExp Int64]
bucket -> do
  TV Int32
old <- forall {k1} {k2} (rep :: k1) r op (t :: k2).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"old" PrimType
int32
  TV Int32
continue <- forall {k1} {k2} (t :: k1) (rep :: k2) r op.
[Char] -> PrimType -> TExp t -> ImpM rep r op (TV t)
dPrimVol [Char]
"continue" PrimType
int32 (TExp Int32
0 :: Imp.TExp Int32)

  -- Correctly index into locks.
  (VName
locks', Space
_locks_space, Count Elements (TExp Int64)
locks_offset) <-
    forall {k} (rep :: k) r op.
VName
-> [TExp Int64]
-> ImpM rep r op (VName, Space, Count Elements (TExp Int64))
fullyIndexArray (Locking -> VName
lockingArray Locking
locking) forall a b. (a -> b) -> a -> b
$ Locking -> [TExp Int64] -> [TExp Int64]
lockingMapping Locking
locking [TExp Int64]
bucket

  -- Critical section
  let try_acquire_lock :: ImpM rep r Multicore ()
try_acquire_lock = do
        TV Int32
old forall {k1} {k2} (t :: k1) (rep :: k2) r op.
TV t -> TExp t -> ImpM rep r op ()
<-- (TExp Int32
0 :: Imp.TExp Int32)
        forall {k} op (rep :: k) r. op -> ImpM rep r op ()
sOp forall b c a. (b -> c) -> (a -> b) -> a -> c
. AtomicOp -> Multicore
Imp.Atomic forall a b. (a -> b) -> a -> b
$
          PrimType
-> VName
-> VName
-> Count Elements (TExp Int32)
-> VName
-> Exp
-> AtomicOp
Imp.AtomicCmpXchg
            PrimType
int32
            (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
old)
            VName
locks'
            (forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Count Elements (TExp Int64)
locks_offset)
            (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
continue)
            (forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped (Locking -> TExp Int32
lockingToLock Locking
locking))
      lock_acquired :: TExp Int32
lock_acquired = forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
continue
      -- Even the releasing is done with an atomic rather than a
      -- simple write, for memory coherency reasons.
      release_lock :: ImpM rep r Multicore ()
release_lock = do
        TV Int32
old forall {k1} {k2} (t :: k1) (rep :: k2) r op.
TV t -> TExp t -> ImpM rep r op ()
<-- Locking -> TExp Int32
lockingToLock Locking
locking
        forall {k} op (rep :: k) r. op -> ImpM rep r op ()
sOp forall b c a. (b -> c) -> (a -> b) -> a -> c
. AtomicOp -> Multicore
Imp.Atomic forall a b. (a -> b) -> a -> b
$
          PrimType
-> VName
-> VName
-> Count Elements (TExp Int32)
-> VName
-> Exp
-> AtomicOp
Imp.AtomicCmpXchg
            PrimType
int32
            (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
old)
            VName
locks'
            (forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Count Elements (TExp Int64)
locks_offset)
            (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
continue)
            (forall {k} (t :: k) v. TPrimExp t v -> PrimExp v
untyped (Locking -> TExp Int32
lockingToUnlock Locking
locking))

  -- Preparing parameters. It is assumed that the caller has already
  -- filled the arr_params. We copy the current value to the
  -- accumulator parameters.
  let ([Param LetDecMem]
acc_params, [Param LetDecMem]
_arr_params) = forall a. Int -> [a] -> ([a], [a])
splitAt (forall (t :: * -> *) a. Foldable t => t a -> Int
length [VName]
arrs) forall a b. (a -> b) -> a -> b
$ forall {k} (rep :: k). Lambda rep -> [LParam rep]
lambdaParams Lambda MCMem
op
      bind_acc_params :: ImpM rep r op ()
bind_acc_params =
        forall {k} (rep :: k) r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile forall a b. (a -> b) -> a -> b
$
          forall {k} (rep :: k) r op.
Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"bind lhs" forall a b. (a -> b) -> a -> b
$
            forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ (forall a b. [a] -> [b] -> [(a, b)]
zip [Param LetDecMem]
acc_params [VName]
arrs) forall a b. (a -> b) -> a -> b
$ \(Param LetDecMem
acc_p, VName
arr) ->
              forall {k} (rep :: k) r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix (forall dec. Param dec -> VName
paramName Param LetDecMem
acc_p) [] (VName -> SubExp
Var VName
arr) [TExp Int64]
bucket

  let op_body :: ImpM MCMem r op ()
op_body =
        forall {k} (rep :: k) r op.
Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"execute operation" forall a b. (a -> b) -> a -> b
$
          forall {k} dec (rep :: k) r op.
[Param dec] -> Body rep -> ImpM rep r op ()
compileBody' [Param LetDecMem]
acc_params forall a b. (a -> b) -> a -> b
$
            forall {k} (rep :: k). Lambda rep -> Body rep
lambdaBody Lambda MCMem
op

      do_hist :: ImpM rep r op ()
do_hist =
        forall {k} (rep :: k) r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile forall a b. (a -> b) -> a -> b
$
          forall {k} (rep :: k) r op.
Text -> ImpM rep r op () -> ImpM rep r op ()
sComment Text
"update global result" forall a b. (a -> b) -> a -> b
$
            forall (m :: * -> *) a b c.
Applicative m =>
(a -> b -> m c) -> [a] -> [b] -> m ()
zipWithM_ (forall {k} {rep :: k} {r} {op}.
[TExp Int64] -> VName -> SubExp -> ImpM rep r op ()
writeArray [TExp Int64]
bucket) [VName]
arrs forall a b. (a -> b) -> a -> b
$
              forall a b. (a -> b) -> [a] -> [b]
map (VName -> SubExp
Var forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall dec. Param dec -> VName
paramName) [Param LetDecMem]
acc_params

  -- While-loop: Try to insert your value
  forall {k} (rep :: k) r op.
TExp Bool -> ImpM rep r op () -> ImpM rep r op ()
sWhile (forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
continue forall {k} (t :: k) v.
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) forall a b. (a -> b) -> a -> b
$ do
    forall {k2} {rep :: k2} {r}. ImpM rep r Multicore ()
try_acquire_lock
    forall {k} (rep :: k) r op.
TExp Bool -> ImpM rep r op () -> ImpM rep r op ()
sUnless (TExp Int32
lock_acquired forall {k} (t :: k) v.
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) forall a b. (a -> b) -> a -> b
$ do
      forall {k} (rep :: k) inner r op.
Mem rep inner =>
[LParam rep] -> ImpM rep r op ()
dLParams [Param LetDecMem]
acc_params
      forall {k} {rep :: k} {r} {op}. ImpM rep r op ()
bind_acc_params
      forall {r} {op}. ImpM MCMem r op ()
op_body
      forall {k} {rep :: k} {r} {op}. ImpM rep r op ()
do_hist
      forall {k2} {rep :: k2} {r}. ImpM rep r Multicore ()
release_lock
  where
    writeArray :: [TExp Int64] -> VName -> SubExp -> ImpM rep r op ()
writeArray [TExp Int64]
bucket VName
arr SubExp
val = forall {k} (rep :: k) r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
arr [TExp Int64]
bucket SubExp
val []

atomicUpdateCAS ::
  PrimType ->
  VName ->
  VName ->
  [Imp.TExp Int64] ->
  VName ->
  MulticoreGen () ->
  MulticoreGen ()
atomicUpdateCAS :: PrimType
-> VName
-> VName
-> [TExp Int64]
-> VName
-> ImpM MCMem HostEnv Multicore ()
-> ImpM MCMem HostEnv Multicore ()
atomicUpdateCAS PrimType
t VName
arr VName
old [TExp Int64]
bucket VName
x ImpM MCMem HostEnv Multicore ()
do_op = do
  TV Int32
run_loop <- forall {k1} {k2} (t :: k1) (rep :: k2) r op.
[Char] -> TExp t -> ImpM rep r op (TV t)
dPrimV [Char]
"run_loop" (TExp Int32
0 :: Imp.TExp Int32)
  (VName
arr', Space
_a_space, Count Elements (TExp Int64)
bucket_offset) <- forall {k} (rep :: k) r op.
VName
-> [TExp Int64]
-> ImpM rep r op (VName, Space, Count Elements (TExp Int64))
fullyIndexArray VName
arr [TExp Int64]
bucket

  PrimType
bytes <- Int -> MulticoreGen PrimType
toIntegral forall a b. (a -> b) -> a -> b
$ PrimType -> Int
primBitSize PrimType
t
  let (PrimExp v -> PrimExp v
toBits, PrimExp v -> PrimExp v
fromBits) =
        case PrimType
t of
          FloatType FloatType
Float16 ->
            ( \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"to_bits16" [PrimExp v
v] PrimType
int16,
              \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"from_bits16" [PrimExp v
v] PrimType
t
            )
          FloatType FloatType
Float32 ->
            ( \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"to_bits32" [PrimExp v
v] PrimType
int32,
              \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"from_bits32" [PrimExp v
v] PrimType
t
            )
          FloatType FloatType
Float64 ->
            ( \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"to_bits64" [PrimExp v
v] PrimType
int64,
              \PrimExp v
v -> forall v. [Char] -> [PrimExp v] -> PrimType -> PrimExp v
Imp.FunExp [Char]
"from_bits64" [PrimExp v
v] PrimType
t
            )
          PrimType
_ -> (forall a. a -> a
id, forall a. a -> a
id)

      int :: PrimType
int
        | PrimType -> Int
primBitSize PrimType
t forall a. Eq a => a -> a -> Bool
== Int
16 = PrimType
int16
        | PrimType -> Int
primBitSize PrimType
t forall a. Eq a => a -> a -> Bool
== Int
32 = PrimType
int32
        | Bool
otherwise = PrimType
int64

  forall {k} (rep :: k) r op a. ImpM rep r op a -> ImpM rep r op a
everythingVolatile forall a b. (a -> b) -> a -> b
$ forall {k} (rep :: k) r op.
VName -> [TExp Int64] -> SubExp -> [TExp Int64] -> ImpM rep r op ()
copyDWIMFix VName
old [] (VName -> SubExp
Var VName
arr) [TExp Int64]
bucket

  VName
old_bits_v <- forall {k} (t :: k). TV t -> VName
tvVar forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall {k1} {k2} (rep :: k1) r op (t :: k2).
[Char] -> PrimType -> ImpM rep r op (TV t)
dPrim [Char]
"old_bits" PrimType
int
  VName
old_bits_v forall {k} (rep :: k) r op. VName -> Exp -> ImpM rep r op ()
<~~ forall {v}. PrimExp v -> PrimExp v
toBits (VName -> PrimType -> Exp
Imp.var VName
old PrimType
t)
  let old_bits :: Exp
old_bits = VName -> PrimType -> Exp
Imp.var VName
old_bits_v PrimType
int

  -- While-loop: Try to insert your value
  forall {k} (rep :: k) r op.
TExp Bool -> ImpM rep r op () -> ImpM rep r op ()
sWhile (forall {k} (t :: k). TV t -> TExp t
tvExp TV Int32
run_loop forall {k} (t :: k) v.
TPrimExp t v -> TPrimExp t v -> TPrimExp Bool v
.==. TExp Int32
0) forall a b. (a -> b) -> a -> b
$ do
    VName
x forall {k} (rep :: k) r op. VName -> Exp -> ImpM rep r op ()
<~~ VName -> PrimType -> Exp
Imp.var VName
old PrimType
t
    ImpM MCMem HostEnv Multicore ()
do_op -- Writes result into x
    forall {k} op (rep :: k) r. op -> ImpM rep r op ()
sOp forall b c a. (b -> c) -> (a -> b) -> a -> c
. AtomicOp -> Multicore
Imp.Atomic forall a b. (a -> b) -> a -> b
$
      PrimType
-> VName
-> VName
-> Count Elements (TExp Int32)
-> VName
-> Exp
-> AtomicOp
Imp.AtomicCmpXchg
        PrimType
bytes
        VName
old_bits_v
        VName
arr'
        (forall {k} (t :: k) v. IntExp t => TPrimExp t v -> TPrimExp Int32 v
sExt32 forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Count Elements (TExp Int64)
bucket_offset)
        (forall {k} (t :: k). TV t -> VName
tvVar TV Int32
run_loop)
        (forall {v}. PrimExp v -> PrimExp v
toBits (VName -> PrimType -> Exp
Imp.var VName
x PrimType
t))
    VName
old forall {k} (rep :: k) r op. VName -> Exp -> ImpM rep r op ()
<~~ forall {v}. PrimExp v -> PrimExp v
fromBits Exp
old_bits

supportedPrims :: Int -> Bool
supportedPrims :: Int -> Bool
supportedPrims Int
8 = Bool
True
supportedPrims Int
16 = Bool
True
supportedPrims Int
32 = Bool
True
supportedPrims Int
64 = Bool
True
supportedPrims Int
_ = Bool
False

-- Supported bytes lengths by GCC (and clang) compiler
toIntegral :: Int -> MulticoreGen PrimType
toIntegral :: Int -> MulticoreGen PrimType
toIntegral Int
8 = forall (f :: * -> *) a. Applicative f => a -> f a
pure PrimType
int8
toIntegral Int
16 = forall (f :: * -> *) a. Applicative f => a -> f a
pure PrimType
int16
toIntegral Int
32 = forall (f :: * -> *) a. Applicative f => a -> f a
pure PrimType
int32
toIntegral Int
64 = forall (f :: * -> *) a. Applicative f => a -> f a
pure PrimType
int64
toIntegral Int
b = forall a. HasCallStack => [Char] -> a
error forall a b. (a -> b) -> a -> b
$ [Char]
"number of bytes is not supported for CAS - " forall a. [a] -> [a] -> [a]
++ forall a. Pretty a => a -> [Char]
prettyString Int
b