module Futhark.CodeGen.ImpGen.Multicore.Base
  ( extractAllocations,
    compileThreadResult,
    Locks (..),
    HostEnv (..),
    AtomicBinOp,
    MulticoreGen,
    decideScheduling,
    decideScheduling',
    groupResultArrays,
    renameSegBinOp,
    freeParams,
    renameHistOpLambda,
    atomicUpdateLocking,
    AtomicUpdate (..),
    DoAtomicUpdate,
    Locking (..),
    getSpace,
    getLoopBounds,
    getIterationDomain,
    getReturnParams,
    segOpString,
    ChunkLoopVectorization (..),
    generateChunkLoop,
    generateUniformizeLoop,
    extractVectorLane,
    inISPC,
    toParam,
    sLoopNestVectorized,
  )
where

import Control.Monad
import Data.Bifunctor
import Data.Map qualified as M
import Data.Maybe
import Futhark.CodeGen.ImpCode.Multicore qualified as Imp
import Futhark.CodeGen.ImpGen
import Futhark.Error
import Futhark.IR.MCMem
import Futhark.MonadFreshNames
import Futhark.Transform.Rename
import Prelude hiding (quot, rem)

-- | Is there an atomic t'BinOp' corresponding to this t'BinOp'?
type AtomicBinOp =
  BinOp ->
  Maybe (VName -> VName -> Imp.Count Imp.Elements (Imp.TExp Int32) -> Imp.Exp -> Imp.AtomicOp)

-- | Information about the locks available for accumulators.
data Locks = Locks
  { locksArray :: VName,
    locksCount :: Int
  }

data HostEnv = HostEnv
  { hostAtomics :: AtomicBinOp,
    hostLocks :: M.Map VName Locks
  }

type MulticoreGen = ImpM MCMem HostEnv Imp.Multicore

segOpString :: SegOp () MCMem -> MulticoreGen String
segOpString SegMap {} = pure "segmap"
segOpString SegRed {} = pure "segred"
segOpString SegScan {} = pure "segscan"
segOpString SegHist {} = pure "seghist"

arrParam :: VName -> MulticoreGen Imp.Param
arrParam arr = do
  name_entry <- lookupVar arr
  case name_entry of
    ArrayVar _ (ArrayEntry (MemLoc mem _ _) _) ->
      pure $ Imp.MemParam mem DefaultSpace
    _ -> error $ "arrParam: could not handle array " ++ show arr

toParam :: VName -> TypeBase shape u -> MulticoreGen [Imp.Param]
toParam name (Prim pt) = pure [Imp.ScalarParam name pt]
toParam name (Mem space) = pure [Imp.MemParam name space]
toParam name Array {} = pure <$> arrParam name
toParam _name Acc {} = pure [] -- FIXME? Are we sure this works?

getSpace :: SegOp () MCMem -> SegSpace
getSpace (SegHist _ space _ _ _) = space
getSpace (SegRed _ space _ _ _) = space
getSpace (SegScan _ space _ _ _) = space
getSpace (SegMap _ space _ _) = space

getLoopBounds :: MulticoreGen (Imp.TExp Int64, Imp.TExp Int64)
getLoopBounds = do
  start <- dPrim "start" int64
  end <- dPrim "end" int64
  emit $ Imp.Op $ Imp.GetLoopBounds (tvVar start) (tvVar end)
  pure (tvExp start, tvExp end)

getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (Imp.TExp Int64)
getIterationDomain SegMap {} space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map pe64 ns
  pure $ product ns_64
getIterationDomain _ space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map pe64 ns
  case unSegSpace space of
    [_] -> pure $ product ns_64
    -- A segmented SegOp is over the segments, so we drop the last
    -- dimension, which is executed sequentially.
    _ -> pure $ product $ init ns_64

-- When a SegRed returns a scalar, we pass it by value-result to the
-- segop function.
getReturnParams :: Pat LetDecMem -> SegOp () MCMem -> MulticoreGen [Imp.Param]
getReturnParams pat SegRed {} =
  -- It's a good idea to make sure any prim values are initialised, as
  -- we will load them (redundantly) in the task code, and
  -- uninitialised values are UB.
  fmap concat . forM (patElems pat) $ \pe -> do
    case patElemType pe of
      Prim pt -> patElemName pe <~~ ValueExp (blankPrimValue pt)
      _ -> pure ()
    toParam (patElemName pe) (patElemType pe)
getReturnParams _ _ = pure mempty
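-- For illustration, the helpers above compose roughly as follows when
-- setting up a SegOp.  This is a hypothetical sketch ('segOpSetup' does
-- not exist in this module); the real call sites are in the per-SegOp
-- compilers.
--
-- @
-- segOpSetup :: Pat LetDecMem -> SegOp () MCMem -> MulticoreGen (Imp.TExp Int64, [Imp.Param])
-- segOpSetup pat segop = do
--   iterations <- getIterationDomain segop $ getSpace segop
--   retvals <- getReturnParams pat segop
--   pure (iterations, retvals)
-- @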
renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp segbinops =
  forM segbinops $ \(SegBinOp comm lam ne shape) -> do
    lam' <- renameLambda lam
    pure $ SegBinOp comm lam' ne shape

compileThreadResult ::
  SegSpace ->
  PatElem LetDecMem ->
  KernelResult ->
  MulticoreGen ()
compileThreadResult space pe (Returns _ _ what) = do
  let is = map (Imp.le64 . fst) $ unSegSpace space
  copyDWIMFix (patElemName pe) is what []
compileThreadResult _ _ WriteReturns {} =
  compilerBugS "compileThreadResult: WriteReturns unhandled."
compileThreadResult _ _ TileReturns {} =
  compilerBugS "compileThreadResult: TileReturns unhandled."
compileThreadResult _ _ RegTileReturns {} =
  compilerBugS "compileThreadResult: RegTileReturns unhandled."

freeParams :: FreeIn a => a -> MulticoreGen [Imp.Param]
freeParams code = do
  let free = namesToList $ freeIn code
  ts <- mapM lookupType free
  concat <$> zipWithM toParam free ts

-- | Arrays for storing group results shared between threads.
groupResultArrays ::
  String ->
  SubExp ->
  [SegBinOp MCMem] ->
  MulticoreGen [[VName]]
groupResultArrays s num_threads reds =
  forM reds $ \(SegBinOp _ lam _ shape) ->
    forM (lambdaReturnType lam) $ \t -> do
      let full_shape = Shape [num_threads] <> shape <> arrayShape t
      sAllocArray s (elemType t) full_shape DefaultSpace

isLoadBalanced :: Imp.MCCode -> Bool
isLoadBalanced (a Imp.:>>: b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.For _ _ a) = isLoadBalanced a
isLoadBalanced (Imp.If _ a b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.Comment _ a) = isLoadBalanced a
isLoadBalanced Imp.While {} = False
isLoadBalanced (Imp.Op (Imp.ParLoop _ code _)) = isLoadBalanced code
isLoadBalanced (Imp.Op (Imp.ForEachActive _ a)) = isLoadBalanced a
isLoadBalanced (Imp.Op (Imp.ForEach _ _ _ a)) = isLoadBalanced a
isLoadBalanced (Imp.Op (Imp.ISPCKernel a _)) = isLoadBalanced a
isLoadBalanced _ = True

segBinOpComm' :: [SegBinOp rep] -> Commutativity
segBinOpComm' = mconcat . map segBinOpComm

decideScheduling' :: SegOp () rep -> Imp.MCCode -> Imp.Scheduling
decideScheduling' SegHist {} _ = Imp.Static
decideScheduling' SegScan {} _ = Imp.Static
decideScheduling' (SegRed _ _ reds _ _) code =
  case segBinOpComm' reds of
    Commutative -> decideScheduling code
    Noncommutative -> Imp.Static
decideScheduling' SegMap {} code = decideScheduling code

decideScheduling :: Imp.MCCode -> Imp.Scheduling
decideScheduling code =
  if isLoadBalanced code
    then Imp.Static
    else Imp.Dynamic

-- | Try to extract invariant allocations.  If we assume that the
-- given 'Imp.MCCode' is the body of a 'SegOp', then it is always safe
-- to move the immediate allocations to the prebody.
extractAllocations :: Imp.MCCode -> (Imp.MCCode, Imp.MCCode)
extractAllocations segop_code = f segop_code
  where
    declared = Imp.declaredIn segop_code
    f (Imp.DeclareMem name space) =
      -- Hoisting declarations out is always safe.
      (Imp.DeclareMem name space, mempty)
    f (Imp.Allocate name size space)
      | not $ freeIn size `namesIntersect` declared =
          (Imp.Allocate name size space, mempty)
    f (x Imp.:>>: y) = f x <> f y
    f (Imp.While cond body) =
      (mempty, Imp.While cond body)
    f (Imp.For i bound body) =
      (mempty, Imp.For i bound body)
    f (Imp.Comment s code) =
      second (Imp.Comment s) (f code)
    f Imp.Free {} =
      mempty
    f (Imp.If cond tcode fcode) =
      let (ta, tcode') = f tcode
          (fa, fcode') = f fcode
       in (ta <> fa, Imp.If cond tcode' fcode')
    f (Imp.Op (Imp.ParLoop s body free)) =
      let (body_allocs, body') = extractAllocations body
          (free_allocs, here_allocs) = f body_allocs
          free' =
            filter
              ( (`notNameIn` Imp.declaredIn body_allocs)
                  . Imp.paramName
              )
              free
       in ( free_allocs,
            here_allocs <> Imp.Op (Imp.ParLoop s body' free')
          )
    f code =
      (mempty, code)
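-- A minimal sketch of the typical use of 'extractAllocations': collect
-- the code for a loop body, split off the invariant allocations, and
-- emit them before the loop itself.  This mirrors what
-- 'generateChunkLoop' below actually does; the action @m@ here stands
-- for an arbitrary body-generating computation.
--
-- @
-- (body_allocs, body) <- extractAllocations <$> collect m
-- emit body_allocs
-- emit $ Imp.For i bound body
-- @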
-- | Indicates whether to vectorize a chunk loop or keep it sequential.
-- We use this to allow falling back to sequential chunk loops in cases
-- where we don't care about trying to vectorize.
data ChunkLoopVectorization = Vectorized | Scalar

-- | Emit code for the chunk loop, given an action that generates code
-- for a single iteration.
--
-- The action is called with the (symbolic) index of the current
-- iteration.
generateChunkLoop ::
  String ->
  ChunkLoopVectorization ->
  (Imp.TExp Int64 -> MulticoreGen ()) ->
  MulticoreGen ()
generateChunkLoop desc Scalar m = do
  (start, end) <- getLoopBounds
  n <- dPrimVE "n" $ end - start
  i <- newVName (desc <> "_i")
  (body_allocs, body) <- fmap extractAllocations $
    collect $ do
      addLoopVar i Int64
      m $ start + Imp.le64 i
  emit body_allocs
  -- Emit a normal sequential for loop.
  let bound = untyped n
  emit $ Imp.For i bound body
generateChunkLoop desc Vectorized m = do
  (start, end) <- getLoopBounds
  n <- dPrimVE "n" $ end - start
  i <- newVName (desc <> "_i")
  (body_allocs, body) <- fmap extractAllocations $
    collect $ do
      addLoopVar i Int64
      m $ Imp.le64 i
  emit body_allocs
  -- Emit a vectorized foreach loop.
  let from = untyped start
  let bound = untyped (start + n)
  emit $ Imp.Op $ Imp.ForEach i from bound body

-- | Emit code for a sequential loop over each vector lane, given an
-- action that generates code for a single iteration.  The action is
-- called with the symbolic index of the current iteration.
generateUniformizeLoop ::
  (Imp.TExp Int64 -> MulticoreGen ()) ->
  MulticoreGen ()
generateUniformizeLoop m = do
  i <- newVName "uni_i"
  body <- collect $ do
    addLoopVar i Int64
    m $ Imp.le64 i
  emit $ Imp.Op $ Imp.ForEachActive i body

-- | Given a piece of code, if that code performs an assignment, turn
-- that assignment into an extraction of an element from a vector on
-- the right-hand side, using a passed index for the extraction.  Other
-- code is left as-is.
extractVectorLane :: Imp.TExp Int64 -> MulticoreGen Imp.MCCode -> MulticoreGen ()
extractVectorLane j code = do
  let ut_exp = untyped j
  code' <- code
  case code' of
    Imp.SetScalar vname e -> do
      typ <- lookupType vname
      case typ of
        -- ISPC v1.17 does not yet support extract on f16, so we do
        -- this via a conversion to f32.
        Prim (FloatType Float16) -> do
          tv <- dPrim "hack_extract_f16" (FloatType Float32)
          emit $ Imp.SetScalar (tvVar tv) e
          emit $ Imp.Op $ Imp.ExtractLane vname (untyped $ tvExp tv) ut_exp
        _ ->
          emit $ Imp.Op $ Imp.ExtractLane vname e ut_exp
    _ ->
      emit code'
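-- For illustration, extracting a varying value lane by lane into
-- uniform code might look like the following.  A hypothetical sketch;
-- the real call sites live in the SegOp compilers, and 'stms' is
-- assumed to end in an assignment:
--
-- @
-- generateUniformizeLoop $ \lane ->
--   extractVectorLane lane $ collect $ compileStms mempty stms $ pure ()
-- @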
-- | Given an action that may generate some code, put that code into an
-- ISPC kernel.
inISPC :: MulticoreGen () -> MulticoreGen ()
inISPC code = do
  code' <- collect code
  free <- freeParams code'
  emit $ Imp.Op $ Imp.ISPCKernel code' free

-------------------------------
------- SegRed helpers -------
-------------------------------

sForVectorized' :: VName -> Imp.Exp -> MulticoreGen () -> MulticoreGen ()
sForVectorized' i bound body = do
  let it = case primExpType bound of
        IntType bound_t -> bound_t
        t -> error $ "sForVectorized': bound " ++ prettyString bound ++ " is of type " ++ prettyString t
  addLoopVar i it
  body' <- collect body
  emit $
    Imp.Op $
      Imp.ForEach i (Imp.ValueExp $ blankPrimValue $ Imp.IntType Imp.Int64) bound body'

sForVectorized :: String -> Imp.TExp t -> (Imp.TExp t -> MulticoreGen ()) -> MulticoreGen ()
sForVectorized i bound body = do
  i' <- newVName i
  sForVectorized' i' (untyped bound) $
    body $
      TPrimExp $
        Imp.var i' $
          primExpType $
            untyped bound

-- | Like sLoopNest, but puts a vectorized loop at the innermost layer.
sLoopNestVectorized ::
  Shape ->
  ([Imp.TExp Int64] -> MulticoreGen ()) ->
  MulticoreGen ()
sLoopNestVectorized = sLoopNest' [] . shapeDims
  where
    sLoopNest' is [] f = f $ reverse is
    sLoopNest' is [d] f =
      sForVectorized "nest_i" (pe64 d) $ \i -> sLoopNest' (i : is) [] f
    sLoopNest' is (d : ds) f =
      sFor "nest_i" (pe64 d) $ \i -> sLoopNest' (i : is) ds f

-------------------------------
------- SegHist helpers -------
-------------------------------

renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda hist_ops =
  forM hist_ops $ \(HistOp w rf dest neutral shape lam) -> do
    lam' <- renameLambda lam
    pure $ HistOp w rf dest neutral shape lam'

-- | Locking strategy used for an atomic update.
data Locking = Locking
  { -- | Array containing the lock.
    lockingArray :: VName,
    -- | Value for us to consider the lock free.
    lockingIsUnlocked :: Imp.TExp Int32,
    -- | What to write when we lock it.
    lockingToLock :: Imp.TExp Int32,
    -- | What to write when we unlock it.
    lockingToUnlock :: Imp.TExp Int32,
    -- | A transformation from the logical lock index to the physical
    -- position in the array.  This can also be used to make the lock
    -- array smaller.
    lockingMapping :: [Imp.TExp Int64] -> [Imp.TExp Int64]
  }

-- | A function for generating code for an atomic update.  Assumes
-- that the bucket is in-bounds.
type DoAtomicUpdate rep r =
  [VName] -> [Imp.TExp Int64] -> MulticoreGen ()

-- | The mechanism that will be used for performing the atomic update.
-- Approximates how efficient it will be.  Ordered from most to least
-- efficient.
data AtomicUpdate rep r
  = AtomicPrim (DoAtomicUpdate rep r)
  | -- | Can be done by efficient swaps.
    AtomicCAS (DoAtomicUpdate rep r)
  | -- | Requires explicit locking.
    AtomicLocking (Locking -> DoAtomicUpdate rep r)
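-- A sketch of how a caller might dispatch on an 'AtomicUpdate' (the
-- names 'atomics', 'lam', 'arrs', 'bucket' and 'locks' are
-- hypothetical; the real call sites live in the SegHist compiler):
--
-- @
-- case atomicUpdateLocking atomics lam of
--   AtomicPrim f -> f arrs bucket
--   AtomicCAS f -> f arrs bucket
--   AtomicLocking f -> f (Locking locks 0 1 0 id) arrs bucket
-- @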
atomicUpdateLocking :: AtomicBinOp -> Lambda MCMem -> AtomicUpdate MCMem ()
atomicUpdateLocking atomicBinOp lam
  | Just ops_and_ts <- lamIsBinOp lam,
    all (\(_, t, _, _) -> supportedPrims $ primBitSize t) ops_and_ts =
      primOrCas ops_and_ts $ \arrs bucket ->
        -- If the operator is a vectorised binary operator on 32-bit values,
        -- we can use a particularly efficient implementation.  If the
        -- operator has an atomic implementation we use that; otherwise it
        -- is still a binary operator, which can be implemented with atomic
        -- compare-and-swap if it is of a supported size.
        forM_ (zip arrs ops_and_ts) $ \(a, (op, t, x, y)) -> do
          -- Common variables.
          old <- dPrim "old" t
          (arr', _a_space, bucket_offset) <- fullyIndexArray a bucket
          case opHasAtomicSupport (tvVar old) arr' (sExt32 <$> bucket_offset) op of
            Just f -> sOp $ f $ Imp.var y t
            Nothing ->
              atomicUpdateCAS t a (tvVar old) bucket x $
                x <~~ Imp.BinOpExp op (Imp.var x t) (Imp.var y t)
  where
    opHasAtomicSupport old arr' bucket' bop = do
      let atomic f = Imp.Atomic . f old arr' bucket'
      atomic <$> atomicBinOp bop

    primOrCas ops
      | all isPrim ops = AtomicPrim
      | otherwise = AtomicCAS

    isPrim (op, _, _, _) = isJust $ atomicBinOp op
atomicUpdateLocking _ op
  | [Prim t] <- lambdaReturnType op,
    [xp, _] <- lambdaParams op,
    supportedPrims (primBitSize t) = AtomicCAS $ \[arr] bucket -> do
      old <- dPrim "old" t
      atomicUpdateCAS t arr (tvVar old) bucket (paramName xp) $
        compileBody' [xp] $
          lambdaBody op
atomicUpdateLocking _ op = AtomicLocking $ \locking arrs bucket -> do
  old <- dPrim "old" int32
  continue <- dPrimVol "continue" int32 (0 :: Imp.TExp Int32)

  -- Correctly index into locks.
  (locks', _locks_space, locks_offset) <-
    fullyIndexArray (lockingArray locking) $ lockingMapping locking bucket

  -- Critical section
  let try_acquire_lock = do
        old <-- (0 :: Imp.TExp Int32)
        sOp . Imp.Atomic $
          Imp.AtomicCmpXchg
            int32
            (tvVar old)
            locks'
            (sExt32 <$> locks_offset)
            (tvVar continue)
            (untyped (lockingToLock locking))
      lock_acquired = tvExp continue
      -- Even the releasing is done with an atomic rather than a
      -- simple write, for memory coherency reasons.
      release_lock = do
        old <-- lockingToLock locking
        sOp . Imp.Atomic $
          Imp.AtomicCmpXchg
            int32
            (tvVar old)
            locks'
            (sExt32 <$> locks_offset)
            (tvVar continue)
            (untyped (lockingToUnlock locking))

  -- Preparing parameters.  It is assumed that the caller has already
  -- filled the arr_params.  We copy the current value to the
  -- accumulator parameters.
  let (acc_params, _arr_params) = splitAt (length arrs) $ lambdaParams op
      bind_acc_params =
        everythingVolatile $
          sComment "bind lhs" $
            forM_ (zip acc_params arrs) $ \(acc_p, arr) ->
              copyDWIMFix (paramName acc_p) [] (Var arr) bucket

  let op_body =
        sComment "execute operation" $
          compileBody' acc_params $
            lambdaBody op

      do_hist =
        everythingVolatile $
          sComment "update global result" $
            zipWithM_ (writeArray bucket) arrs $
              map (Var . paramName) acc_params

  -- While-loop: Try to insert your value
  sWhile (tvExp continue .==. 0) $ do
    try_acquire_lock
    sUnless (lock_acquired .==. 0) $ do
      dLParams acc_params
      bind_acc_params
      op_body
      do_hist
      release_lock
  where
    writeArray bucket arr val = copyDWIMFix arr bucket val []

atomicUpdateCAS ::
  PrimType ->
  VName ->
  VName ->
  [Imp.TExp Int64] ->
  VName ->
  MulticoreGen () ->
  MulticoreGen ()
atomicUpdateCAS t arr old bucket x do_op = do
  run_loop <- dPrimV "run_loop" (0 :: Imp.TExp Int32)
  (arr', _a_space, bucket_offset) <- fullyIndexArray arr bucket
  bytes <- toIntegral $ primBitSize t

  let (toBits, fromBits) =
        case t of
          FloatType Float16 ->
            ( \v -> Imp.FunExp "to_bits16" [v] int16,
              \v -> Imp.FunExp "from_bits16" [v] t
            )
          FloatType Float32 ->
            ( \v -> Imp.FunExp "to_bits32" [v] int32,
              \v -> Imp.FunExp "from_bits32" [v] t
            )
          FloatType Float64 ->
            ( \v -> Imp.FunExp "to_bits64" [v] int64,
              \v -> Imp.FunExp "from_bits64" [v] t
            )
          _ -> (id, id)

      int
        | primBitSize t == 16 = int16
        | primBitSize t == 32 = int32
        | otherwise = int64

  everythingVolatile $ copyDWIMFix old [] (Var arr) bucket

  old_bits_v <- tvVar <$> dPrim "old_bits" int
  old_bits_v <~~ toBits (Imp.var old t)
  let old_bits = Imp.var old_bits_v int

  -- While-loop: Try to insert your value
  sWhile (tvExp run_loop .==. 0) $ do
    x <~~ Imp.var old t
    do_op -- Writes result into x
    sOp . Imp.Atomic $
      Imp.AtomicCmpXchg
        bytes
        old_bits_v
        arr'
        (sExt32 <$> bucket_offset)
        (tvVar run_loop)
        (toBits (Imp.var x t))
    old <~~ fromBits old_bits
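-- The code emitted by 'atomicUpdateCAS' follows the usual CAS retry
-- loop.  Roughly, as a pseudocode sketch of the generated code (the
-- exact operand order of the compare-and-swap is as in
-- 'Imp.AtomicCmpXchg' above):
--
-- @
-- old = arr[bucket]
-- old_bits = to_bits(old)
-- while (!run_loop) {
--   x = old
--   do_op()   // writes op(x, ...) into x
--   run_loop = cmpxchg(&arr[bucket], &old_bits, to_bits(x))
--   old = from_bits(old_bits)
-- }
-- @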
supportedPrims :: Int -> Bool
supportedPrims 8 = True
supportedPrims 16 = True
supportedPrims 32 = True
supportedPrims 64 = True
supportedPrims _ = False

-- Bit widths for which GCC (and Clang) provide atomic builtins.
toIntegral :: Int -> MulticoreGen PrimType
toIntegral 8 = pure int8
toIntegral 16 = pure int16
toIntegral 32 = pure int32
toIntegral 64 = pure int64
toIntegral b = error $ "toIntegral: unsupported bit width for CAS: " ++ prettyString b