module Futhark.CodeGen.ImpGen.Multicore.Base
  ( extractAllocations,
    compileThreadResult,
    Locks (..),
    HostEnv (..),
    AtomicBinOp,
    MulticoreGen,
    decideScheduling,
    decideScheduling',
    groupResultArrays,
    renameSegBinOp,
    freeParams,
    renameHistOpLambda,
    atomicUpdateLocking,
    AtomicUpdate (..),
    Locking (..),
    getSpace,
    getIterationDomain,
    getReturnParams,
    segOpString,
  )
where

import Control.Monad
import Data.Bifunctor
import qualified Data.Map as M
import Data.Maybe
import qualified Futhark.CodeGen.ImpCode.Multicore as Imp
import Futhark.CodeGen.ImpGen
import Futhark.Error
import Futhark.IR.MCMem
import Futhark.Transform.Rename
import Prelude hiding (quot, rem)

-- | Is there an atomic t'BinOp' corresponding to this t'BinOp'?
type AtomicBinOp =
  BinOp ->
  Maybe (VName -> VName -> Imp.Count Imp.Elements (Imp.TExp Int32) -> Imp.Exp -> Imp.AtomicOp)

-- | Information about the locks available for accumulators.
data Locks = Locks
  { locksArray :: VName,
    locksCount :: Int
  }

data HostEnv = HostEnv
  { hostAtomics :: AtomicBinOp,
    hostLocks :: M.Map VName Locks
  }

type MulticoreGen = ImpM MCMem HostEnv Imp.Multicore

segOpString :: SegOp () MCMem -> MulticoreGen String
segOpString SegMap {} = return "segmap"
segOpString SegRed {} = return "segred"
segOpString SegScan {} = return "segscan"
segOpString SegHist {} = return "seghist"

arrParam :: VName -> MulticoreGen Imp.Param
arrParam arr = do
  name_entry <- lookupVar arr
  case name_entry of
    ArrayVar _ (ArrayEntry (MemLoc mem _ _) _) ->
      return $ Imp.MemParam mem DefaultSpace
    _ -> error $ "arrParam: could not handle array " ++ show arr

toParam :: VName -> TypeBase shape u -> MulticoreGen [Imp.Param]
toParam name (Prim pt) = return [Imp.ScalarParam name pt]
toParam name (Mem space) = return [Imp.MemParam name space]
toParam name Array {} = pure <$> arrParam name
toParam _name Acc {} = pure [] -- FIXME?  Are we sure this works?

getSpace :: SegOp () MCMem -> SegSpace
getSpace (SegHist _ space _ _ _) = space
getSpace (SegRed _ space _ _ _) = space
getSpace (SegScan _ space _ _ _) = space
getSpace (SegMap _ space _ _) = space

getIterationDomain :: SegOp () MCMem -> SegSpace -> MulticoreGen (Imp.TExp Int64)
getIterationDomain SegMap {} space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map toInt64Exp ns
  return $ product ns_64
getIterationDomain _ space = do
  let ns = map snd $ unSegSpace space
      ns_64 = map toInt64Exp ns
  case unSegSpace space of
    [_] -> return $ product ns_64
    -- A segmented SegOp is over the segments
    -- so we drop the last dimension, which is
    -- executed sequentially
    _ -> return $ product $ init ns_64

-- When the SegRed's return value is a scalar
-- we perform a call by value-result in the segop function
getReturnParams :: Pat MCMem -> SegOp () MCMem -> MulticoreGen [Imp.Param]
getReturnParams pat SegRed {} =
  -- It's a good idea to make sure any prim values are initialised, as
  -- we will load them (redundantly) in the task code, and
  -- uninitialised values are UB.
  fmap concat . forM (patElems pat) $ \pe -> do
    case patElemType pe of
      Prim pt -> patElemName pe <~~ ValueExp (blankPrimValue pt)
      _ -> pure ()
    toParam (patElemName pe) (patElemType pe)
getReturnParams _ _ = return mempty

renameSegBinOp :: [SegBinOp MCMem] -> MulticoreGen [SegBinOp MCMem]
renameSegBinOp segbinops =
  forM segbinops $ \(SegBinOp comm lam ne shape) -> do
    lam' <- renameLambda lam
    return $ SegBinOp comm lam' ne shape

compileThreadResult ::
  SegSpace ->
  PatElem MCMem ->
  KernelResult ->
  MulticoreGen ()
compileThreadResult space pe (Returns _ _ what) = do
  let is = map (Imp.le64 . fst) $ unSegSpace space
  copyDWIMFix (patElemName pe) is what []
compileThreadResult _ _ ConcatReturns {} =
  compilerBugS "compileThreadResult: ConcatReturn unhandled."
compileThreadResult _ _ WriteReturns {} =
  compilerBugS "compileThreadResult: WriteReturns unhandled."
compileThreadResult _ _ TileReturns {} =
  compilerBugS "compileThreadResult: TileReturns unhandled."
compileThreadResult _ _ RegTileReturns {} =
  compilerBugS "compileThreadResult: RegTileReturns unhandled."

freeVariables :: Imp.Code -> [VName] -> [VName]
freeVariables code names =
  namesToList $ freeIn code `namesSubtract` namesFromList names

freeParams :: Imp.Code -> [VName] -> MulticoreGen [Imp.Param]
freeParams code names = do
  let freeVars = freeVariables code names
  ts <- mapM lookupType freeVars
  concat <$> zipWithM toParam freeVars ts

-- | Arrays for storing group results shared between threads
groupResultArrays ::
  String ->
  SubExp ->
  [SegBinOp MCMem] ->
  MulticoreGen [[VName]]
groupResultArrays s num_threads reds =
  forM reds $ \(SegBinOp _ lam _ shape) ->
    forM (lambdaReturnType lam) $ \t -> do
      let full_shape = Shape [num_threads] <> shape <> arrayShape t
      sAllocArray s (elemType t) full_shape DefaultSpace

isLoadBalanced :: Imp.Code -> Bool
isLoadBalanced (a Imp.:>>: b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.For _ _ a) = isLoadBalanced a
isLoadBalanced (Imp.If _ a b) = isLoadBalanced a && isLoadBalanced b
isLoadBalanced (Imp.Comment _ a) = isLoadBalanced a
isLoadBalanced Imp.While {} = False
isLoadBalanced (Imp.Op (Imp.ParLoop _ _ _ code _ _ _)) = isLoadBalanced code
isLoadBalanced _ = True

segBinOpComm' :: [SegBinOp rep] -> Commutativity
segBinOpComm' = mconcat . map segBinOpComm

decideScheduling' :: SegOp () rep -> Imp.Code -> Imp.Scheduling
decideScheduling' SegHist {} _ = Imp.Static
decideScheduling' SegScan {} _ = Imp.Static
decideScheduling' (SegRed _ _ reds _ _) code =
  case segBinOpComm' reds of
    Commutative -> decideScheduling code
    Noncommutative -> Imp.Static
decideScheduling' SegMap {} code = decideScheduling code

decideScheduling :: Imp.Code -> Imp.Scheduling
decideScheduling code =
  if isLoadBalanced code
    then Imp.Static
    else Imp.Dynamic

-- | Try to extract invariant allocations.  If we assume that the
-- given 'Imp.Code' is the body of a 'SegOp', then it is always safe
-- to move the immediate allocations to the prebody.
extractAllocations :: Imp.Code -> (Imp.Code, Imp.Code)
extractAllocations segop_code = f segop_code
  where
    declared = Imp.declaredIn segop_code
    f (Imp.DeclareMem name space) =
      -- Hoisting declarations out is always safe.
      (Imp.DeclareMem name space, mempty)
    f (Imp.Allocate name size space)
      | not $ freeIn size `namesIntersect` declared =
        (Imp.Allocate name size space, mempty)
    f (x Imp.:>>: y) = f x <> f y
    f (Imp.While cond body) =
      (mempty, Imp.While cond body)
    f (Imp.For i bound body) =
      (mempty, Imp.For i bound body)
    f (Imp.Comment s code) =
      second (Imp.Comment s) (f code)
    f Imp.Free {} =
      mempty
    f (Imp.If cond tcode fcode) =
      let (ta, tcode') = f tcode
          (fa, fcode') = f fcode
       in (ta <> fa, Imp.If cond tcode' fcode')
    f (Imp.Op (Imp.ParLoop s i prebody body postbody free info)) =
      let (body_allocs, body') = extractAllocations body
          (free_allocs, here_allocs) = f body_allocs
          free' =
            filter
              ( not
                  . (`nameIn` Imp.declaredIn body_allocs)
                  . Imp.paramName
              )
              free
       in ( free_allocs,
            here_allocs
              <> Imp.Op (Imp.ParLoop s i prebody body' postbody free' info)
          )
    f code =
      (mempty, code)

-------------------------------
------- SegHist helpers -------
-------------------------------
renameHistOpLambda :: [HistOp MCMem] -> MulticoreGen [HistOp MCMem]
renameHistOpLambda hist_ops =
  forM hist_ops $ \(HistOp w rf dest neutral shape lam) -> do
    lam' <- renameLambda lam
    return $ HistOp w rf dest neutral shape lam'

-- | Locking strategy used for an atomic update.
data Locking = Locking
  { -- | Array containing the lock.
    lockingArray :: VName,
    -- | Value for us to consider the lock free.
    lockingIsUnlocked :: Imp.TExp Int32,
    -- | What to write when we lock it.
    lockingToLock :: Imp.TExp Int32,
    -- | What to write when we unlock it.
    lockingToUnlock :: Imp.TExp Int32,
    -- | A transformation from the logical lock index to the
    -- physical position in the array.  This can also be used
    -- to make the lock array smaller.
    lockingMapping :: [Imp.TExp Int64] -> [Imp.TExp Int64]
  }

-- | A function for generating code for an atomic update.  Assumes
-- that the bucket is in-bounds.
type DoAtomicUpdate rep r =
  [VName] -> [Imp.TExp Int64] -> MulticoreGen ()

-- | The mechanism that will be used for performing the atomic update.
-- Approximates how efficient it will be.  Ordered from most to least
-- efficient.
data AtomicUpdate rep r
  = AtomicPrim (DoAtomicUpdate rep r)
  | -- | Can be done by efficient swaps.
    AtomicCAS (DoAtomicUpdate rep r)
  | -- | Requires explicit locking.
    AtomicLocking (Locking -> DoAtomicUpdate rep r)

atomicUpdateLocking ::
  AtomicBinOp ->
  Lambda MCMem ->
  AtomicUpdate MCMem ()
atomicUpdateLocking atomicBinOp lam
  | Just ops_and_ts <- lamIsBinOp lam,
    all (\(_, t, _, _) -> supportedPrims $ primBitSize t) ops_and_ts =
    primOrCas ops_and_ts $ \arrs bucket ->
      -- If the operator is a vectorised binary operator on 32-bit values,
      -- we can use a particularly efficient implementation. If the
      -- operator has an atomic implementation we use that, otherwise it
      -- is still a binary operator which can be implemented by atomic
      -- compare-and-swap if 32 bits.
      forM_ (zip arrs ops_and_ts) $ \(a, (op, t, x, y)) -> do
        -- Common variables.
        old <- dPrim "old" t

        (arr', _a_space, bucket_offset) <- fullyIndexArray a bucket

        case opHasAtomicSupport (tvVar old) arr' (sExt32 <$> bucket_offset) op of
          Just f -> sOp $ f $ Imp.var y t
          Nothing ->
            atomicUpdateCAS t a (tvVar old) bucket x $
              x <~~ Imp.BinOpExp op (Imp.var x t) (Imp.var y t)
  where
    opHasAtomicSupport old arr' bucket' bop = do
      let atomic f = Imp.Atomic . f old arr' bucket'
      atomic <$> atomicBinOp bop

    primOrCas ops
      | all isPrim ops = AtomicPrim
      | otherwise = AtomicCAS

    isPrim (op, _, _, _) = isJust $ atomicBinOp op
atomicUpdateLocking _ op
  | [Prim t] <- lambdaReturnType op,
    [xp, _] <- lambdaParams op,
    supportedPrims (primBitSize t) = AtomicCAS $ \[arr] bucket -> do
    old <- dPrim "old" t
    atomicUpdateCAS t arr (tvVar old) bucket (paramName xp) $
      compileBody' [xp] $ lambdaBody op
atomicUpdateLocking _ op = AtomicLocking $ \locking arrs bucket -> do
  old <- dPrim "old" int32
  continue <- dPrimVol "continue" int32 (0 :: Imp.TExp Int32)

  -- Correctly index into locks.
  (locks', _locks_space, locks_offset) <-
    fullyIndexArray (lockingArray locking) $ lockingMapping locking bucket

  -- Critical section
  let try_acquire_lock = do
        old <-- (0 :: Imp.TExp Int32)
        sOp $
          Imp.Atomic $
            Imp.AtomicCmpXchg
              int32
              (tvVar old)
              locks'
              (sExt32 <$> locks_offset)
              (tvVar continue)
              (untyped (lockingToLock locking))
      lock_acquired = tvExp continue
      -- Even the releasing is done with an atomic rather than a
      -- simple write, for memory coherency reasons.
      release_lock = do
        old <-- lockingToLock locking
        sOp $
          Imp.Atomic $
            Imp.AtomicCmpXchg
              int32
              (tvVar old)
              locks'
              (sExt32 <$> locks_offset)
              (tvVar continue)
              (untyped (lockingToUnlock locking))

  -- Preparing parameters. It is assumed that the caller has already
  -- filled the arr_params. We copy the current value to the
  -- accumulator parameters.
  let (acc_params, _arr_params) = splitAt (length arrs) $ lambdaParams op
      bind_acc_params =
        everythingVolatile $
          sComment "bind lhs" $
            forM_ (zip acc_params arrs) $ \(acc_p, arr) ->
              copyDWIMFix (paramName acc_p) [] (Var arr) bucket

  let op_body =
        sComment "execute operation" $
          compileBody' acc_params $ lambdaBody op

      do_hist =
        everythingVolatile $
          sComment "update global result" $
            zipWithM_ (writeArray bucket) arrs $ map (Var . paramName) acc_params

  -- While-loop: Try to insert your value
  sWhile (tvExp continue .==. 0) $ do
    try_acquire_lock
    sUnless (lock_acquired .==. 0) $ do
      dLParams acc_params
      bind_acc_params
      op_body
      do_hist
      release_lock
  where
    writeArray bucket arr val = copyDWIMFix arr bucket val []

atomicUpdateCAS ::
  PrimType ->
  VName ->
  VName ->
  [Imp.TExp Int64] ->
  VName ->
  MulticoreGen () ->
  MulticoreGen ()
atomicUpdateCAS t arr old bucket x do_op = do
  run_loop <- dPrimV "run_loop" (0 :: Imp.TExp Int32)
  (arr', _a_space, bucket_offset) <- fullyIndexArray arr bucket

  bytes <- toIntegral $ primBitSize t
  let (toBits, fromBits) =
        case t of
          FloatType Float16 ->
            ( \v -> Imp.FunExp "to_bits16" [v] int16,
              \v -> Imp.FunExp "from_bits16" [v] t
            )
          FloatType Float32 ->
            ( \v -> Imp.FunExp "to_bits32" [v] int32,
              \v -> Imp.FunExp "from_bits32" [v] t
            )
          FloatType Float64 ->
            ( \v -> Imp.FunExp "to_bits64" [v] int64,
              \v -> Imp.FunExp "from_bits64" [v] t
            )
          _ -> (id, id)

      int
        | primBitSize t == 16 = int16
        | primBitSize t == 32 = int32
        | otherwise = int64

  everythingVolatile $ copyDWIMFix old [] (Var arr) bucket

  old_bits_v <- tvVar <$> dPrim "old_bits" int
  old_bits_v <~~ toBits (Imp.var old t)
  let old_bits = Imp.var old_bits_v int

  -- While-loop: Try to insert your value
  sWhile (tvExp run_loop .==. 0) $ do
    x <~~ Imp.var old t
    do_op -- Writes result into x
    sOp . Imp.Atomic $
      Imp.AtomicCmpXchg
        bytes
        old_bits_v
        arr'
        (sExt32 <$> bucket_offset)
        (tvVar run_loop)
        (toBits (Imp.var x t))
    old <~~ fromBits old_bits

supportedPrims :: Int -> Bool
supportedPrims 8 = True
supportedPrims 16 = True
supportedPrims 32 = True
supportedPrims 64 = True
supportedPrims _ = False

-- Supported bytes lengths by GCC (and clang) compiler
toIntegral :: Int -> MulticoreGen PrimType
toIntegral 8 = return int8
toIntegral 16 = return int16
toIntegral 32 = return int32
toIntegral 64 = return int64
toIntegral b = error $ "number of bytes is not supported for CAS - " ++ pretty b