module Futhark.CodeGen.ImpGen.Multicore.SegHist
  ( compileSegHist,
  )
where

import Control.Monad
import Data.List (zip4)
import Data.Maybe (listToMaybe)
import Futhark.CodeGen.ImpCode.Multicore qualified as Imp
import Futhark.CodeGen.ImpGen
import Futhark.CodeGen.ImpGen.Multicore.Base
import Futhark.CodeGen.ImpGen.Multicore.SegRed (compileSegRed')
import Futhark.IR.MCMem
import Futhark.MonadFreshNames
import Futhark.Transform.Rename (renameLambda)
import Futhark.Util (chunks, splitFromEnd, takeLast)
import Futhark.Util.IntegralExp (rem)
import Prelude hiding (quot, rem)

compileSegHist ::
  Pat LetDecMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  TV Int32 ->
  MulticoreGen Imp.MCCode
compileSegHist pat space histops kbody nsubtasks
  | [_] <- unSegSpace space =
      nonsegmentedHist pat space histops kbody nsubtasks
  | otherwise =
      segmentedHist pat space histops kbody

-- | Split some list into chunks equal to the number of values
-- returned by each 'HistOp'.
segHistOpChunks :: [HistOp rep] -> [a] -> [[a]]
segHistOpChunks = chunks . map (length . histNeutral)

histSize :: HistOp MCMem -> Imp.TExp Int64
histSize = product . map pe64 . shapeDims . histShape

genHistOpParams :: HistOp MCMem -> MulticoreGen ()
genHistOpParams histop =
  dScope Nothing $ scopeOfLParams $ lambdaParams $ histOp histop

renameHistop :: HistOp MCMem -> MulticoreGen (HistOp MCMem)
renameHistop histop = do
  let op = histOp histop
  lambda' <- renameLambda op
  pure histop {histOp = lambda'}

nonsegmentedHist ::
  Pat LetDecMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  TV Int32 ->
  MulticoreGen Imp.MCCode
nonsegmentedHist pat space histops kbody num_histos = do
  let ns = map snd $ unSegSpace space
      ns_64 = map pe64 ns
      num_histos' = tvExp num_histos
      hist_width = maybe 0 histSize $ listToMaybe histops
      use_subhistogram = sExt64 num_histos' * hist_width .<=. product ns_64

  histops' <- mapM renameHistop histops

  -- Only do something if there is actually input.
  collect $
    sUnless (product ns_64 .==. 0) $
      sIf
        use_subhistogram
        (subHistogram pat space histops num_histos kbody)
        (atomicHistogram pat space histops' kbody)

-- | Atomic histogram approach.
--
-- The implementation has three sub-strategies, depending on the type
-- of the operator:
--
-- 1. If the values are integral scalars, a directly supported atomic
--    update is used.
--
-- 2. If the values fit in a single memory location, e.g. a float, a
--    CAS operation is used to perform the update, with the float cast
--    to an integral scalar.
--
-- 3. Otherwise a locking-based approach is used.
--
-- Strategies 1 and 2 currently only work for 32-bit and 64-bit types,
-- but GCC has support for 8-, 16- and 128-bit types as well.
onOpAtomic :: HistOp MCMem -> MulticoreGen ([VName] -> [Imp.TExp Int64] -> MulticoreGen ())
onOpAtomic op = do
  atomics <- hostAtomics <$> askEnv
  let lambda = histOp op
      do_op = atomicUpdateLocking atomics lambda
  case do_op of
    AtomicPrim f -> pure f
    AtomicCAS f -> pure f
    AtomicLocking f -> do
      -- Allocate a static array of locks, as in the GPU backend.
      let num_locks = 100151 -- This number is taken from the GPU backend.
          dims =
            map pe64 $
              shapeDims (histOpShape op <> histShape op)
      locks <- sStaticArray "hist_locks" int32 $ Imp.ArrayZeros num_locks
      let l' = Locking locks 0 1 0 (pure . (`rem` fromIntegral num_locks) . flattenIndex dims)
      pure $ f l'
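
-- As an illustration of strategies 2 and 3 (a sketch of the shape of the
-- generated C, not the exact code the backend emits), a CAS-based f32
-- update corresponds to:
--
--   uint32_t old = *(uint32_t *)p, assumed;
--   do {
--     assumed = old;
--     float upd = op(bits_to_float(assumed), v);
--     old = cas((uint32_t *)p, assumed, float_to_bits(upd));
--   } while (old != assumed);
--
-- while the locking strategy guards an arbitrary update with a spinlock
-- chosen by hashing the flattened bucket index into the static lock array:
--
--   while (cas(&locks[flat_bucket % 100151], 0, 1) != 0) {}
--   dest[bucket] = op(dest[bucket], v);
--   locks[flat_bucket % 100151] = 0;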

atomicHistogram ::
  Pat LetDecMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  MulticoreGen ()
atomicHistogram pat space histops kbody = do
  let (is, ns) = unzip $ unSegSpace space
      ns_64 = map pe64 ns
  let num_red_res = length histops + sum (map (length . histNeutral) histops)
      (all_red_pes, map_pes) = splitAt num_red_res $ patElems pat

  atomicOps <- mapM onOpAtomic histops

  body <- collect $ do
    dPrim_ (segFlat space) int64
    sOp $ Imp.GetTaskId (segFlat space)

    generateChunkLoop "SegHist" Scalar $ \flat_idx -> do
      zipWithM_ dPrimV_ is $ unflattenIndex ns_64 flat_idx
      compileStms mempty (kernelBodyStms kbody) $ do
        let (red_res, map_res) = splitFromEnd (length map_pes) $ kernelBodyResult kbody
            red_res_split = splitHistResults histops $ map kernelResultSubExp red_res

        let pes_per_op = chunks (map (length . histDest) histops) all_red_pes
        forM_ (zip4 histops red_res_split atomicOps pes_per_op) $
          \(HistOp dest_shape _ _ _ shape lam, (bucket, vs'), do_op, dest_res) -> do
            let (_is_params, vs_params) = splitAt (length vs') $ lambdaParams lam
                dest_shape' = map pe64 $ shapeDims dest_shape
                bucket' = map pe64 bucket
                bucket_in_bounds = inBounds (Slice (map DimFix bucket')) dest_shape'

            sComment "save map-out results" $
              forM_ (zip map_pes map_res) $ \(pe, res) ->
                copyDWIMFix (patElemName pe) (map Imp.le64 is) (kernelResultSubExp res) []

            sComment "perform updates" $
              sWhen bucket_in_bounds $ do
                let bucket_is = map Imp.le64 (init is) ++ bucket'
                dLParams $ lambdaParams lam
                sLoopNest shape $ \is' -> do
                  forM_ (zip vs_params vs') $ \(p, res) ->
                    copyDWIMFix (paramName p) [] res is'
                  do_op (map patElemName dest_res) (bucket_is ++ is')

  free_params <- freeParams body
  emit $ Imp.Op $ Imp.ParLoop "atomic_seg_hist" body free_params

updateHisto ::
  HistOp MCMem ->
  [VName] ->
  [Imp.TExp Int64] ->
  Imp.TExp Int64 ->
  [Param LParamMem] ->
  MulticoreGen ()
updateHisto op arrs bucket j uni_acc = do
  let bind_acc_params =
        forM_ (zip uni_acc arrs) $ \(acc_u, arr) ->
          copyDWIMFix (paramName acc_u) [] (Var arr) bucket
      op_body = compileBody' [] $ lambdaBody $ histOp op
      writeArray arr val =
        extractVectorLane j $ collect $ copyDWIMFix arr bucket val []
      do_hist =
        zipWithM_ writeArray arrs $ map resSubExp $ bodyResult $ lambdaBody $ histOp op

  sComment "Start of body" $ do
    bind_acc_params
    op_body
    do_hist
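
-- Note on the vectorised caller of updateHisto: inside an ISPC chunk loop
-- the loaded values are varying, while each subhistogram update must happen
-- one lane at a time.  'extractVectorLane j' wraps a write so that only
-- lane j's value is used, conceptually like ISPC's
--
--   uniform float v = extract(varying_v, j);
--
-- (an illustrative analogy; the actual lowering is performed by the ISPC
-- code generator).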

-- Generates num_histos subhistograms of the same size as the
-- destination histogram.  Then, for each chunk of the input, a
-- subhistogram is computed, and the subhistograms are finally
-- combined through a segmented reduction across the histogram
-- indices.  This is expected to be fast if len(histDest) is small.
subHistogram ::
  Pat LetDecMem ->
  SegSpace ->
  [HistOp MCMem] ->
  TV Int32 ->
  KernelBody MCMem ->
  MulticoreGen ()
subHistogram pat space histops num_histos kbody = do
  emit $ Imp.DebugPrint "subHistogram segHist" Nothing

  let (is, ns) = unzip $ unSegSpace space
      ns_64 = map pe64 ns

  let pes = patElems pat
      num_red_res = length histops + sum (map (length . histNeutral) histops)
      map_pes = drop num_red_res pes
      per_red_pes = segHistOpChunks histops $ patElems pat

  -- Allocate array of subhistograms in the calling thread.  Each
  -- task will work in its own private allocations (to avoid false
  -- sharing), but this is where they will ultimately copy their
  -- results.
  global_subhistograms <- forM histops $ \histop ->
    forM (histType histop) $ \t -> do
      let shape = Shape [tvSize num_histos] <> arrayShape t
      sAllocArray "subhistogram" (elemType t) shape DefaultSpace

  let tid' = Imp.le64 $ segFlat space

  -- Generate loop body of parallel function.
  body <- collect $ do
    dPrim_ (segFlat space) int64
    sOp $ Imp.GetTaskId (segFlat space)
    local_subhistograms <- forM (zip per_red_pes histops) $ \(pes', histop) -> do
      op_local_subhistograms <- forM (histType histop) $ \t ->
        sAllocArray "subhistogram" (elemType t) (arrayShape t) DefaultSpace

      forM_ (zip3 pes' op_local_subhistograms (histNeutral histop)) $ \(pe, hist, ne) ->
        -- First thread initializes histogram with dest vals.  Others
        -- initialize with the neutral element.
        sIf
          (tid' .==. 0)
          (copyDWIMFix hist [] (Var $ patElemName pe) [])
          ( sLoopNest (histShape histop) $ \shape_is ->
              sLoopNest (histOpShape histop) $ \vec_is ->
                copyDWIMFix hist (shape_is <> vec_is) ne []
          )

      pure op_local_subhistograms

    inISPC $ generateChunkLoop "SegRed" Vectorized $ \i -> do
      zipWithM_ dPrimV_ is $ unflattenIndex ns_64 i
      compileStms mempty (kernelBodyStms kbody) $ do
        let (red_res, map_res) =
              splitFromEnd (length map_pes) $
                map kernelResultSubExp $
                  kernelBodyResult kbody

        sComment "save map-out results" $
          forM_ (zip map_pes map_res) $ \(pe, res) ->
            copyDWIMFix (patElemName pe) (map Imp.le64 is) res []

        forM_ (zip3 histops local_subhistograms (splitHistResults histops red_res)) $
          \( histop@(HistOp dest_shape _ _ _ shape _),
             histop_subhistograms,
             (bucket, vs')
             ) -> do
              histop' <- renameHistop histop

              let bucket' = map pe64 bucket
                  dest_shape' = map pe64 $ shapeDims dest_shape
                  acc_params' = (lambdaParams . histOp) histop'
                  vs_params' = takeLast (length vs') $ lambdaParams $ histOp histop'

              generateUniformizeLoop $ \j ->
                sComment "perform updates" $ do
                  -- Create a new set of uniform buckets, i.e. extract
                  -- each bucket from a SIMD vector lane.
                  extract_buckets <-
                    mapM (dPrim "extract_bucket" . (primExpType . untyped)) bucket'
                  forM_ (zip extract_buckets bucket') $ \(x, y) ->
                    emit $ Imp.Op $ Imp.ExtractLane (tvVar x) (untyped y) (untyped j)
                  let bucket'' = map tvExp extract_buckets
                      bucket_in_bounds =
                        inBounds (Slice (map DimFix bucket'')) dest_shape'
                  sWhen bucket_in_bounds $ do
                    genHistOpParams histop'
                    sLoopNest shape $ \is' -> do
                      -- Read the values vs and apply the lambda,
                      -- writing the result back to is.
                      forM_ (zip vs_params' vs') $ \(p, res) ->
                        ifPrimType (paramType p) $ \pt -> do
                          -- Hack to copy a varying load into a uniform
                          -- result variable.
                          tmp <- dPrim "tmp" pt
                          copyDWIMFix (tvVar tmp) [] res is'
                          extractVectorLane j $
                            pure $
                              Imp.SetScalar (paramName p) (Imp.LeafExp (tvVar tmp) pt)
                      updateHisto histop' histop_subhistograms (bucket'' ++ is') j acc_params'

    -- Copy the task-local subhistograms to the global subhistograms,
    -- where they will be combined.
    forM_ (zip (concat global_subhistograms) (concat local_subhistograms)) $
      \(global, local) -> copyDWIMFix global [tid'] (Var local) []

  free_params <- freeParams body
  emit $ Imp.Op $ Imp.ParLoop "seghist_stage_1" body free_params

  -- Perform a segmented reduction over the subhistograms.
  forM_ (zip3 per_red_pes global_subhistograms histops) $ \(red_pes, hists, op) -> do
    bucket_ids <-
      replicateM (shapeRank (histShape op)) (newVName "bucket_id")
    subhistogram_id <- newVName "subhistogram_id"

    let segred_space =
          SegSpace (segFlat space) $
            segment_dims
              ++ zip bucket_ids (shapeDims (histShape op))
              ++ [(subhistogram_id, tvSize num_histos)]

        segred_op = SegBinOp Noncommutative (histOp op) (histNeutral op) (histOpShape op)

    red_code <- collect $ do
      nsubtasks <- dPrim "nsubtasks" int32
      sOp $ Imp.GetNumTasks $ tvVar nsubtasks
      emit <=< compileSegRed' (Pat red_pes) segred_space [segred_op] nsubtasks $ \red_cont ->
        red_cont $
          segBinOpChunks [segred_op] $
            flip map hists $ \subhisto ->
              ( Var subhisto,
                map Imp.le64 $
                  map fst segment_dims ++ [subhistogram_id] ++ bucket_ids
              )

    let ns_red = map (pe64 . snd) $ unSegSpace segred_space
        -- The segmented reduction is sequential over the innermost dimension.
        iterations = product $ init ns_red
        scheduler_info = Imp.SchedulerInfo (untyped iterations) Imp.Static
        red_task = Imp.ParallelTask red_code
    free_params_red <- freeParams red_code
    emit $ Imp.Op $ Imp.SegOp "seghist_red" free_params_red red_task Nothing mempty scheduler_info
  where
    segment_dims = init $ unSegSpace space
    ifPrimType (Prim pt) f = f pt
    ifPrimType _ _ = pure ()
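
-- A worked example of the heuristic in nonsegmentedHist: with 16 subtasks
-- and a histogram of 1000 buckets, the subhistogram approach is chosen only
-- when the input has at least 16 * 1000 = 16000 elements; otherwise the
-- cost of initialising and reducing the 16 copies would dominate, and
-- atomicHistogram is used instead.  A minimal Haskell model of the stage-2
-- combine (illustrative only, not the generated code) is a bucket-wise fold
-- of the operator across the per-task copies:
--
--   combineSubhistos :: (a -> a -> a) -> a -> [[a]] -> [a]
--   combineSubhistos op ne = foldr (zipWith op) (repeat ne)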

-- Note: This isn't currently used anywhere.
-- This implementation of a segmented Hist only parallelizes over the
-- segments, while each segment is updated sequentially.
segmentedHist ::
  Pat LetDecMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  MulticoreGen Imp.MCCode
segmentedHist pat space histops kbody = do
  emit $ Imp.DebugPrint "Segmented segHist" Nothing
  collect $ do
    body <- compileSegHistBody pat space histops kbody
    free_params <- freeParams body
    emit $ Imp.Op $ Imp.ParLoop "segmented_hist" body free_params
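
-- For reference, the kind of source program that gives rise to a segmented
-- histogram is a per-segment 'reduce_by_index' (an illustrative Futhark
-- program; whether it reaches this code path depends on how the compiler
-- decomposes the surrounding map nest):
--
--   def hists (k: i64) (is: [][]i64) (vs: [][]f32) : [][]f32 =
--     map2 (\i v -> reduce_by_index (replicate k 0f32) (+) 0f32 i v) is vs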

compileSegHistBody ::
  Pat LetDecMem ->
  SegSpace ->
  [HistOp MCMem] ->
  KernelBody MCMem ->
  MulticoreGen Imp.MCCode
compileSegHistBody pat space histops kbody = collect $ do
  let (is, ns) = unzip $ unSegSpace space
      ns_64 = map pe64 ns

  let num_red_res = length histops + sum (map (length . histNeutral) histops)
      map_pes = drop num_red_res $ patElems pat
      per_red_pes = segHistOpChunks histops $ patElems pat

  dPrim_ (segFlat space) int64
  sOp $ Imp.GetTaskId (segFlat space)

  generateChunkLoop "SegHist" Scalar $ \idx -> do
    let inner_bound = last ns_64
    sFor "i" inner_bound $ \i -> do
      zipWithM_ dPrimV_ (init is) $ unflattenIndex (init ns_64) idx
      dPrimV_ (last is) i
      compileStms mempty (kernelBodyStms kbody) $ do
        let (red_res, map_res) =
              splitFromEnd (length map_pes) $
                map kernelResultSubExp $
                  kernelBodyResult kbody

        forM_ (zip3 per_red_pes histops (splitHistResults histops red_res)) $
          \(red_pes, HistOp dest_shape _ _ _ shape lam, (bucket, vs')) -> do
            let (is_params, vs_params) = splitAt (length vs') $ lambdaParams lam
                bucket' = map pe64 bucket
                dest_shape' = map pe64 $ shapeDims dest_shape
                bucket_in_bounds = inBounds (Slice (map DimFix bucket')) dest_shape'

            sComment "save map-out results" $
              forM_ (zip map_pes map_res) $ \(pe, res) ->
                copyDWIMFix (patElemName pe) (map Imp.le64 is) res []

            sComment "perform updates" $
              sWhen bucket_in_bounds $ do
                dLParams $ lambdaParams lam
                sLoopNest shape $ \vec_is -> do
                  -- Index
                  forM_ (zip red_pes is_params) $ \(pe, p) ->
                    copyDWIMFix
                      (paramName p)
                      []
                      (Var $ patElemName pe)
                      (map Imp.le64 (init is) ++ bucket' ++ vec_is)
                  -- Value at index
                  forM_ (zip vs_params vs') $ \(p, v) ->
                    copyDWIMFix (paramName p) [] v vec_is
                  compileStms mempty (bodyStms $ lambdaBody lam) $
                    forM_ (zip red_pes $ map resSubExp $ bodyResult $ lambdaBody lam) $
                      \(pe, se) ->
                        copyDWIMFix
                          (patElemName pe)
                          (map Imp.le64 (init is) ++ bucket' ++ vec_is)
                          se
                          []
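
-- A worked example of the index arithmetic in compileSegHistBody: for a
-- segment space of shape [3, 4], the chunk loop runs over the 3 segments
-- and the inner loop over the 4 elements per segment.  At idx = 2 and
-- i = 3, unflattenIndex [3] 2 = [2] binds the segment index, the innermost
-- index is 3, and together they name the flat element 2 * 4 + 3 = 11.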