{-# LANGUAGE TypeFamilies #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}

-- | Facilities for converting a 'GPU' program to 'GPUMem'.
module Futhark.Pass.ExplicitAllocations.GPU
  ( explicitAllocations,
    explicitAllocationsInStms,
  )
where

import Control.Monad
import Data.Set qualified as S
import Futhark.IR.GPU
import Futhark.IR.GPUMem
import Futhark.IR.Mem.IxFun qualified as IxFun
import Futhark.Pass.ExplicitAllocations
import Futhark.Pass.ExplicitAllocations.SegOp

-- The results of 'GetSize' and 'GetSizeMax' are constant for the
-- duration of the program, so allocations sized by them can be
-- treated as constant-sized.
instance SizeSubst (HostOp rep op) where
  opIsConst (SizeOp GetSize {}) = True
  opIsConst (SizeOp GetSizeMax {}) = True
  opIsConst _ = False

-- Run an allocation action with the default space appropriate for the
-- given 'SegLevel': group-level allocations go in local (shared)
-- memory; thread-level allocations go in device memory.
allocAtLevel :: SegLevel -> AllocM GPU GPUMem a -> AllocM GPU GPUMem a
allocAtLevel lvl = local $ \env ->
  env
    { allocSpace = space,
      aggressiveReuse = True,
      allocInOp = handleHostOp (Just lvl)
    }
  where
    space = case lvl of
      SegGroup {} -> Space "local"
      SegThread {} -> Space "device"
      SegThreadInGroup {} -> Space "device"

-- Allocate memory inside a 'SegOp'.  The total number of threads is
-- bound first, as it is needed when allocating memory for the operator
-- lambdas; it is taken from the grid when one is known, and otherwise
-- computed as the product of the dimensions of the segmented space.
handleSegOp ::
  Maybe SegLevel ->
  SegOp SegLevel GPU ->
  AllocM GPU GPUMem (SegOp SegLevel GPUMem)
handleSegOp outer_lvl op = do
  num_threads <-
    letSubExp "num_threads" =<< case maybe_grid of
      Just grid ->
        pure . BasicOp $
          BinOp
            (Mul Int64 OverflowUndef)
            (unCount (gridNumGroups grid))
            (unCount (gridGroupSize grid))
      Nothing ->
        foldBinOp
          (Mul Int64 OverflowUndef)
          (intConst Int64 1)
          (segSpaceDims $ segSpace op)
  allocAtLevel (segLevel op) $ mapSegOpM (mapper num_threads) op
  where
    maybe_grid =
      case (outer_lvl, segLevel op) of
        (Just (SegThread _ (Just grid)), _) -> Just grid
        (Just (SegGroup _ (Just grid)), _) -> Just grid
        (_, SegThread _ (Just grid)) -> Just grid
        (_, SegGroup _ (Just grid)) -> Just grid
        _ -> Nothing
    scope = scopeOfSegSpace $ segSpace op
    mapper num_threads =
      identitySegOpMapper
        { mapOnSegOpBody =
            localScope scope . local f . allocInKernelBody,
          mapOnSegOpLambda =
            local inThread
              . allocInBinOpLambda num_threads (segSpace op)
        }
    f = case segLevel op of
      SegThread {} -> inThread
      SegThreadInGroup {} -> inThread
      SegGroup {} -> inGroup
    inThread env = env {envExpHints = inThreadExpHints}
    inGroup env = env {envExpHints = inGroupExpHints}

-- Allocate memory in a host-level operation.  SOACs must have been
-- turned into 'SegOp's by earlier passes, so encountering one here is
-- a compiler bug.
handleHostOp ::
  Maybe SegLevel ->
  HostOp SOAC GPU ->
  AllocM GPU GPUMem (MemOp (HostOp NoOp) GPUMem)
handleHostOp _ (SizeOp op) =
  pure $ Inner $ SizeOp op
handleHostOp _ (OtherOp op) =
  error $ "Cannot allocate memory in SOAC: " ++ prettyString op
handleHostOp outer_lvl (SegOp op) =
  Inner . SegOp <$> handleSegOp outer_lvl op
handleHostOp _ (GPUBody ts (Body _ stms res)) =
  fmap (Inner . GPUBody ts) . buildBody_ . allocInStms stms $ pure res

-- Memory layout hints for host-level expressions: for 'Manifest' and
-- for the map-like results of thread-level 'SegOp's, suggest index
-- functions that give coalesced access on the device.
kernelExpHints :: Exp GPUMem -> AllocM GPU GPUMem [ExpHint]
kernelExpHints (BasicOp (Manifest perm v)) = do
  dims <- arrayDims <$> lookupType v
  let perm_inv = rearrangeInverse perm
      dims' = rearrangeShape perm dims
      ixfun = IxFun.permute (IxFun.iota $ map pe64 dims') perm_inv
  pure [Hint ixfun $ Space "device"]
kernelExpHints (Op (Inner (SegOp (SegMap lvl@(SegThread _ _) space ts body)))) =
  zipWithM (mapResultHint lvl space) ts $ kernelBodyResult body
kernelExpHints (Op (Inner (SegOp (SegRed lvl@(SegThread _ _) space reds ts body)))) =
  (map (const NoHint) red_res <>)
    <$> zipWithM (mapResultHint lvl space) (drop num_reds ts) map_res
  where
    num_reds = segBinOpResults reds
    (red_res, map_res) = splitAt num_reds $ kernelBodyResult body
kernelExpHints e = defaultExpHints e
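-- A worked example of the coalescing layout that 'mapResultHint' and
-- 'innermost' below construct (a sketch with hypothetical sizes @nt@
-- and @k@): for @space_dims = [nt]@ and @t_dims = [k]@ we get
--
-- > perm      = [1,0]
-- > dims_perm = [k,nt]
-- > innermost [nt] [k] = IxFun.permute (IxFun.iota [k,nt]) [1,0]
--
-- The result has logical shape @[nt][k]@, but element @(t,i)@ lives at
-- flat offset @i*nt + t@, so consecutive threads @t@ touch consecutive
-- addresses and accesses are coalesced.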
-- Memory hint for a single map-like result of a thread-level 'SegOp':
-- returned arrays that are large enough to matter are given the
-- coalesced layout computed by 'innermost'.
mapResultHint ::
  SegLevel ->
  SegSpace ->
  Type ->
  KernelResult ->
  AllocM GPU GPUMem ExpHint
mapResultHint _lvl space = hint
  where
    -- Heuristic: do not rearrange for returned arrays that are
    -- sufficiently small.
    coalesceReturnOfShape _ [] = False
    coalesceReturnOfShape bs [Constant (IntValue (Int64Value d))] = bs * d > 4
    coalesceReturnOfShape _ _ = True

    hint t Returns {}
      | coalesceReturnOfShape (primByteSize (elemType t)) $ arrayDims t = do
          let space_dims = segSpaceDims space
          pure $ Hint (innermost space_dims (arrayDims t)) $ Space "device"
    hint _ _ = pure NoHint

-- An index function with logical shape @space_dims ++ t_dims@, but
-- where the space dimensions are innermost in memory, so that
-- consecutive threads access consecutive elements (see the example
-- above).
innermost :: [SubExp] -> [SubExp] -> IxFun
innermost space_dims t_dims =
  let r = length t_dims
      dims = space_dims ++ t_dims
      perm =
        [length space_dims .. length space_dims + r - 1]
          ++ [0 .. length space_dims - 1]
      perm_inv = rearrangeInverse perm
      dims_perm = rearrangeShape perm dims
      ixfun_base = IxFun.iota $ map pe64 dims_perm
      ixfun_rearranged = IxFun.permute ixfun_base perm_inv
   in ixfun_rearranged

-- Is this size sufficiently static: a constant, or one of the given
-- program-level constants?
semiStatic :: S.Set VName -> SubExp -> Bool
semiStatic _ Constant {} = True
semiStatic consts (Var v) = v `S.member` consts

-- Hints for expressions inside a group-level 'SegOp': results that are
-- private to a single thread and of sufficiently static size are
-- placed in a 'ScalarSpace', i.e. in registers.
inGroupExpHints :: Exp GPUMem -> AllocM GPU GPUMem [ExpHint]
inGroupExpHints (Op (Inner (SegOp (SegMap _ space ts body))))
  | any private $ kernelBodyResult body = do
      consts <- asks envConsts
      pure $ do
        (t, r) <- zip ts $ kernelBodyResult body
        pure $
          if private r && all (semiStatic consts) (arrayDims t)
            then
              let seg_dims = map pe64 $ segSpaceDims space
                  dims = seg_dims ++ map pe64 (arrayDims t)
                  nilSlice d = DimSlice 0 d 0
               in Hint
                    ( IxFun.slice (IxFun.iota dims) $
                        fullSliceNum dims $
                          map nilSlice seg_dims
                    )
                    $ ScalarSpace (arrayDims t)
                    $ elemType t
            else NoHint
  where
    private (Returns ResultPrivate _ _) = True
    private _ = False
inGroupExpHints e = defaultExpHints e

-- Hints for expressions inside a thread: arrays of statically known
-- shape can live in a 'ScalarSpace'.
inThreadExpHints :: Exp GPUMem -> AllocM GPU GPUMem [ExpHint]
inThreadExpHints e = do
  consts <- asks envConsts
  mapM (maybePrivate consts) =<< expExtType e
  where
    maybePrivate consts t
      | Just (Array pt shape _) <- hasStaticShape t,
        all (semiStatic consts) $ shapeDims shape = do
          let ixfun = IxFun.iota $ map pe64 $ shapeDims shape
          pure $ Hint ixfun $ ScalarSpace (shapeDims shape) pt
      | otherwise = pure NoHint

-- | The pass from 'GPU' to 'GPUMem'.
explicitAllocations :: Pass GPU GPUMem
explicitAllocations =
  explicitAllocationsGeneric (Space "device") (handleHostOp Nothing) kernelExpHints

-- | Convert some 'GPU' stms to 'GPUMem'.
explicitAllocationsInStms ::
  (MonadFreshNames m, HasScope GPUMem m) =>
  Stms GPU ->
  m (Stms GPUMem)
explicitAllocationsInStms =
  explicitAllocationsInStmsGeneric (Space "device") (handleHostOp Nothing) kernelExpHints
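-- A minimal sketch of running the whole pass (hypothetical driver
-- code; 'passFunction' is the record field of 'Pass' that performs
-- the transformation inside 'PassM'):
--
-- > import Futhark.Pass (PassM, passFunction)
-- >
-- > toGPUMem :: Prog GPU -> PassM (Prog GPUMem)
-- > toGPUMem = passFunction explicitAllocations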