-----------------------------------------------------------------------------
--
-- Cmm optimisation
--
-- (c) The University of Glasgow 2006
--
-----------------------------------------------------------------------------
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE PatternSynonyms #-}
module GHC.Cmm.Opt (
        constantFoldNode,
        constantFoldExpr,
        cmmMachOpFold,
        cmmMachOpFoldM,
        Opt, runOpt
 ) where

import GHC.Prelude

import GHC.Cmm.Dataflow.Block
import GHC.Cmm.Utils
import GHC.Cmm
import GHC.Cmm.Config
import GHC.Types.Unique.DSM

import GHC.Utils.Misc
import GHC.Utils.Panic
import GHC.Platform

import Data.Maybe
import Data.Word
import GHC.Exts (oneShot)
import Control.Monad

constantFoldNode :: CmmNode e x -> Opt (CmmNode e x)
constantFoldNode (CmmUnsafeForeignCall (PrimTarget op) res args)
  = traverse constantFoldExprOpt args >>= cmmCallishMachOpFold op res
constantFoldNode node
  = mapExpOpt constantFoldExprOpt node

constantFoldExprOpt :: CmmExpr -> Opt CmmExpr
constantFoldExprOpt e = wrapRecExpOpt f e
  where
    f (CmmMachOp op args)
      = do
        cfg <- getConfig
        case cmmMachOpFold (cmmPlatform cfg) op args of
          CmmMachOp op' args' -> fromMaybe (CmmMachOp op' args') <$> cmmMachOpFoldOptM cfg op' args'
          e -> pure e
    f (CmmRegOff r 0) = pure (CmmReg r)
    f e = pure e

constantFoldExpr :: Platform -> CmmExpr -> CmmExpr
constantFoldExpr platform = wrapRecExp f
  where f (CmmMachOp op args) = cmmMachOpFold platform op args
        f (CmmRegOff r 0) = CmmReg r
        f e = e

-- -----------------------------------------------------------------------------
-- MachOp constant folder

-- Now, try to constant-fold the MachOps.  The arguments have already
-- been optimized and folded.

cmmMachOpFold
    :: Platform
    -> MachOp       -- The operation from an CmmMachOp
    -> [CmmExpr]    -- The optimized arguments
    -> CmmExpr

cmmMachOpFold platform op args = fromMaybe (CmmMachOp op args) (cmmMachOpFoldM platform op args)

-- Returns Nothing if no changes, useful for Hoopl, also reduces
-- allocation!
cmmMachOpFoldM
    :: Platform
    -> MachOp
    -> [CmmExpr]
    -> Maybe CmmExpr
cmmMachOpFoldM _ (MO_V_Broadcast lg _w) exprs =
  case exprs of
    [CmmLit l] -> Just $! CmmLit (CmmVec $ replicate lg l)
    _ -> Nothing
cmmMachOpFoldM _ (MO_VF_Broadcast lg _w) exprs =
  case exprs of
    [CmmLit l] -> Just $! CmmLit (CmmVec $ replicate lg l)
    _ -> Nothing
cmmMachOpFoldM _ op [CmmLit (CmmInt x rep)]
  = Just $! case op of
      MO_S_Neg _ -> CmmLit (CmmInt (-x) rep)
      MO_Not _   -> CmmLit (CmmInt (complement x) rep)

        -- these are interesting: we must first narrow to the
        -- "from" type, in order to truncate to the correct size.
        -- The final narrow/widen to the destination type
        -- is implicit in the CmmLit.
      MO_SF_Round _frm to -> CmmLit (CmmFloat (fromInteger x) to)
      MO_SS_Conv  from to -> CmmLit (CmmInt (narrowS from x) to)
      MO_UU_Conv  from to -> CmmLit (CmmInt (narrowU from x) to)
      MO_XX_Conv  from to -> CmmLit (CmmInt (narrowS from x) to)
      _ -> panic $ "cmmMachOpFoldM: unknown unary op: " ++ show op

-- Eliminate shifts that are wider than the shiftee
cmmMachOpFoldM _ op [_shiftee, CmmLit (CmmInt shift _)]
  | Just width <- isShift op
  , shift >= fromIntegral (widthInBits width)
  = Just $! CmmLit (CmmInt 0 width)
  where
    isShift (MO_Shl   w) = Just w
    isShift (MO_U_Shr w) = Just w
    isShift (MO_S_Shr w) = Just w
    isShift _            = Nothing

-- Eliminate conversion NOPs
cmmMachOpFoldM _ (MO_SS_Conv rep1 rep2) [x] | rep1 == rep2 = Just x
cmmMachOpFoldM _ (MO_UU_Conv rep1 rep2) [x] | rep1 == rep2 = Just x
cmmMachOpFoldM _ (MO_XX_Conv rep1 rep2) [x] | rep1 == rep2 = Just x

-- Eliminate nested conversions where possible
cmmMachOpFoldM platform conv_outer [CmmMachOp conv_inner [x]]
  | Just (rep1,rep2,signed1) <- isIntConversion conv_inner,
    Just (_,   rep3,signed2) <- isIntConversion conv_outer
  = case () of
        -- widen then narrow to the same size is a nop
      _ | rep1 < rep2 && rep1 == rep3 -> Just x
        -- Widen then narrow to different size: collapse to single conversion
        -- but remember to use the signedness from the widening, just in case
        -- the final conversion is a widen.
        | rep1 < rep2 && rep2 > rep3 ->
            Just $! cmmMachOpFold platform (intconv signed1 rep1 rep3) [x]
        -- Nested widenings: collapse if the signedness is the same
        | rep1 < rep2 && rep2 < rep3 && signed1 == signed2 ->
            Just $! cmmMachOpFold platform (intconv signed1 rep1 rep3) [x]
        -- Nested narrowings: collapse
        | rep1 > rep2 && rep2 > rep3 ->
            Just $! cmmMachOpFold platform (MO_UU_Conv rep1 rep3) [x]
        | otherwise ->
            Nothing
  where
        isIntConversion (MO_UU_Conv rep1 rep2)
          = Just (rep1,rep2,False)
        isIntConversion (MO_SS_Conv rep1 rep2)
          = Just (rep1,rep2,True)
        isIntConversion _ = Nothing

        intconv True  = MO_SS_Conv
        intconv False = MO_UU_Conv

cmmMachOpFoldM platform mop [CmmLit (CmmInt x xrep), CmmLit (CmmInt y _)]
  = case mop of
        -- for comparisons: don't forget to narrow the arguments before
        -- comparing, since they might be out of range.
        MO_Eq _   -> Just $! CmmLit (CmmInt (if x_u == y_u then 1 else 0) (wordWidth platform))
        MO_Ne _   -> Just $! CmmLit (CmmInt (if x_u /= y_u then 1 else 0) (wordWidth platform))

        MO_U_Gt _ -> Just $! CmmLit (CmmInt (if x_u >  y_u then 1 else 0) (wordWidth platform))
        MO_U_Ge _ -> Just $! CmmLit (CmmInt (if x_u >= y_u then 1 else 0) (wordWidth platform))
        MO_U_Lt _ -> Just $! CmmLit (CmmInt (if x_u <  y_u then 1 else 0) (wordWidth platform))
        MO_U_Le _ -> Just $! CmmLit (CmmInt (if x_u <= y_u then 1 else 0) (wordWidth platform))

        MO_S_Gt _ -> Just $! CmmLit (CmmInt (if x_s >  y_s then 1 else 0) (wordWidth platform))
        MO_S_Ge _ -> Just $! CmmLit (CmmInt (if x_s >= y_s then 1 else 0) (wordWidth platform))
        MO_S_Lt _ -> Just $! CmmLit (CmmInt (if x_s <  y_s then 1 else 0) (wordWidth platform))
        MO_S_Le _ -> Just $! CmmLit (CmmInt (if x_s <= y_s then 1 else 0) (wordWidth platform))

        MO_Add r -> Just $! CmmLit (CmmInt (x + y) r)
        MO_Sub r -> Just $! CmmLit (CmmInt (x - y) r)
        MO_Mul r -> Just $! CmmLit (CmmInt (x * y) r)
        MO_U_Quot r | y /= 0 -> Just $! CmmLit (CmmInt (x_u `quot` y_u) r)
        MO_U_Rem  r | y /= 0 -> Just $! CmmLit (CmmInt (x_u `rem`  y_u) r)
        MO_S_Quot r | y /= 0 -> Just $! CmmLit (CmmInt (x_s `quot` y_s) r)
        MO_S_Rem  r | y /= 0 -> Just $! CmmLit (CmmInt (x_s `rem`  y_s) r)

        MO_And   r -> Just $! CmmLit (CmmInt (x .&. y) r)
        MO_Or    r -> Just $! CmmLit (CmmInt (x .|. y) r)
        MO_Xor   r -> Just $! CmmLit (CmmInt (x `xor` y) r)

        MO_Shl   r -> Just $! CmmLit (CmmInt (x   `shiftL` fromIntegral y) r)
        MO_U_Shr r -> Just $! CmmLit (CmmInt (x_u `shiftR` fromIntegral y) r)
        MO_S_Shr r -> Just $! CmmLit (CmmInt (x_s `shiftR` fromIntegral y) r)

        _          -> Nothing

   where
        x_u = narrowU xrep x
        y_u = narrowU xrep y
        x_s = narrowS xrep x
        y_s = narrowS xrep y


-- When possible, shift the constants to the right-hand side, so that we
-- can match for strength reductions.  Note that the code generator will
-- also assume that constants have been shifted to the right when
-- possible.

cmmMachOpFoldM platform op [x@(CmmLit _), y]
   | not (isLit y) && isCommutableMachOp op
   = Just $! (cmmMachOpFold platform op [y, x])

-- Turn (a+b)+c into a+(b+c) where possible.  Because literals are
-- moved to the right, it is more likely that we will find
-- opportunities for constant folding when the expression is
-- right-associated.
--
-- ToDo: this appears to introduce a quadratic behaviour due to the
-- nested cmmMachOpFold.  Can we fix this?
--
-- Why do we check isLit arg1?  If arg1 is a lit, it means that arg2
-- is also a lit (otherwise arg1 would be on the right).  If we
-- put arg1 on the left of the rearranged expression, we'll get into a
-- loop:  (x1+x2)+x3 => x1+(x2+x3)  => (x2+x3)+x1 => x2+(x3+x1) ...
--
-- Also don't do it if arg1 is PicBaseReg, so that we don't separate the
-- PicBaseReg from the corresponding label (or label difference).
--
cmmMachOpFoldM platform mop1 [CmmMachOp mop2 [arg1,arg2], arg3]
   | mop2 `associates_with` mop1
     && not (isLit arg1) && not (isPicReg arg1)
   = Just $! (cmmMachOpFold platform mop2 [arg1, cmmMachOpFold platform mop1 [arg2,arg3]])
   where
     MO_Add{} `associates_with` MO_Sub{} = True
     mop1 `associates_with` mop2 =
        mop1 == mop2 && isAssociativeMachOp mop1

-- special case: (a - b) + c  ==>  a + (c - b)
cmmMachOpFoldM platform mop1@(MO_Add{}) [CmmMachOp mop2@(MO_Sub{}) [arg1,arg2], arg3]
   | not (isLit arg1) && not (isPicReg arg1)
   = Just $! (cmmMachOpFold platform mop1 [arg1, cmmMachOpFold platform mop2 [arg3,arg2]])

-- special case: (PicBaseReg + lit) + N  ==>  PicBaseReg + (lit+N)
--
-- this is better because lit+N is a single link-time constant (e.g. a
-- CmmLabelOff), so the right-hand expression needs only one
-- instruction, whereas the left needs two.  This happens when pointer
-- tagging gives us label+offset, and PIC turns the label into
-- PicBaseReg + label.
--
cmmMachOpFoldM _ MO_Add{} [ CmmMachOp op@MO_Add{} [pic, CmmLit lit]
                          , CmmLit (CmmInt n rep) ]
  | isPicReg pic
  = Just $! CmmMachOp op [pic, CmmLit $ cmmOffsetLit lit off ]
  where off = fromIntegral (narrowS rep n)

-- Make a RegOff if we can. We don't perform this optimization if rep is greater
-- than the host word size because we use an Int to store the offset. See
-- #24893 and #24700. This should be fixed to ensure that optimizations don't
-- depend on the compiler host platform.
cmmMachOpFoldM _ (MO_Add _) [CmmReg reg, CmmLit (CmmInt n rep)]
  | validOffsetRep rep
  = Just $! cmmRegOff reg (fromIntegral (narrowS rep n))
cmmMachOpFoldM _ (MO_Add _) [CmmRegOff reg off, CmmLit (CmmInt n rep)]
  | validOffsetRep rep
  = Just $! cmmRegOff reg (off + fromIntegral (narrowS rep n))
cmmMachOpFoldM _ (MO_Sub _) [CmmReg reg, CmmLit (CmmInt n rep)]
  | validOffsetRep rep
  = Just $! cmmRegOff reg (- fromIntegral (narrowS rep n))
cmmMachOpFoldM _ (MO_Sub _) [CmmRegOff reg off, CmmLit (CmmInt n rep)]
  | validOffsetRep rep
  = Just $! cmmRegOff reg (off - fromIntegral (narrowS rep n))

-- Fold label(+/-)offset into a CmmLit where possible

cmmMachOpFoldM _ (MO_Add _) [CmmLit lit, CmmLit (CmmInt i rep)]
  | validOffsetRep rep
  = Just $! CmmLit (cmmOffsetLit lit (fromIntegral (narrowU rep i)))
cmmMachOpFoldM _ (MO_Add _) [CmmLit (CmmInt i rep), CmmLit lit]
  | validOffsetRep rep
  = Just $! CmmLit (cmmOffsetLit lit (fromIntegral (narrowU rep i)))
cmmMachOpFoldM _ (MO_Sub _) [CmmLit lit, CmmLit (CmmInt i rep)]
  | validOffsetRep rep
  = Just $! CmmLit (cmmOffsetLit lit (fromIntegral (negate (narrowU rep i))))


-- Comparison of literal with widened operand: perform the comparison
-- at the smaller width, as long as the literal is within range.

-- We can't do the reverse trick, when the operand is narrowed:
-- narrowing throws away bits from the operand, there's no way to do
-- the same comparison at the larger size.

cmmMachOpFoldM platform cmp [CmmMachOp conv [x], CmmLit (CmmInt i _)]
  |     -- powerPC NCG has a TODO for I8/I16 comparisons, so don't try
    platformArch platform `elem` [ArchX86, ArchX86_64],
        -- if the operand is widened:
    Just (rep, signed, narrow_fn) <- maybe_conversion conv,
        -- and this is a comparison operation:
    Just narrow_cmp <- maybe_comparison cmp rep signed,
        -- and the literal fits in the smaller size:
    i == narrow_fn rep i
        -- then we can do the comparison at the smaller size
  = Just $! (cmmMachOpFold platform narrow_cmp [x, CmmLit (CmmInt i rep)])
 where
    maybe_conversion (MO_UU_Conv from to)
        | to > from
        = Just (from, False, narrowU)
    maybe_conversion (MO_SS_Conv from to)
        | to > from
        = Just (from, True, narrowS)

        -- don't attempt to apply this optimisation when the source
        -- is a float; see #1916
    maybe_conversion _ = Nothing

        -- careful (#2080): if the original comparison was signed, but
        -- we were doing an unsigned widen, then we must do an
        -- unsigned comparison at the smaller size.
    maybe_comparison (MO_U_Gt _) rep _     = Just (MO_U_Gt rep)
    maybe_comparison (MO_U_Ge _) rep _     = Just (MO_U_Ge rep)
    maybe_comparison (MO_U_Lt _) rep _     = Just (MO_U_Lt rep)
    maybe_comparison (MO_U_Le _) rep _     = Just (MO_U_Le rep)
    maybe_comparison (MO_Eq   _) rep _     = Just (MO_Eq   rep)
    maybe_comparison (MO_S_Gt _) rep True  = Just (MO_S_Gt rep)
    maybe_comparison (MO_S_Ge _) rep True  = Just (MO_S_Ge rep)
    maybe_comparison (MO_S_Lt _) rep True  = Just (MO_S_Lt rep)
    maybe_comparison (MO_S_Le _) rep True  = Just (MO_S_Le rep)
    maybe_comparison (MO_S_Gt _) rep False = Just (MO_U_Gt rep)
    maybe_comparison (MO_S_Ge _) rep False = Just (MO_U_Ge rep)
    maybe_comparison (MO_S_Lt _) rep False = Just (MO_U_Lt rep)
    maybe_comparison (MO_S_Le _) rep False = Just (MO_U_Le rep)
    maybe_comparison _ _ _ = Nothing

-- We can often do something with constants of 0, 1 and (-1) ...
-- See Note [Comparison operators]

cmmMachOpFoldM platform mop [x, y@(CmmLit (CmmInt 0 _))]
  = case mop of
        -- Arithmetic
        MO_Add   _ -> Just x   -- x + 0 = x
        MO_Sub   _ -> Just x   -- x - 0 = x
        MO_Mul   _ -> Just y   -- x * 0 = 0

        -- Logical operations
        MO_And   _ -> Just y   -- x &     0 = 0
        MO_Or    _ -> Just x   -- x |     0 = x
        MO_Xor   _ -> Just x   -- x `xor` 0 = x

        -- Shifts
        MO_Shl   _ -> Just x   -- x << 0 = x
        MO_S_Shr _ -> Just x   -- ditto shift-right
        MO_U_Shr _ -> Just x

        -- Comparisons; these ones are trickier
        -- See Note [Comparison operators]
        MO_Ne    _ | isComparisonExpr x -> Just x                -- (x > y) != 0  =  x > y
        MO_Eq    _ | Just x' <- maybeInvertCmmExpr x -> Just x'  -- (x > y) == 0  =  x <= y
        MO_U_Gt  _ | isComparisonExpr x -> Just x                -- (x > y) > 0   =  x > y
        MO_S_Gt  _ | isComparisonExpr x -> Just x                -- ditto
        MO_U_Lt  _ | isComparisonExpr x -> Just zero             -- (x > y) < 0  =  0
        MO_S_Lt  _ | isComparisonExpr x -> Just zero
        MO_U_Ge  _ | isComparisonExpr x -> Just one              -- (x > y) >= 0  =  1
        MO_S_Ge  _ | isComparisonExpr x -> Just one

        MO_U_Le  _ | Just x' <- maybeInvertCmmExpr x -> Just x'  -- (x > y) <= 0  =  x <= y
        MO_S_Le  _ | Just x' <- maybeInvertCmmExpr x -> Just x'
        _ -> Nothing
  where
    zero = CmmLit (CmmInt 0 (wordWidth platform))
    one  = CmmLit (CmmInt 1 (wordWidth platform))

cmmMachOpFoldM platform mop [x, (CmmLit (CmmInt 1 rep))]
  = case mop of
        -- Arithmetic: x*1 = x, etc
        MO_Mul    _ -> Just x
        MO_S_Quot _ -> Just x
        MO_U_Quot _ -> Just x
        MO_S_Rem  _ -> Just $! CmmLit (CmmInt 0 rep)
        MO_U_Rem  _ -> Just $! CmmLit (CmmInt 0 rep)

        -- Comparisons; trickier
        -- See Note [Comparison operators]
        MO_Ne    _ | Just x' <- maybeInvertCmmExpr x -> Just x'  -- (x>y) != 1  =  x<=y
        MO_Eq    _ | isComparisonExpr x -> Just x                -- (x>y) == 1  =  x>y
        MO_U_Lt  _ | Just x' <- maybeInvertCmmExpr x -> Just x'  -- (x>y) < 1   =  x<=y
        MO_S_Lt  _ | Just x' <- maybeInvertCmmExpr x -> Just x'  -- ditto
        MO_U_Gt  _ | isComparisonExpr x -> Just zero             -- (x>y) > 1   = 0
        MO_S_Gt  _ | isComparisonExpr x -> Just zero
        MO_U_Le  _ | isComparisonExpr x -> Just one              -- (x>y) <= 1  = 1
        MO_S_Le  _ | isComparisonExpr x -> Just one
        MO_U_Ge  _ | isComparisonExpr x -> Just x                -- (x>y) >= 1  = x>y
        MO_S_Ge  _ | isComparisonExpr x -> Just x
        _ -> Nothing
  where
    zero = CmmLit (CmmInt 0 (wordWidth platform))
    one  = CmmLit (CmmInt 1 (wordWidth platform))

-- Now look for multiplication/division by powers of 2 (integers).

cmmMachOpFoldM platform mop [x, (CmmLit (CmmInt n _))]
  = case mop of
        MO_Mul rep
           | Just p <- exactLog2 n ->
                 Just $! (cmmMachOpFold platform (MO_Shl rep) [x, CmmLit (CmmInt p $ wordWidth platform)])
        -- The optimization for division by power of 2 is technically duplicated, but since at least one other part of ghc uses
        -- the pure `constantFoldExpr` this remains
        MO_U_Quot rep
           | Just p <- exactLog2 n ->
                 Just $! (cmmMachOpFold platform (MO_U_Shr rep) [x, CmmLit (CmmInt p $ wordWidth platform)])
        MO_U_Rem rep
           | Just _ <- exactLog2 n ->
                 Just $! (cmmMachOpFold platform (MO_And rep) [x, CmmLit (CmmInt (n - 1) rep)])
        MO_S_Quot rep
           | Just p <- exactLog2 n,
             CmmReg _ <- x ->
                Just $! (cmmMachOpFold platform (MO_S_Shr rep)
                  [signedQuotRemHelper platform n x rep p, CmmLit (CmmInt p $ wordWidth platform)])
        MO_S_Rem rep
           | Just p <- exactLog2 n,
             CmmReg _ <- x ->
                -- We replace (x `rem` 2^p) by (x - (x `quot` 2^p) * 2^p).
                -- Moreover, we fuse MO_S_Shr (last operation of MO_S_Quot)
                -- and MO_S_Shl (multiplication by 2^p) into a single MO_And operation.
                Just $! (cmmMachOpFold platform (MO_Sub rep)
                    [x, cmmMachOpFold platform (MO_And rep)
                      [signedQuotRemHelper platform n x rep p, CmmLit (CmmInt (- n) rep)]])
        _ -> Nothing

-- ToDo (#7116): optimise floating-point multiplication, e.g. x*2.0 -> x+x
-- Unfortunately this needs a unique supply because x might not be a
-- register.  See #2253 (program 6) for an example.


-- Anything else is just too hard.

cmmMachOpFoldM _ _ _ = Nothing

-- | Check that a literal width is compatible with the host word size used to
-- store offsets. This should be fixed properly (using larger types to store
-- literal offsets). See #24893
validOffsetRep :: Width -> Bool
validOffsetRep rep = widthInBits rep <= finiteBitSize (undefined :: Int)


{- Note [Comparison operators]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
If we have
   CmmCondBranch ((x>#y) == 1) t f
we really want to convert to
   CmmCondBranch (x>#y) t f

That's what the constant-folding operations on comparison operators do above.
-}

-- -----------------------------------------------------------------------------
-- Utils

isPicReg :: CmmExpr -> Bool
isPicReg (CmmReg (CmmGlobal (GlobalRegUse PicBaseReg _))) = True
isPicReg _ = False

canOptimizeDivision :: CmmConfig -> Width -> Bool
canOptimizeDivision cfg rep = cmmOptConstDivision cfg &&
  -- we can either widen the arguments to simulate mul2 or use mul2 directly for the platform word size
  (rep < wordWidth platform || (rep == wordWidth platform && cmmAllowMul2 cfg))
  where platform = cmmPlatform cfg

-- -----------------------------------------------------------------------------
-- Folding callish machops

cmmCallishMachOpFold :: CallishMachOp -> [CmmFormal] -> [CmmActual] -> Opt (CmmNode O O)
cmmCallishMachOpFold op res args =
  fromMaybe (CmmUnsafeForeignCall (PrimTarget op) res args) <$> (getConfig >>= \cfg -> cmmCallishMachOpFoldM cfg op res args)

cmmCallishMachOpFoldM :: CmmConfig -> CallishMachOp -> [CmmFormal] -> [CmmActual] -> Opt (Maybe (CmmNode O O))

-- If possible move the literals to the right, the following cases assume that to be the case
cmmCallishMachOpFoldM cfg op res [x@(CmmLit _),y]
  | isCommutableCallishMachOp op && not (isLit y) = cmmCallishMachOpFoldM cfg op res [y,x]

-- Both arguments are literals, replace with the result
cmmCallishMachOpFoldM _ op res [CmmLit (CmmInt x _), CmmLit (CmmInt y _)]
  = case op of
    MO_S_Mul2 rep
      | [rHiNeeded,rHi,rLo] <- res -> do
          let resSz = widthInBits rep
              resVal = (narrowS rep x) * (narrowS rep y)
              high = resVal `shiftR` resSz
              low = narrowS rep resVal
              isHiNeeded = high /= low `shiftR` resSz
              isHiNeededVal = if isHiNeeded then 1 else 0
          prependNode $! CmmAssign (CmmLocal rHiNeeded) (CmmLit $ CmmInt isHiNeededVal rep)
          prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt high rep)
          pure . Just $! CmmAssign (CmmLocal rLo) (CmmLit $ CmmInt low rep)
    MO_U_Mul2 rep
      | [rHi,rLo] <- res -> do
          let resSz = widthInBits rep
              resVal = (narrowU rep x) * (narrowU rep y)
              high = resVal `shiftR` resSz
              low = narrowU rep resVal
          prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt high rep)
          pure . Just $! CmmAssign (CmmLocal rLo) (CmmLit $ CmmInt low rep)
    MO_S_QuotRem rep
      | [rQuot, rRem] <- res,
        y /= 0 -> do
          let (q,r) = quotRem (narrowS rep x) (narrowS rep y)
          prependNode $! CmmAssign (CmmLocal rQuot) (CmmLit $ CmmInt q rep)
          pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt r rep)
    MO_U_QuotRem rep
      | [rQuot, rRem] <- res,
        y /= 0 -> do
          let (q,r) = quotRem (narrowU rep x) (narrowU rep y)
          prependNode $! CmmAssign (CmmLocal rQuot) (CmmLit $ CmmInt q rep)
          pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt r rep)
    _ -> pure Nothing

-- 0, 1 or -1 as one of the constants

cmmCallishMachOpFoldM _ op res [_, CmmLit (CmmInt 0 _)]
  = case op of
    -- x * 0 == 0
    MO_S_Mul2 rep
      | [rHiNeeded, rHi, rLo] <- res -> do
        prependNode $! CmmAssign (CmmLocal rHiNeeded) (CmmLit $ CmmInt 0 rep)
        prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt 0 rep)
        pure . Just $! CmmAssign (CmmLocal rLo) (CmmLit $ CmmInt 0 rep)
    -- x * 0 == 0
    MO_U_Mul2 rep
      | [rHi, rLo] <- res -> do
        prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt 0 rep)
        pure . Just $! CmmAssign (CmmLocal rLo) (CmmLit $ CmmInt 0 rep)
    _ -> pure Nothing

cmmCallishMachOpFoldM _ op res [CmmLit (CmmInt 0 _), _]
  = case op of
    -- 0 quotRem d == (0,0)
    MO_S_QuotRem rep
      | [rQuot, rRem] <- res -> do
      prependNode $! CmmAssign (CmmLocal rQuot) (CmmLit $ CmmInt 0 rep)
      pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt 0 rep)
    -- 0 quotRem d == (0,0)
    MO_U_QuotRem rep
      | [rQuot,rRem] <- res -> do
      prependNode $! CmmAssign (CmmLocal rQuot) (CmmLit $ CmmInt 0 rep)
      pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt 0 rep)
    _ -> pure Nothing

cmmCallishMachOpFoldM cfg op res [x, CmmLit (CmmInt 1 _)]
  = case op of
    -- x * 1 == x -- Note: The high word needs to be a sign extension of the low word, so we use a sign extending shift
    MO_S_Mul2 rep
      | [rHiNeeded, rHi, rLo] <- res -> do
        let platform = cmmPlatform cfg
            wordRep = wordWidth platform
            repInBits = toInteger $ widthInBits rep
        prependNode $! CmmAssign (CmmLocal rHiNeeded) (CmmLit $ CmmInt 0 rep)
        prependNode $! CmmAssign (CmmLocal rHi) (cmmMachOpFold platform (MO_S_Shr rep) [x, CmmLit $ CmmInt (repInBits - 1) wordRep])
        pure . Just $! CmmAssign (CmmLocal rLo) x
    -- x * 1 == x
    MO_U_Mul2 rep
      | [rHi, rLo] <- res -> do
        prependNode $! CmmAssign (CmmLocal rHi) (CmmLit $ CmmInt 0 rep)
        pure . Just $! CmmAssign (CmmLocal rLo) x
    -- x quotRem 1 == (x, 0)
    MO_S_QuotRem rep
      | [rQuot, rRem] <- res -> do
        prependNode $! CmmAssign (CmmLocal rQuot) x
        pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt 0 rep)
    -- x quotRem 1 == (x, 0)
    MO_U_QuotRem rep
      | [rQuot, rRem] <- res -> do
        prependNode $! CmmAssign (CmmLocal rQuot) x
        pure . Just $! CmmAssign (CmmLocal rRem) (CmmLit $ CmmInt 0 rep)
    _ -> pure Nothing

-- handle quotRem with a constant divisor

cmmCallishMachOpFoldM cfg op res [n, CmmLit (CmmInt d' _)]
  = case op of
    MO_S_QuotRem rep
      | Just p <- exactLog2 d,
        [rQuot,rRem] <- res -> do
          n' <- intoRegister n (cmmBits rep)
          -- first prepend the optimized division by a power 2
          prependNode $! CmmAssign (CmmLocal rQuot)
            (cmmMachOpFold platform (MO_S_Shr rep)
              [signedQuotRemHelper platform d n' rep p, CmmLit (CmmInt p $ wordWidth platform)])
          -- then output an optimized remainder by a power of 2
          pure . Just $! CmmAssign (CmmLocal rRem)
            (cmmMachOpFold platform (MO_Sub rep)
              [n', cmmMachOpFold platform (MO_And rep)
                [signedQuotRemHelper platform d n' rep p, CmmLit (CmmInt (- d) rep)]])
      | canOptimizeDivision cfg rep,
        d /= (-1), d /= 0, d /= 1,
        [rQuot,rRem] <- res -> do
          -- we are definitely going to use n multiple times, so put it into a register
          n' <- intoRegister n (cmmBits rep)
          -- generate an optimized (signed) division of n by d
          q <- generateDivisionBySigned platform cfg rep n' d
          -- we also need the result multiple times to calculate the remainder
          q' <- intoRegister q (cmmBits rep)

          prependNode $! CmmAssign (CmmLocal rQuot) q'
          -- The remainder now becomes n - q * d
          pure . Just $! CmmAssign (CmmLocal rRem) $ CmmMachOp (MO_Sub rep) [n', CmmMachOp (MO_Mul rep) [q', CmmLit $ CmmInt d rep]]
      where
        platform = cmmPlatform cfg
        d = narrowS rep d'
    MO_U_QuotRem rep
      | Just p <- exactLog2 d,
        [rQuot,rRem] <- res -> do
          -- first prepend the optimized division by a power 2
          prependNode $! CmmAssign (CmmLocal rQuot) $ CmmMachOp (MO_U_Shr rep) [n, CmmLit (CmmInt p $ wordWidth platform)]
          -- then output an optimized remainder by a power of 2
          pure . Just $! CmmAssign (CmmLocal rRem) $ CmmMachOp (MO_And rep) [n, CmmLit (CmmInt (d - 1) rep)]
      | canOptimizeDivision cfg rep,
        d /= 0, d /= 1,
        [rQuot,rRem] <- res -> do
          -- we are definitely going to use n multiple times, so put it into a register
          n' <- intoRegister n (cmmBits rep)
          -- generate an optimized (unsigned) division of n by d
          q <- generateDivisionByUnsigned platform cfg rep n' d
          -- we also need the result multiple times to calculate the remainder
          q' <- intoRegister q (cmmBits rep)

          prependNode $! CmmAssign (CmmLocal rQuot) q'
          -- The remainder now becomes n - q * d
          pure . Just $! CmmAssign (CmmLocal rRem) $ CmmMachOp (MO_Sub rep) [n', CmmMachOp (MO_Mul rep) [q', CmmLit $ CmmInt d rep]]
      where
        platform = cmmPlatform cfg
        d = narrowU rep d'
    _ -> pure Nothing

cmmCallishMachOpFoldM _ _ _ _ = pure Nothing

-- -----------------------------------------------------------------------------
-- Specialized constant folding for MachOps which sometimes need to expand into multiple nodes

cmmMachOpFoldOptM :: CmmConfig -> MachOp -> [CmmExpr] -> Opt (Maybe CmmExpr)

cmmMachOpFoldOptM cfg op [n, CmmLit (CmmInt d' _)] =
  case op of
    MO_S_Quot rep
      -- recheck for power of 2 division. This may not be handled by cmmMachOpFoldM if n is not in a register
      | Just p <- exactLog2 d -> do
        n' <- intoRegister n (cmmBits rep)
        pure . Just $! cmmMachOpFold platform (MO_S_Shr rep)
          [ signedQuotRemHelper platform d n' rep p
          , CmmLit (CmmInt p $ wordWidth platform)
          ]
      | canOptimizeDivision cfg rep,
        d /= (-1), d /= 0, d /= 1 -> Just <$!> generateDivisionBySigned platform cfg rep n d
      where d = narrowS rep d'
    MO_S_Rem rep
      -- recheck for power of 2 remainder. This may not be handled by cmmMachOpFoldM if n is not in a register
      | Just p <- exactLog2 d -> do
        n' <- intoRegister n (cmmBits rep)
        pure . Just $! cmmMachOpFold platform (MO_Sub rep)
          [ n'
          , cmmMachOpFold platform (MO_And rep)
              [ signedQuotRemHelper platform d n' rep p
              , CmmLit (CmmInt (- d) rep)
              ]
          ]
      | canOptimizeDivision cfg rep,
        d /= (-1), d /= 0, d /= 1 -> do
        n' <- intoRegister n (cmmBits rep)
        -- first generate the division
        q <- generateDivisionBySigned platform cfg rep n' d
        -- then calculate the remainder by n - q * d
        pure . Just $! CmmMachOp (MO_Sub rep) [n', CmmMachOp (MO_Mul rep) [q, CmmLit $ CmmInt d rep]]
      where d = narrowS rep d'
    MO_U_Quot rep
      -- No need to recheck power of 2 division because cmmMachOpFoldM always handles that case
      | canOptimizeDivision cfg rep,
        d /= 0, d /= 1, Nothing <- exactLog2 d -> Just <$!> generateDivisionByUnsigned platform cfg rep n d
      where d = narrowU rep d'
    MO_U_Rem rep
      -- No need to recheck power of 2 remainder because cmmMachOpFoldM always handles that case
      | canOptimizeDivision cfg rep,
        d /= 0, d /= 1, Nothing <- exactLog2 d -> do
        n' <- intoRegister n (cmmBits rep)
        -- first generate the division
        q <- generateDivisionByUnsigned platform cfg rep n d
        -- then calculate the remainder by n - q * d
        pure . Just $! CmmMachOp (MO_Sub rep) [n', CmmMachOp (MO_Mul rep) [q, CmmLit $ CmmInt d rep]]
      where d = narrowU rep d'
    _ -> pure Nothing
  where platform = cmmPlatform cfg

cmmMachOpFoldOptM _ _ _ = pure Nothing

-- -----------------------------------------------------------------------------
-- Utils for prepending new nodes

-- Move an expression into a register to possibly use it multiple times
intoRegister :: CmmExpr -> CmmType -> Opt CmmExpr
intoRegister e@(CmmReg _) _ = pure e
intoRegister expr ty = do
  u <- getUniqueM
  let reg = LocalReg u ty
  CmmReg (CmmLocal reg) <$ prependNode (CmmAssign (CmmLocal reg) expr)

prependNode :: CmmNode O O -> Opt ()
prependNode n = Opt $ \_ xs -> pure (xs ++ [n], ())

-- -----------------------------------------------------------------------------
-- Division by constants utils

-- Helper for division by a power of 2
-- In contrast with unsigned integers, for signed ones
-- shift right is not the same as quot, because it rounds
-- to minus infinity, whereas quot rounds toward zero.
-- To fix this up, we add one less than the divisor to the
-- dividend if it is a negative number.
--
-- to avoid a test/jump, we use the following sequence:
--      x1 = x >> word_size-1  (all 1s if -ve, all 0s if +ve)
--      x2 = y & (divisor-1)
--      result = x + x2
-- this could be done a bit more simply using conditional moves,
-- but we're processor independent here.
--
-- we optimize the divide by 2 case slightly, generating
--      x1 = x >> word_size-1  (unsigned)
--      return = x + x1
signedQuotRemHelper :: Platform -> Integer -> CmmExpr -> Width -> Integer -> CmmExpr
signedQuotRemHelper platform n x rep p = CmmMachOp (MO_Add rep) [x, x2]
  where
    bits = fromIntegral (widthInBits rep) - 1
    shr = if p == 1 then MO_U_Shr rep else MO_S_Shr rep
    x1 = CmmMachOp shr [x, CmmLit (CmmInt bits $ wordWidth platform)]
    x2 = if p == 1 then x1 else
          CmmMachOp (MO_And rep) [x1, CmmLit (CmmInt (n-1) rep)]

{- Note: [Division by constants]

Integer division is floor(n / d), the goal is to find m,p
such that floor((m * n) / 2^p) = floor(n / d).

The idea being: n/d = n * (1/d). But we cannot store 1/d in an integer without
some error, so we choose some 2^p / d such that the error ends up small and
thus vanishes when we divide by 2^p again.

The algorithm below to generate these numbers is taken from Hacker's Delight
Second Edition Chapter 10 "Integer division by constants". The chapter also
contains proof that this method does indeed produce correct results.

However this is a much more literal interpretation of the algorithm,
which we can use because of the unbounded Integer type. Hacker's Delight
also provides a much more complex algorithm which computes these numbers
without the need to exceed the word size, but that is not necessary here.
-}

generateDivisionBySigned :: Platform -> CmmConfig -> Width -> CmmExpr -> Integer -> Opt CmmExpr

-- Sanity checks, division will generate incorrect results or undesirable code for these cases
-- cmmMachOpFoldM and cmmMachOpFoldOptM should have already handled these cases!
generateDivisionBySigned _ _ _ _ 0 = panic "generate signed division with 0"
generateDivisionBySigned _ _ _ _ 1 = panic "generate signed division with 1"
generateDivisionBySigned _ _ _ _ (-1) = panic "generate signed division with -1"
generateDivisionBySigned _ _ _ _ d | Just _ <- exactLog2 d = panic $ "generate signed division with " ++ show d

generateDivisionBySigned platform _cfg rep n divisor = do
  -- We only duplicate n' if we actually need to add/subtract it, so we may not need it in a register
  n' <- if sign == 0 then pure n else intoRegister n resRep

  -- Set up mul2
  (shift', qExpr) <- mul2 n'

  -- add/subtract n if necessary
  let qExpr' = case sign of
        1  -> CmmMachOp (MO_Add rep) [qExpr, n']
        -1 -> CmmMachOp (MO_Sub rep) [qExpr, n']
        _  -> qExpr

  qExpr'' <- intoRegister (cmmMachOpFold platform (MO_S_Shr rep) [qExpr', CmmLit $ CmmInt shift' wordRep]) resRep

  -- Lastly add the sign of the quotient to correct for negative results
  pure $! cmmMachOpFold platform
    (MO_Add rep) [qExpr'', cmmMachOpFold platform (MO_U_Shr rep) [qExpr'', CmmLit $ CmmInt (toInteger $ widthInBits rep - 1) wordRep]]
  where
    resRep = cmmBits rep
    wordRep = wordWidth platform
    (magic, sign, shift) = divisionMagicS rep divisor
    -- generate the multiply with the magic number
    mul2 n
      -- Using mul2 for sub-word sizes regresses for signed integers only
      | rep == wordWidth platform = do
        (r1, r2, r3) <- (,,) <$> getUniqueM <*> getUniqueM <*> getUniqueM
        let rg1    = LocalReg r1 resRep
            resReg = LocalReg r2 resRep
            rg3    = LocalReg r3 resRep
        res <- CmmReg (CmmLocal resReg) <$ prependNode (CmmUnsafeForeignCall (PrimTarget (MO_S_Mul2 rep)) [rg1, resReg, rg3] [n, CmmLit $ CmmInt magic rep])
        pure (shift, res)
      -- widen the register and multiply without the MUL2 instruction
      -- if we don't need an additional add after this we can combine the shifts
      | otherwise = pure (if sign == 0 then 0 else shift, res)
          where
            wordRep = wordWidth platform
            -- (n * magic) >> widthInBits + (if sign == 0 then shift else 0) -- With conversion in between to not overflow
            res = cmmMachOpFold platform (MO_SS_Conv wordRep rep)
                    [ cmmMachOpFold platform (MO_S_Shr wordRep)
                      [ cmmMachOpFold platform (MO_Mul wordRep)
                        [ cmmMachOpFold platform (MO_SS_Conv rep wordRep) [n]
                        , CmmLit $ CmmInt magic wordRep
                        ]
                      -- Check if we need to generate an add/subtract later. If not we can combine this with the postshift
                      , CmmLit $ CmmInt ((if sign == 0 then toInteger shift else 0) + (toInteger $ widthInBits rep)) wordRep
                      ]
                    ]

-- See hackers delight for how and why this works (chapter in note [Division by constants])
divisionMagicS :: Width -> Integer -> (Integer, Integer, Integer)
divisionMagicS rep divisor = (magic, sign, toInteger $ p - wSz)
  where
    sign = if divisor > 0
      then if magic < 0 then 1 else 0
      else if magic < 0 then 0 else -1
    wSz = widthInBits rep
    ad = abs divisor
    t = (1 `shiftL` (wSz - 1)) + if divisor > 0 then 0 else 1
    anc = t - 1 - rem t ad
    go p'
      | twoP > anc * (ad - rem twoP ad) = p'
      | otherwise = go (p' + 1)
      where twoP = 1 `shiftL` p'
    p = go wSz
    am = (twoP + ad - rem twoP ad) `quot` ad
      where twoP = 1 `shiftL` p
    magic = narrowS rep $ if divisor > 0 then am else -am

generateDivisionByUnsigned :: Platform -> CmmConfig -> Width -> CmmExpr -> Integer -> Opt CmmExpr
-- Sanity checks, division will generate incorrect results or undesirable code for these cases
-- cmmMachOpFoldM and cmmMachOpFoldOptM should have already handled these cases!
generateDivisionByUnsigned _ _ _ _ 0 = panic "generate signed division with 0"
generateDivisionByUnsigned _ _ _ _ 1 = panic "generate signed division with 1"
generateDivisionByUnsigned _ _ _ _ d | Just _ <- exactLog2 d = panic $ "generate signed division with " ++ show d

generateDivisionByUnsigned platform cfg rep n divisor = do
  -- We only duplicate n' if we actually need to add/subtract it, so we may not need it in a register
  n' <- if not needsAdd -- Invariant: We also never preshift if we need an add, thus we don't need n in a register
    then pure $! cmmMachOpFold platform (MO_U_Shr rep) [n, CmmLit $ CmmInt preShift wordRep]
    else intoRegister n resRep

  -- Set up mul2
  (postShift', qExpr) <- mul2 n'

  -- add/subtract n if necessary
  let qExpr' = if needsAdd
        -- This is qExpr + (n - qExpr) / 2 = (qExpr + n) / 2 but with a guarantee that it'll not overflow
        then cmmMachOpFold platform (MO_Add rep)
          [ cmmMachOpFold platform (MO_U_Shr rep)
            [ cmmMachOpFold platform (MO_Sub rep) [n', qExpr]
            , CmmLit $ CmmInt 1 wordRep
            ]
          , qExpr
          ]
        else qExpr
      -- If we already divided by 2 in the add, remember to shift one bit less
      -- Hacker's Delight, Edition 2 Page 234: postShift > 0 if we needed an add, except if the divisor
      -- is 1, which we checked for above
      finalShift = if needsAdd then postShift' - 1 else postShift'

  -- apply the final postShift
  pure $! cmmMachOpFold platform (MO_U_Shr rep) [qExpr', CmmLit $ CmmInt finalShift wordRep]
  where
    resRep = cmmBits rep
    wordRep = wordWidth platform
    (preShift, magic, needsAdd, postShift) =
        let withPre = divisionMagicU rep True  divisor
            noPre   = divisionMagicU rep False divisor
        in case (withPre, noPre) of
          -- Use whatever does not cause us to take the expensive case
          ((_, _, False, _), (_, _, True, _)) -> withPre
          -- If we cannot avoid the expensive case, don't bother with the pre shift
          _ -> noPre
    -- generate the multiply with the magic number
    mul2 n
      | rep == wordWidth platform || (cmmAllowMul2 cfg && needsAdd) = do
        (r1, r2) <- (,) <$> getUniqueM <*> getUniqueM
        let rg1    = LocalReg r1 resRep
            resReg = LocalReg r2 resRep
        res <- CmmReg (CmmLocal resReg) <$ prependNode (CmmUnsafeForeignCall (PrimTarget (MO_U_Mul2 rep)) [resReg, rg1] [n, CmmLit $ CmmInt magic rep])
        pure (postShift, res)
      | otherwise = do
        pure (if needsAdd then postShift else 0, res)
          where
            wordRep = wordWidth platform
            -- (n * magic) >> widthInBits + (if sign == 0 then shift else 0) -- With conversion in between to not overflow
            res = cmmMachOpFold platform (MO_UU_Conv wordRep rep)
              [ cmmMachOpFold platform (MO_U_Shr wordRep)
                [ cmmMachOpFold platform (MO_Mul wordRep)
                  [ cmmMachOpFold platform (MO_UU_Conv rep wordRep) [n]
                  , CmmLit $ CmmInt magic wordRep
                  ]
                -- Check if we need to generate an add later. If not we can combine this with the postshift
                , CmmLit $ CmmInt ((if needsAdd then 0 else postShift) + (toInteger $ widthInBits rep)) wordRep
                ]
              ]

-- See hackers delight for how and why this works (chapter in note [Division by constants])
-- The preshift isn't described there, but the idea is:
-- If a divisor d has n trailing zeros, then d is a multiple of 2^n. Since we want to divide x by d
-- we can also calculate (x / 2^n) / (d / 2^n) which may then not require an extra addition.
--
-- The addition performs: quotient + dividend, but we need to avoid overflows, so we actually need to
-- calculate: quotient + (dividend - quotient) / 2 = (quotient + dividend) / 2
-- Thus if the preshift can avoid all of this, we have 1 operation in place of 3.
--
-- The decision to use the preshift is made somewhere else, here we only report if the addition is needed
divisionMagicU :: Width -> Bool -> Integer -> (Integer, Integer, Bool, Integer)
divisionMagicU rep doPreShift divisor = (toInteger zeros, magic, needsAdd, toInteger $ p - wSz)
  where
    wSz = widthInBits rep
    zeros = if doPreShift then countTrailingZeros $ fromInteger @Word64 divisor else 0
    d = divisor `shiftR` zeros
    ones = ((1 `shiftL` wSz) - 1) `shiftR` zeros
    nc = ones - rem (ones - d) d
    go p'
      | twoP > nc * (d - 1 - rem (twoP - 1) d) = p'
      | otherwise = go (p' + 1)
      where twoP = 1 `shiftL` p'
    p = go wSz
    m = (twoP + d - 1 - rem (twoP - 1) d) `quot` d
      where twoP = 1 `shiftL` p
    needsAdd = d < 1 `shiftL` (p - wSz)
    magic = if needsAdd then m - (ones + 1) else m

-- -----------------------------------------------------------------------------
-- Opt monad

newtype Opt a = OptI { runOptI :: CmmConfig -> [CmmNode O O] -> UniqDSM ([CmmNode O O], a) }

-- | Pattern synonym for 'Opt', as described in Note [The one-shot state
-- monad trick].
pattern Opt :: (CmmConfig -> [CmmNode O O] -> UniqDSM ([CmmNode O O], a)) -> Opt a
pattern Opt f <- OptI f
  where Opt f = OptI . oneShot $ \cfg -> oneShot $ \out -> f cfg out
{-# COMPLETE Opt #-}

runOpt :: CmmConfig -> Opt a -> UniqDSM ([CmmNode O O], a)
runOpt cf (Opt g) = g cf []

getConfig :: Opt CmmConfig
getConfig = Opt $ \cf xs -> pure (xs, cf)

instance Functor Opt where
  fmap f (Opt g) = Opt $ \cf xs -> fmap (fmap f) (g cf xs)

instance Applicative Opt where
  pure a = Opt $ \_ xs -> pure (xs, a)
  ff <*> fa = do
    f <- ff
    f <$> fa

instance Monad Opt where
  Opt g >>= f = Opt $ \cf xs -> do
    (ys, a) <- g cf xs
    runOptI (f a) cf ys

instance MonadGetUnique Opt where
  getUniqueM = Opt $ \_ xs -> (xs,) <$> getUniqueDSM

mapForeignTargetOpt :: (CmmExpr -> Opt CmmExpr) -> ForeignTarget -> Opt ForeignTarget
mapForeignTargetOpt exp   (ForeignTarget e c) = flip ForeignTarget c <$> exp e
mapForeignTargetOpt _   m@(PrimTarget _)      = pure m

wrapRecExpOpt :: (CmmExpr -> Opt CmmExpr) -> CmmExpr -> Opt CmmExpr
wrapRecExpOpt f (CmmMachOp op es)       = traverse (wrapRecExpOpt f) es >>= f . CmmMachOp op
wrapRecExpOpt f (CmmLoad addr ty align) = wrapRecExpOpt f addr >>= \newAddr -> f (CmmLoad newAddr ty align)
wrapRecExpOpt f e                       = f e

mapExpOpt :: (CmmExpr -> Opt CmmExpr) -> CmmNode e x -> Opt (CmmNode e x)
mapExpOpt _ f@(CmmEntry{})                          = pure f
mapExpOpt _ m@(CmmComment _)                        = pure m
mapExpOpt _ m@(CmmTick _)                           = pure m
mapExpOpt f   (CmmUnwind regs)                      = CmmUnwind <$> traverse (traverse (traverse f)) regs
mapExpOpt f   (CmmAssign r e)                       = CmmAssign r <$> f e
mapExpOpt f   (CmmStore addr e align)               = CmmStore <$> f addr <*> f e <*> pure align
mapExpOpt f   (CmmUnsafeForeignCall tgt fs as)      = CmmUnsafeForeignCall <$> mapForeignTargetOpt f tgt <*> pure fs <*> traverse f as
mapExpOpt _ l@(CmmBranch _)                         = pure l
mapExpOpt f   (CmmCondBranch e ti fi l)             = f e >>= \newE -> pure (CmmCondBranch newE ti fi l)
mapExpOpt f   (CmmSwitch e ids)                     = flip CmmSwitch ids <$> f e
mapExpOpt f   n@CmmCall {cml_target=tgt}            = f tgt >>= \newTgt -> pure n{cml_target = newTgt}
mapExpOpt f   (CmmForeignCall tgt fs as succ ret_args updfr intrbl)
                                                    = do
                                                      newTgt <- mapForeignTargetOpt f tgt
                                                      newAs <- traverse f as
                                                      pure $ CmmForeignCall newTgt fs newAs succ ret_args updfr intrbl