{-# language GADTs #-}
{-# language LambdaCase #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE BinaryLiterals #-}
{-# LANGUAGE OverloadedStrings #-}
module GHC.CmmToAsm.LA64.CodeGen (
      cmmTopCodeGen
    , generateJumpTableForInstr
    , makeFarBranches
)

where

import Data.Maybe
import Data.Word
import GHC.Cmm
import GHC.Cmm.BlockId
import GHC.Cmm.CLabel
import GHC.Cmm.Dataflow.Block
import GHC.Cmm.Dataflow.Graph
import GHC.Cmm.DebugBlock
import GHC.Cmm.Switch
import GHC.Cmm.Utils
import GHC.CmmToAsm.CPrim
import GHC.CmmToAsm.Config
import GHC.CmmToAsm.Format
import GHC.CmmToAsm.Monad
  ( NatM,
    getConfig,
    getDebugBlock,
    getFileId,
    getNewLabelNat,
    getNewRegNat,
    getPicBaseMaybeNat,
    getPlatform
  )
import GHC.CmmToAsm.PIC
import GHC.CmmToAsm.LA64.Cond
import GHC.CmmToAsm.LA64.Instr
import GHC.CmmToAsm.LA64.Regs
import GHC.CmmToAsm.Types
import GHC.Data.FastString
import GHC.Data.OrdList
import GHC.Float
import GHC.Platform
import GHC.Platform.Reg
import GHC.Platform.Regs
import GHC.Prelude hiding (EQ)
import GHC.Types.Basic
import GHC.Types.ForeignCall
import GHC.Types.SrcLoc (srcSpanFile, srcSpanStartCol, srcSpanStartLine)
import GHC.Types.Tickish (GenTickish (..))
import GHC.Utils.Constants (debugIsOn)
import GHC.Utils.Misc
import GHC.Utils.Outputable
import GHC.Utils.Panic
import GHC.Utils.Monad
import Control.Monad
import GHC.Cmm.Dataflow.Label
import GHC.Types.Unique.DSM

-- [General layout of an NCG]
cmmTopCodeGen ::
  RawCmmDecl ->
  NatM [NatCmmDecl RawCmmStatics Instr]
-- Thus we'll have to deal with either CmmProc ...
cmmTopCodeGen _cmm@(CmmProc info lab live graph) = do
  picBaseMb <- getPicBaseMaybeNat
  when (isJust picBaseMb) $ panic "LA64.cmmTopCodeGen: Unexpected PIC base register"

  let blocks = toBlockListEntryFirst graph
  (nat_blocks, statics) <- mapAndUnzipM basicBlockCodeGen blocks

  let proc = CmmProc info lab live (ListGraph $ concat nat_blocks)
      tops = proc : concat statics

  pure tops

-- ... or CmmData.
cmmTopCodeGen (CmmData sec dat) = pure [CmmData sec dat] -- no translation, we just use CmmStatic

basicBlockCodeGen ::
  Block CmmNode C C ->
  NatM
    ( [NatBasicBlock Instr],
      [NatCmmDecl RawCmmStatics Instr]
    )
basicBlockCodeGen block = do
  config <- getConfig
  let (_, nodes, tail) = blockSplit block
      id = entryLabel block
      stmts = blockToList nodes

      header_comment_instr
        | debugIsOn =
            unitOL
              $ MULTILINE_COMMENT
                ( text "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
                    $+$ withPprStyle defaultDumpStyle (pdoc (ncgPlatform config) block)
                )
        | otherwise = nilOL

  -- Generate location directive `.loc` (DWARF debug location info)
  loc_instrs <- genLocInstrs

  -- Generate other instructions
  mid_instrs <- stmtsToInstrs stmts
  (!tail_instrs) <- stmtToInstrs tail

  let instrs = header_comment_instr `appOL` loc_instrs `appOL` mid_instrs `appOL` tail_instrs

      -- TODO: Then x86 backend runs @verifyBasicBlock@ here. How important it is to
      -- have a valid CFG is an open question: This and the AArch64 and PPC NCGs
      -- work fine without it.

      -- Code generation may introduce new basic block boundaries, which are
      -- indicated by the NEWBLOCK instruction. We must split up the instruction
      -- stream into basic blocks again. Also, we extract LDATAs here too.
      (top, other_blocks, statics) = foldrOL mkBlocks ([], [], []) instrs

  return (BasicBlock id top : other_blocks, statics)
  where
    genLocInstrs :: NatM (OrdList Instr)
    genLocInstrs = do
      dbg <- getDebugBlock (entryLabel block)
      case dblSourceTick =<< dbg of
        Just (SourceNote span name) ->
          do
            fileId <- getFileId (srcSpanFile span)
            let line = srcSpanStartLine span; col = srcSpanStartCol span
            pure $ unitOL $ LOCATION fileId line col name
        _ -> pure nilOL

mkBlocks ::
  Instr ->
  ([Instr], [GenBasicBlock Instr], [GenCmmDecl RawCmmStatics h g]) ->
  ([Instr], [GenBasicBlock Instr], [GenCmmDecl RawCmmStatics h g])
mkBlocks (NEWBLOCK id) (instrs, blocks, statics) =
  ([], BasicBlock id instrs : blocks, statics)
mkBlocks (LDATA sec dat) (instrs, blocks, statics) =
  (instrs, blocks, CmmData sec dat : statics)
mkBlocks instr (instrs, blocks, statics) =
  (instr : instrs, blocks, statics)

-- -----------------------------------------------------------------------------
-- | Utilities

-- | Annotate an `Instr` with a `SDoc` comment
ann :: SDoc -> Instr -> Instr
ann doc instr {- debugIsOn -} = ANN doc instr
{-# INLINE ann #-}

-- Using pprExpr will hide the AST, @ANN@ will end up in the assembly with
-- -dppr-debug.  The idea is that we can trivially see how a cmm expression
-- ended up producing the assembly we see.  By having the verbatim AST printed
-- we can simply check the patterns that were matched to arrive at the assembly
-- we generated.
--
-- pprExpr will hide a lot of noise of the underlying data structure and print
-- the expression into something that can be easily read by a human. However
-- going back to the exact CmmExpr representation can be laborious and adds
-- indirections to find the matches that lead to the assembly.
--
-- An improvement oculd be to have
--
--    (pprExpr genericPlatform e) <> parens (text. show e)
--
-- to have the best of both worlds.
--
-- Note: debugIsOn is too restrictive, it only works for debug compilers.
-- However, we do not only want to inspect this for debug compilers. Ideally
-- we'd have a check for -dppr-debug here already, such that we don't even
-- generate the ANN expressions. However, as they are lazy, they shouldn't be
-- forced until we actually force them, and without -dppr-debug they should
-- never end up being forced.
annExpr :: CmmExpr -> Instr -> Instr
annExpr e {- debugIsOn -} = ANN (text . show $ e)
-- annExpr e instr {- debugIsOn -} = ANN (pprExpr genericPlatform e) instr
-- annExpr _ instr = instr
{-# INLINE annExpr #-}

-- -----------------------------------------------------------------------------
-- Generating a table-branch
-- The index into the jump table is calulated by evaluating @expr@. The
-- corresponding table entry contains the address to jump to.
genSwitch :: NCGConfig -> CmmExpr -> SwitchTargets -> NatM InstrBlock
genSwitch config expr targets = do
  (reg, fmt1, e_code) <- getSomeReg indexExpr
  targetReg <- getNewRegNat II64
  lbl <- getNewLabelNat
  dynRef <- cmmMakeDynamicReference config DataReference lbl
  (tableReg, fmt2, t_code) <- getSomeReg $ dynRef
  let code =
        toOL [ COMMENT (text "indexExpr" <+> (text . show) indexExpr)
             , COMMENT (text "dynRef" <+> (text . show) dynRef)
             ]
          `appOL` e_code
          `appOL` t_code
          `appOL` toOL
            [
              COMMENT (ftext "Jump table for switch"),
              -- index to offset into the table (relative to tableReg)
              annExpr expr (SLL (OpReg W64 reg) (OpReg (formatToWidth fmt1) reg) (OpImm (ImmInt 3))),
              -- calculate table entry address
              ADD (OpReg W64 targetReg) (OpReg W64 reg) (OpReg (formatToWidth fmt2) tableReg),
              -- load table entry (relative offset from tableReg (first entry) to target label)
              LDU II64 (OpReg W64 targetReg) (OpAddr (AddrRegImm targetReg (ImmInt 0))),
              -- calculate absolute address of the target label
              ADD (OpReg W64 targetReg) (OpReg W64 targetReg) (OpReg W64 tableReg),
              -- prepare jump to target label
              J_TBL bids (Just lbl) targetReg
            ]
  return code
  where
    platform = ncgPlatform config
    expr_w = cmmExprWidth platform expr
    indexExpr0 = cmmOffset platform expr offset
    -- Widen to a native-width register(addressing modes)
    indexExpr = CmmMachOp
        (MO_UU_Conv expr_w (platformWordWidth platform))
        [indexExpr0]
    (offset, bids) = switchTargetsToTable targets


-- Generate jump table data (if required)
--
-- Relies on PIC relocations. The idea is to emit one table entry per case. The
-- entry is the label of the block to jump to. This will be relocated to be the
-- address of the jump target.
generateJumpTableForInstr ::
  NCGConfig ->
  Instr ->
  Maybe (NatCmmDecl RawCmmStatics Instr)
generateJumpTableForInstr config (J_TBL ids (Just lbl) _) =
  let jumpTable =
        map jumpTableEntryRel ids
        where
          jumpTableEntryRel Nothing =
            CmmStaticLit (CmmInt 0 (ncgWordWidth config))
          jumpTableEntryRel (Just blockid) =
            CmmStaticLit
              ( CmmLabelDiffOff
                  blockLabel
                  lbl
                  0
                  (ncgWordWidth config)
              )
            where
              blockLabel = blockLbl blockid
   in Just (CmmData (Section ReadOnlyData lbl) (CmmStaticsRaw lbl jumpTable))
generateJumpTableForInstr _ _ = Nothing

-- -----------------------------------------------------------------------------
-- Top-level of the instruction selector
stmtsToInstrs ::
  -- | Cmm Statements
  [CmmNode O O] ->
  -- | Resulting instruction
  NatM InstrBlock
stmtsToInstrs stmts = concatOL <$> mapM stmtToInstrs stmts

stmtToInstrs ::
  CmmNode e x ->
  -- | Resulting instructions
  NatM InstrBlock

stmtToInstrs stmt = do
  config <- getConfig
  platform <- getPlatform
  case stmt of
    CmmUnsafeForeignCall target result_regs args
      -> genCCall target result_regs args

    CmmComment s   -> return (unitOL (COMMENT (ftext s)))
    CmmTick {}     -> return nilOL

    CmmAssign reg src
      | isFloatType ty         -> assignReg_FltCode format reg src
      | otherwise              -> assignReg_IntCode format reg src
        where ty = cmmRegType reg
              format = cmmTypeFormat ty

    CmmStore addr src _alignment
      | isFloatType ty         -> assignMem_FltCode format addr src
      | otherwise              -> assignMem_IntCode format addr src
        where ty = cmmExprType platform src
              format = cmmTypeFormat ty

    CmmBranch id          -> genBranch id

    --We try to arrange blocks such that the likely branch is the fallthrough
    --in GHC.Cmm.ContFlowOpt. So we can assume the condition is likely false here.
    CmmCondBranch arg true false _prediction ->
        genCondBranch true false arg

    CmmSwitch arg ids -> genSwitch config arg ids

    CmmCall { cml_target = arg } -> genJump arg

    CmmUnwind _regs -> pure nilOL

    _ ->  pprPanic "stmtToInstrs: statement should have been cps'd away" (pdoc platform stmt)

-- | 'InstrBlock's are the insn sequences generated by the insn selectors.
--  They are really trees of insns to facilitate fast appending, where a
--  left-to-right traversal yields the insns in the correct order.
type InstrBlock =
  OrdList Instr

-- | Register's passed up the tree.
--  If the stix code forces the register to live in a pre-decided machine
--  register, it comes out as @Fixed@; otherwise, it comes out as @Any@, and the
--  parent can decide which register to put it in.
data Register
  = Fixed Format Reg InstrBlock
  | Any Format (Reg -> InstrBlock)

-- | Sometimes we need to change the Format of a register. Primarily during
--  conversion.
swizzleRegisterRep :: Format -> Register -> Register
swizzleRegisterRep format' (Fixed _ reg code) = Fixed format' reg code
swizzleRegisterRep format' (Any _ codefn) = Any format' codefn

-- | Grab a `Reg` for a `CmmReg`
getRegisterReg :: Platform -> CmmReg -> Reg

getRegisterReg _ (CmmLocal (LocalReg u pk))
  = RegVirtual $ mkVirtualReg u (cmmTypeFormat pk)

getRegisterReg platform (CmmGlobal mid)
  = case globalRegMaybe platform (globalRegUse_reg mid) of
        Just reg -> RegReal reg
        Nothing  -> pprPanic "getRegisterReg-memory" (ppr $ CmmGlobal mid)

-- General things for putting together code sequences

-- | Compute an expression into any register
getSomeReg :: CmmExpr -> NatM (Reg, Format, InstrBlock)
getSomeReg expr = do
  r <- getRegister expr
  case r of
    Any rep code -> do
        tmp <- getNewRegNat rep
        return (tmp, rep, code tmp)
    Fixed rep reg code ->
        return (reg, rep, code)

-- | Compute an expression into any floating-point register

-- | Compute an expression into floating point register
--  If the initial expression is not a floating-point expression, finally move
--  the result into a floating-point register.
getFloatReg :: HasCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock)
getFloatReg expr = do
  r <- getRegister expr
  case r of
    Any rep code | isFloatFormat rep -> do
      tmp <- getNewRegNat rep
      return (tmp, rep, code tmp)
    Any II32 code -> do
      tmp <- getNewRegNat FF32
      return (tmp, FF32, code tmp)
    Any II64 code -> do
      tmp <- getNewRegNat FF64
      return (tmp, FF64, code tmp)
    Any _w _code -> do
      config <- getConfig
      pprPanic "can't do getFloatReg on" (pdoc (ncgPlatform config) expr)
    -- can't do much for fixed.
    Fixed rep reg code ->
      return (reg, rep, code)

-- | Map `CmmLit` to `OpImm`
litToImm' :: CmmLit -> Operand
litToImm' = OpImm . litToImm

-- Handling PIC on LA64
-- Commonly, `PIC` means of `position independent code`, that to say, the execution
-- of code does not be influenced by Load_address. Through PC-Relative addressing
-- or GOT addressing, both can be used to implement `PIC`.
--
-- For LoongArch's common compiler(GCC, Clang), they generate PIC code by default
-- without condition. The command option `-fPIC` dicates to generate code for
-- shared-library. If not just specified for shared-library, another option `-fPIE`
-- was be created.
--
-- Like RV64, LA64 does not have a special PIC register, the general approach is to
-- simply do PC-relative addressing or go through the GOT. There is assembly support
-- for both.
--
-- LA64 assembly has many `la*` (load address) pseudo-instructions, that allows
-- loading a symbols's address into a register. These instructions is desugared into
-- different addressing modes. See following:
--
-- la        rd, label + addend  -> Load global symbol
-- la.global rd, label + addend  -> Same as `la`
-- la.local  rd, label + addend  -> Load local symbol
-- la.pcrel  rd, label + addend
-- la.got    rd, label
-- la.abs    rd, label + addend
--
-- `la` is alias of `la.global`. Commonly recommended use `la.local` and `la.global`.
--
-- PC-relative addressing:
--   pcalau12i $a0, %pc_hi20(a)
--   addi.d    $a0, $a0, %pc_lo12(a)
--
-- GOT addressing:
--   pcalau12i $a0, %got_pc_hi20(global_a)
--   ld.d      $a0, $a0, %got_pc_lo12(global_a)
--
-- PIC can be enabled/disabled through:
--  .option pic
--
-- CmmGlobal @PicBaseReg@'s are generated in @GHC.CmmToAsm.PIC@ in the
-- @cmmMakePicReference@.  This is in turn called from @cmmMakeDynamicReference@
-- also in @Cmm.CmmToAsm.PIC@ from where it is also exported.  There are two
-- callsites for this. One is in this module to produce the @target@ in @genCCall@
-- the other is in @GHC.CmmToAsm@ in @cmmExprNative@.
--
-- Conceptually we do not want any special PicBaseReg to be used on LA64. If
-- we want to distinguish between symbol loading, we need to address this through
-- the way we load it, not through a register.

-- Compute a `CmmExpr` into a `Register`
getRegister :: CmmExpr -> NatM Register
getRegister e = do
  config <- getConfig
  getRegister' config (ncgPlatform config) e

-- Signed arithmetic on LoongArch64
--
-- Handling signed arithmetic on sub-word-size values on LA64 is a bit tricky
-- as Cmm's type system does not capture signedness. While 32- and 64-bit
-- values are fairly easy to handle due to LA64's 32- and 64-bit instructions
-- with responding register, 8- and 16-bit values require quite some care.
--
-- For LoongArch64, EXT.W.[B/H] will sign-extend 8- and 16-bit to 64-bit.
-- However, it is best to use EXT instruction only if the input and
-- output data widths are fully determined.
--
-- We handle 16-and 8-bit values by using the following two steps:
--  1. Sign- or Zero-extending operands.
--  2. Truncate results as necessary.
--
-- For simplicity we maintain the invariant that a register containing a
-- sub-word-size value always contains the zero-extended form of that value
-- in between operations.

getRegister' :: NCGConfig -> Platform -> CmmExpr -> NatM Register

-- OPTIMIZATION WARNING: CmmExpr rewrites

-- Generic case.
getRegister' config plat expr =
  case expr of
    CmmReg (CmmGlobal (GlobalRegUse PicBaseReg _)) ->
      pprPanic "getRegisterReg-memory" (ppr PicBaseReg)

    CmmLit lit ->
      case lit of
        CmmInt 0 w -> pure $ Fixed (intFormat w) zeroReg nilOL
        CmmInt i w -> do
          -- narrowU is important: Negative immediates may be
          -- sign-extended on load!
          let imm = OpImm . ImmInteger $ narrowU w i
          return (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg w dst) imm)))

        CmmFloat 0 w -> do
          let op = litToImm' lit
          pure (Any (floatFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg w dst) op)))

        CmmFloat _f W8  -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (pdoc plat expr)
        CmmFloat _f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (pdoc plat expr)

        CmmFloat f W32 -> do
          let word = castFloatToWord32 (fromRational f) :: Word32
          tmp <- getNewRegNat (intFormat W32)
          return (Any (floatFormat W32) (\dst -> toOL [ annExpr expr
                                                      $ MOV (OpReg W32 tmp) (OpImm (ImmInteger (fromIntegral word)))
                                                      , MOV (OpReg W32 dst) (OpReg W32 tmp)
                                                      ]))
        CmmFloat f W64 -> do
          let word = castDoubleToWord64 (fromRational f) :: Word64
          tmp <- getNewRegNat (intFormat W64)
          return (Any (floatFormat W64) (\dst -> toOL [ annExpr expr
                                                      $ MOV (OpReg W64 tmp) (OpImm (ImmInteger (fromIntegral word)))
                                                      , MOV (OpReg W64 dst) (OpReg W64 tmp)
                                                      ]))

        CmmFloat _f _w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (pdoc plat expr)
        CmmVec _lits -> pprPanic "getRegister' (CmmLit:CmmVec): " (pdoc plat expr)

        CmmLabel lbl -> do
          let op = OpImm (ImmCLbl lbl)
              rep = cmmLitType plat lit
              format = cmmTypeFormat rep
          return (Any format (\dst -> unitOL $ annExpr expr (LD format (OpReg (formatToWidth format) dst) op)))

        CmmLabelOff lbl off | isNbitEncodeable 12 (fromIntegral off) -> do
          let op = OpImm (ImmIndex lbl off)
              rep = cmmLitType plat lit
              format = cmmTypeFormat rep
          return (Any format (\dst -> unitOL $ LD format (OpReg (formatToWidth format) dst) op))

        CmmLabelOff lbl off -> do
          let op = litToImm' (CmmLabel lbl)
              rep = cmmLitType plat lit
              format = cmmTypeFormat rep
              width = typeWidth rep
          (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
          return (Any format (\dst -> off_code `snocOL`
                                                LD format (OpReg (formatToWidth format) dst) op `snocOL`
                                                ADD (OpReg W64 dst) (OpReg width dst) (OpReg width off_r)
                             ))

        CmmLabelDiffOff {} -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
        CmmBlock _ -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)
        CmmHighStackMark -> pprPanic "getRegister' (CmmLit:CmmLabelOff): " (pdoc plat expr)

    CmmLoad mem rep _ -> do
      let format = cmmTypeFormat rep
          width = typeWidth rep
      Amode addr addr_code <- getAmode plat width mem
      case width of
        w | w `elem` [W8, W16, W32, W64] ->
            -- Load without sign-extension.
            pure (Any format (\dst ->
              addr_code `snocOL`
              LDU format (OpReg width dst) (OpAddr addr))
                              )
        _ -> pprPanic ("Unknown width to load: " ++ show width) (pdoc plat expr)

    CmmStackSlot _ _  -> pprPanic "getRegister' (CmmStackSlot): " (pdoc plat expr)

    CmmReg reg -> return (Fixed (cmmTypeFormat (cmmRegType reg))
                                (getRegisterReg plat reg)
                                nilOL
                         )

    CmmRegOff reg off | isNbitEncodeable 12 (fromIntegral off) -> do
      getRegister' config plat
        $ CmmMachOp (MO_Add width) [CmmReg reg, CmmLit (CmmInt (fromIntegral off) width)]
      where
        width = typeWidth (cmmRegType reg)
    CmmRegOff reg off -> do
      (off_r, _off_format, off_code) <- getSomeReg $ CmmLit (CmmInt (fromIntegral off) width)
      (reg, _format, code) <- getSomeReg $ CmmReg reg
      return $ Any (intFormat width) ( \dst ->
                                        off_code `appOL`
                                        code `snocOL`
                                        ADD (OpReg W64 dst) (OpReg width reg) (OpReg width off_r)
                                     )
      where
        width = typeWidth (cmmRegType reg)

    -- Handle MO_RelaxedRead as a normal CmmLoad, to allow
    -- non-trivial addressing modes to be used.
    CmmMachOp (MO_RelaxedRead w) [e] ->
      getRegister (CmmLoad e (cmmBits w) NaturallyAligned)

    -- for MachOps, see GHC.Cmm.MachOp
    -- For CmmMachOp, see GHC.Cmm.Expr
    CmmMachOp op [e] -> do
      (reg, format, code) <- getSomeReg e
      case op of
        MO_Not w -> return $ Any (intFormat w) $ \dst ->
          code `appOL`
          -- pseudo instruction `not dst rd` is `nor dst, r0, rd`
          truncateReg (formatToWidth format) W64 reg `snocOL`
          -- At this point an 8- or 16-bit value would be zero-extended
          -- to 64-bits. Truncate back down the final width.
          ann (text "not") (NOR (OpReg W64 dst) (OpReg W64 reg) zero) `appOL`
          truncateReg W64 w dst

        MO_S_Neg w -> negate code w reg
        MO_F_Neg w -> return $ Any (floatFormat w) (\dst -> code `snocOL` FNEG (OpReg w dst) (OpReg w reg))

        -- Floating convertion oprations
        -- Float -> Float
        MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` FCVT (OpReg to dst) (OpReg from reg))

        -- Signed int -> Float
        MO_SF_Round from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` SCVTF (OpReg to dst) (OpReg from reg))

        -- Float -> Signed int
        MO_FS_Truncate from to | from == W32 -> do
            tmp <- getNewRegNat FF32
            return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from tmp) (OpReg from reg))

        MO_FS_Truncate from to | from == W64-> do
            tmp <- getNewRegNat FF64
            return $ Any (intFormat to) (\dst -> code `snocOL` FCVTZS (OpReg to dst) (OpReg from tmp) (OpReg from reg))

        -- unsigned int -> unsigned int
        MO_UU_Conv from to -> return $ Any (intFormat to) (\dst ->
          code `snocOL` BSTRPICK II64 (OpReg W64 dst) (OpReg W64 reg) (OpImm (ImmInt (widthToInt (min from to) - 1))) (OpImm (ImmInt 0))
                                                          )

        -- Signed int -> Signed int
        MO_SS_Conv from to -> ss_conv from to reg code

        -- int -> int
        MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e

        MO_WF_Bitcast w    -> return $ Any (floatFormat w)  (\dst -> code `snocOL` MOV (OpReg w dst) (OpReg w reg))
        MO_FW_Bitcast w    -> return $ Any (intFormat w)    (\dst -> code `snocOL` MOV (OpReg w dst) (OpReg w reg))

        x -> pprPanic ("getRegister' (monadic CmmMachOp): " ++ show x) (pdoc plat expr)
      where
        -- In the case of 32- or 16- or 8-bit values we need to sign-extend to 64-bits
        negate code w reg
          | w `elem` [W8, W16] = do
            return $ Any (intFormat w) $ \dst ->
                code `snocOL`
                EXT (OpReg W64 reg) (OpReg w reg) `snocOL`
                NEG (OpReg W64 dst) (OpReg W64 reg) `appOL`
                truncateReg W64 w dst
          | otherwise = do
            return $ Any (intFormat w) $ \dst ->
                code `snocOL`
                NEG (OpReg W64 dst) (OpReg w reg)

        ss_conv from to reg code
          | from `elem` [W8, W16] || to `elem` [W8, W16] = do
            return $ Any (intFormat to) $ \dst ->
                code `snocOL`
                EXT (OpReg W64 dst) (OpReg (min from to) reg) `appOL`
                -- At this point an 8- or 16-bit value would be sign-extended
                -- to 64-bits. Truncate back down the final width.
                truncateReg W64 to dst
          | from == W32 && to == W64 = do
            return $ Any (intFormat to) $ \dst ->
                code `snocOL`
                SLL (OpReg to dst) (OpReg from reg) (OpImm (ImmInt 0))
          | from == to = do
            return $ Any (intFormat from) $ \dst ->
                 code `snocOL` MOV (OpReg from dst) (OpReg from reg)
          | otherwise = do
            return $ Any (intFormat to) $ \dst ->
                code `appOL`
                signExtend from W64 reg dst `appOL`
                truncateReg W64 to dst


-- Dyadic machops:
    --
    -- The general idea is:
    -- compute x<i> <- x
    -- compute x<j> <- y
    -- OP x<r>, x<i>, x<j>
    --
    -- TODO: for now we'll only implement the 64bit versions. And rely on the
    --      fallthrough to alert us if things go wrong!
    -- OPTIMIZATION WARNING: Dyadic CmmMachOp destructuring
    -- 0. TODO This should not exist! Rewrite: Reg +- 0 -> Reg
    CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
    CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'

    CmmMachOp (MO_Add w) [x, CmmLit (CmmInt n _)] | fitsInNbits 12 (fromIntegral n) -> do
      if w `elem` [W8, W16]
        then do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst ->
                                        code_x `snocOL`
                                        annExpr expr (EXT (OpReg W64 reg_x) (OpReg w reg_x)) `snocOL`
                                        ADD (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n))
                                     )
        else do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (ADD (OpReg W64 dst) (OpReg w reg_x) (OpImm (ImmInteger n))))

    CmmMachOp (MO_Sub w) [x, CmmLit (CmmInt n _)] | fitsInNbits 12 (fromIntegral n) -> do
      if w `elem` [W8, W16]
        then do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst ->
                                        code_x `snocOL`
                                        annExpr expr (EXT (OpReg W64 reg_x) (OpReg w reg_x)) `snocOL`
                                        SUB (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n))
                                     )
        else do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (SUB (OpReg W64 dst) (OpReg w reg_x) (OpImm (ImmInteger n))))

    CmmMachOp (MO_U_Quot w) [x, y]
      | w `elem` [W8, W16] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) (\dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      truncateReg w W64 reg_x `appOL`
                                      truncateReg w W64 reg_y `snocOL`
                                      annExpr expr (DIVU (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    -- 2. Shifts.
    CmmMachOp (MO_Shl w) [x, y] ->
      case y of
        CmmLit (CmmInt n _) | w `elem` [W8, W16], 0 <= n, n < fromIntegral (widthInBits w) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst ->
                                        code_x `snocOL`
                                        annExpr expr (EXT (OpReg W64 reg_x) (OpReg w reg_x)) `snocOL`
                                        SLL (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n))
                                     )
        CmmLit (CmmInt n _) | 0 <= n, n < fromIntegral (widthInBits w) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (SLL (OpReg W64 dst) (OpReg w reg_x) (OpImm (ImmInteger n))))

        _ | w `elem` [W8, W16] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `snocOL`
                                        annExpr expr (EXT (OpReg W64 reg_x) (OpReg w reg_x)) `snocOL`
                                        EXT (OpReg W64 reg_y) (OpReg w reg_y) `snocOL`
                                        SLL (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y)
                                     )
        _ -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `snocOL`
                                        annExpr expr (SLL (OpReg W64 dst) (OpReg w reg_x) (OpReg w reg_y))
                                     )

    -- MO_S_Shr: signed-shift-right
    CmmMachOp (MO_S_Shr w) [x, y] ->
      case y of
        CmmLit (CmmInt n _) | w `elem` [W8, W16], 0 <= n, n < fromIntegral (widthInBits w) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w)  (\dst ->
                                        code_x `snocOL`
                                        annExpr expr (EXT (OpReg W64 reg_x) (OpReg w reg_x)) `snocOL`
                                        SRA (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n))
                                      )
        CmmLit (CmmInt n _) | 0 <= n, n < fromIntegral (widthInBits w) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (SRA (OpReg W64 dst) (OpReg w reg_x) (OpImm (ImmInteger n))))

        _ | w `elem` [W8, W16] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `snocOL`
                                        annExpr expr (EXT (OpReg W64 reg_x) (OpReg w reg_x)) `snocOL`
                                        EXT (OpReg W64 reg_y) (OpReg w reg_y) `snocOL`
                                        SRA (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y)
                                     )
        _ -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `snocOL`
                                        annExpr expr (SRA (OpReg W64 dst) (OpReg w reg_x) (OpReg w reg_y))
                                     )

    -- MO_U_Shr: unsigned-shift-right
    CmmMachOp (MO_U_Shr w) [x, y] ->
      case y of
        CmmLit (CmmInt n _) | w `elem` [W8, W16], 0 <= n, n < fromIntegral (widthInBits w) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        truncateReg w W64 reg_x `snocOL`
                                        annExpr expr (SRL (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n)))
                                     )
        CmmLit (CmmInt n _) | 0 <= n, n < fromIntegral (widthInBits w) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (SRL (OpReg W64 dst) (OpReg w reg_x) (OpImm (ImmInteger n))))

        _ | w `elem` [W8, W16] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `appOL`
                                        truncateReg w W64 reg_x `appOL`
                                        truncateReg w W64 reg_y `snocOL`
                                        annExpr expr (SRL (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                     )
        _ -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `snocOL`
                                        annExpr expr (SRL (OpReg W64 dst) (OpReg w reg_x) (OpReg w reg_y))
                                     )

    -- 3. Logic &&, ||
    -- andi Instr's Imm-operand is zero-extended.
    CmmMachOp (MO_And w) [x, y] ->
      case y of
        CmmLit (CmmInt n _) | w `elem` [W8, W16, W32], (n :: Integer) >= 0, (n :: Integer) <= 4095 -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        truncateReg w W64 reg_x `snocOL`
                                        annExpr expr (AND (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n)))
                                     )

        CmmLit (CmmInt n _) | (n :: Integer) >= 0, (n :: Integer) <= 4095 -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (AND (OpReg W64 dst) (OpReg w reg_x) (OpImm (ImmInteger n))))

        CmmLit (CmmInt n _) | w `elem` [W8, W16, W32] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          tmp <- getNewRegNat II64
          return $ Any (intFormat w) (\dst ->
                                       code_x `appOL`
                                       truncateReg w W64 reg_x `snocOL`
                                       annExpr expr (MOV (OpReg W64 tmp) (OpImm (ImmInteger n))) `snocOL`
                                       AND (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 tmp)
                                     )

        CmmLit (CmmInt n _) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          tmp <- getNewRegNat II64
          return $ Any (intFormat w) (\dst ->
                                        code_x `snocOL`
                                        annExpr expr (MOV (OpReg W64 tmp) (OpImm (ImmInteger  n))) `snocOL`
                                        AND (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 tmp)
                                     )

        _ | w `elem` [W8, W16, W32] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `appOL`
                                        truncateReg w W64 reg_x `appOL`
                                        truncateReg w W64 reg_y `snocOL`
                                        annExpr expr (AND (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                     )

        _ -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `snocOL`
                                        annExpr expr (AND (OpReg W64 dst) (OpReg w reg_x) (OpReg w reg_y))
                                     )

    -- ori Instr's Imm-operand is zero-extended.
    CmmMachOp (MO_Or w) [x, y] ->
      case y of
        CmmLit (CmmInt n _) | w `elem` [W8, W16, W32], (n :: Integer) >= 0, (n :: Integer) <= 4095 -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        truncateReg w W64 reg_x `snocOL`
                                        annExpr expr (OR (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n)))
                                     )

        CmmLit (CmmInt n _) | (n :: Integer) >= 0, (n :: Integer) <= 4095 -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (OR (OpReg W64 dst) (OpReg w reg_x) (OpImm (ImmInteger n))))

        CmmLit (CmmInt n _) | w `elem` [W8, W16, W32] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          tmp <- getNewRegNat II64
          return $ Any (intFormat w) (\dst ->
                                       code_x `appOL`
                                       truncateReg w W64 reg_x `snocOL`
                                       annExpr expr (MOV (OpReg W64 tmp) (OpImm (ImmInteger n))) `snocOL`
                                       OR (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 tmp)
                                     )

        CmmLit (CmmInt n _) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          tmp <- getNewRegNat II64
          return $ Any (intFormat w) (\dst ->
                                        code_x `snocOL`
                                        annExpr expr (MOV (OpReg W64 tmp) (OpImm (ImmInteger  n))) `snocOL`
                                        OR (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 tmp)
                                     )

        _ | w `elem` [W8, W16, W32] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `appOL`
                                        truncateReg w W64 reg_x `appOL`
                                        truncateReg w W64 reg_y `snocOL`
                                        annExpr expr (OR (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                     )

        _ -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `snocOL`
                                        annExpr expr (OR (OpReg W64 dst) (OpReg w reg_x) (OpReg w reg_y))
                                     )

    -- xori Instr's Imm-operand is zero-extended.
    CmmMachOp (MO_Xor w) [x, y] ->
      case y of
        CmmLit (CmmInt n _) | w `elem` [W8, W16, W32], (n :: Integer) >= 0, (n :: Integer) <= 4095 -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        truncateReg w W64 reg_x `snocOL`
                                        annExpr expr (XOR (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n)))
                                     )

        CmmLit (CmmInt n _) | (n :: Integer) >= 0, (n :: Integer) <= 4095 -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          return $ Any (intFormat w) (\dst -> code_x `snocOL` annExpr expr (XOR (OpReg W64 dst) (OpReg w reg_x) (OpImm (ImmInteger n))))

        CmmLit (CmmInt n _) | w `elem` [W8, W16, W32] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          tmp <- getNewRegNat II64
          return $ Any (intFormat w) (\dst ->
                                       code_x `appOL`
                                       truncateReg w W64 reg_x `snocOL`
                                       annExpr expr (MOV (OpReg W64 tmp) (OpImm (ImmInteger n))) `snocOL`
                                       XOR (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 tmp)
                                     )

        CmmLit (CmmInt n _) -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          tmp <- getNewRegNat II64
          return $ Any (intFormat w) (\dst ->
                                        code_x `snocOL`
                                        annExpr expr (MOV (OpReg W64 tmp) (OpImm (ImmInteger  n))) `snocOL`
                                        XOR (OpReg W64 dst) (OpReg w reg_x) (OpReg W64 tmp)
                                     )

        _ | w `elem` [W8, W16, W32] -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `appOL`
                                        truncateReg w W64 reg_x `appOL`
                                        truncateReg w W64 reg_y `snocOL`
                                        annExpr expr (XOR (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                     )

        _ -> do
          (reg_x, _format_x, code_x) <- getSomeReg x
          (reg_y, _format_y, code_y) <- getSomeReg y
          return $ Any (intFormat w) (\dst ->
                                        code_x `appOL`
                                        code_y `snocOL`
                                        annExpr expr (XOR (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                     )

    -- CSET commands register operand being W64.
    CmmMachOp (MO_Eq w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      signExtend w W64 reg_x reg_x `appOL`
                                      signExtend w W64 reg_y reg_y `snocOL`
                                      annExpr expr (CSET EQ (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
       | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET EQ (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_Ne w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      signExtend w W64 reg_x reg_x `appOL`
                                      signExtend w W64 reg_y reg_y `snocOL`
                                      annExpr expr (CSET NE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET NE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_S_Lt w) [x, CmmLit (CmmInt n _)]
      | w `elem` [W8, W16, W32]
      , fitsInNbits 12 (fromIntegral n) -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      signExtend w W64 reg_x reg_x `snocOL`
                                      annExpr expr (SSLT (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n)))
                                   )
      | fitsInNbits 12 (fromIntegral n) -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        return $ Any (intFormat w) ( \dst -> code_x `snocOL` annExpr expr (SSLT (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n))))

    CmmMachOp (MO_U_Lt w) [x, CmmLit (CmmInt n _)]
      | w `elem` [W8, W16, W32]
      , fitsInNbits 12 (fromIntegral n) -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      truncateReg w W64 reg_x `snocOL`
                                      annExpr expr (SSLTU (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger n)))
                                   )
      | fitsInNbits 12 (fromIntegral n) -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        return $ Any (intFormat w) ( \dst -> code_x `snocOL` annExpr expr (SSLTU (OpReg W64 dst) (OpReg W64 reg_x) (OpImm (ImmInteger  n))))

    CmmMachOp (MO_S_Lt w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      signExtend w W64 reg_x reg_x `appOL`
                                      signExtend w W64 reg_y reg_y `snocOL`
                                      annExpr expr (CSET SLT (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET SLT (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_S_Le w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      signExtend w W64 reg_x reg_x `appOL`
                                      signExtend w W64 reg_y reg_y `snocOL`
                                      annExpr expr (CSET SLE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET SLE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_S_Ge w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      signExtend w W64 reg_x reg_x `appOL`
                                      signExtend w W64 reg_y reg_y `snocOL`
                                      annExpr expr (CSET SGE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET SGE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_S_Gt w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      signExtend w W64 reg_x reg_x `appOL`
                                      signExtend w W64 reg_y reg_y `snocOL`
                                      annExpr expr (CSET SGT (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET SGT (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_U_Lt w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      truncateReg w W64 reg_x `appOL`
                                      truncateReg w W64 reg_y `snocOL`
                                      annExpr expr (CSET ULT (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET ULT (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_U_Le w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      truncateReg w W64 reg_x `appOL`
                                      truncateReg w W64 reg_y `snocOL`
                                      annExpr expr (CSET ULE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET ULE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_U_Ge w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      truncateReg w W64 reg_x `appOL`
                                      truncateReg w W64 reg_y `snocOL`
                                      annExpr expr (CSET UGE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET UGE (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )

    CmmMachOp (MO_U_Gt w) [x, y]
      | w `elem` [W8, W16, W32] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `appOL`
                                      truncateReg w W64 reg_x `appOL`
                                      truncateReg w W64 reg_y `snocOL`
                                      annExpr expr (CSET UGT (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )
      | otherwise -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        return $ Any (intFormat w) ( \dst ->
                                      code_x `appOL`
                                      code_y `snocOL`
                                      annExpr expr (CSET UGT (OpReg W64 dst) (OpReg W64 reg_x) (OpReg W64 reg_y))
                                   )


    -- Generic binary case.
    CmmMachOp op [x, y] -> do
      let
          -- A (potentially signed) integer operation.
          -- In the case of 8-, 16- and 32-bit signed arithmetic we must first
          -- sign-extend all arguments to 64-bits.
          -- TODO: can be simplified.
          intOp is_signed w op = do
              -- compute x<m> <- x
              -- compute x<o> <- y
              -- <OP> x<n>, x<m>, x<o>
              (reg_x, format_x, code_x) <- getSomeReg x
              (reg_y, format_y, code_y) <- getSomeReg y
              massertPpr (isIntFormat format_x && isIntFormat format_y) $ text "intOp: non-int"
              let w' = W64
              -- This is the width of the registers on which the operation
              -- should be performed.
              if not is_signed
                then return $ Any (intFormat w) $ \dst ->
                      code_x `appOL`
                      code_y `appOL`
                      -- zero-extend both operands
                      truncateReg (formatToWidth format_x) w' reg_x `appOL`
                      truncateReg (formatToWidth format_y) w' reg_y `snocOL`
                      op (OpReg w' dst) (OpReg w' reg_x) (OpReg w' reg_y) `appOL`
                      truncateReg w' w dst -- truncate back to the operand's original width
                else return $ Any (intFormat w) $ \dst ->
                      code_x `appOL`
                      code_y `appOL`
                      -- sign-extend both operands
                      signExtend (formatToWidth format_x) W64 reg_x reg_x `appOL`
                      signExtend (formatToWidth format_x) W64 reg_y reg_y `snocOL`
                      op (OpReg w' dst) (OpReg w' reg_x) (OpReg w' reg_y) `appOL`
                      truncateReg w' w dst -- truncate back to the operand's original width

          floatOp w op = do
            (reg_fx, format_x, code_fx) <- getFloatReg x
            (reg_fy, format_y, code_fy) <- getFloatReg y
            massertPpr (isFloatFormat format_x && isFloatFormat format_y) $ text "floatOp: non-float"
            return $ Any (floatFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy))

          -- need a special one for conditionals, as they return ints
          floatCond w op = do
            (reg_fx, format_x, code_fx) <- getFloatReg x
            (reg_fy, format_y, code_fy) <- getFloatReg y
            massertPpr (isFloatFormat format_x && isFloatFormat format_y) $ text "floatCond: non-float"
            return $ Any (intFormat w) (\dst -> code_fx `appOL` code_fy `appOL` op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy))

      case op of
        -- Integer operations
        -- Add/Sub should only be Integer Options.
        MO_Add w -> intOp False w (\d x y ->  annExpr expr (ADD d x y))
        MO_Sub w -> intOp False w (\d x y ->  annExpr expr (SUB d x y))

        -- Signed multiply/divide/remain
        MO_Mul w          -> intOp True w (\d x y -> annExpr expr (MUL d x y))
        MO_S_MulMayOflo w -> do_mul_may_oflo w x y

        MO_S_Quot w  -> intOp True w (\d x y -> annExpr expr (DIV d x y))
        MO_S_Rem w   -> intOp True w (\d x y -> annExpr expr (MOD d x y))

        -- Unsigned divide/remain
        MO_U_Quot w  -> intOp False w (\d x y -> annExpr expr (DIVU d x y))
        MO_U_Rem w   -> intOp False w (\d x y -> annExpr expr (MODU d x y))

        -- Floating point arithmetic
        MO_F_Add w   -> floatOp w (\d x y -> unitOL $ annExpr expr (ADD d x y))
        MO_F_Sub w   -> floatOp w (\d x y -> unitOL $ annExpr expr (SUB d x y))
        MO_F_Mul w   -> floatOp w (\d x y -> unitOL $ annExpr expr (MUL d x y))
        MO_F_Quot w  -> floatOp w (\d x y -> unitOL $ annExpr expr (DIV d x y))
        MO_F_Min w   -> floatOp w (\d x y -> unitOL $ annExpr expr (FMIN d x y))
        MO_F_Max w   -> floatOp w (\d x y -> unitOL $ annExpr expr (FMAX d x y))

        -- Floating point comparison
        MO_F_Eq w    -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET EQ d x y))
        MO_F_Ne w    -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET NE d x y))
        MO_F_Ge w    -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET FGE d x y))
        MO_F_Le w    -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET FLE d x y))
        MO_F_Gt w    -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET FGT d x y))
        MO_F_Lt w    -> floatCond w (\d x y -> unitOL $ annExpr expr (CSET FLT d x y))

        op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ pprMachOp op <+> text "in" <+> pdoc plat expr

    -- Generic ternary case.
    CmmMachOp op [x, y, z] ->
      case op of

        -- Floating-point fused multiply-add operations
        MO_FMA var l w
          | l == 1
          -> case var of
            FMAdd  -> float3Op w (\d n m a -> unitOL $ FMA FMAdd  d n m a)
            FMSub  -> float3Op w (\d n m a -> unitOL $ FMA FMSub d n m a)
            FNMAdd -> float3Op w (\d n m a -> unitOL $ FMA FNMSub  d n m a)
            FNMSub -> float3Op w (\d n m a -> unitOL $ FMA FNMAdd d n m a)
          | otherwise
          -> sorry "The RISCV64 backend does not (yet) support vectors."

        _ -> pprPanic "getRegister' (unhandled ternary CmmMachOp): " $ (pprMachOp op) <+> text "in" <+> (pdoc plat expr)

      where
          float3Op w op = do
            (reg_fx, format_x, code_fx) <- getFloatReg x
            (reg_fy, format_y, code_fy) <- getFloatReg y
            (reg_fz, format_z, code_fz) <- getFloatReg z
            massertPpr (isFloatFormat format_x && isFloatFormat format_y && isFloatFormat format_z) $
              text "float3Op: non-float"
            pure $
              Any (floatFormat w) $ \ dst ->
                code_fx `appOL`
                code_fy `appOL`
                code_fz `appOL`
                op (OpReg w dst) (OpReg w reg_fx) (OpReg w reg_fy) (OpReg w reg_fz)

    CmmMachOp _op _xs
      -> pprPanic "getRegister' (variadic CmmMachOp): " (pdoc plat expr)

  where
    -- N.B. MUL does not set the overflow flag.
    -- Return 0 when the operation cannot overflow, /= 0 otherwise
    do_mul_may_oflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
    do_mul_may_oflo W64 x y = do
      (reg_x, _format_x, code_x) <- getSomeReg x
      (reg_y, _format_y, code_y) <- getSomeReg y
      lo <- getNewRegNat II64
      hi <- getNewRegNat II64
      return $ Any (intFormat W64) (\dst ->
        code_x `appOL`
        code_y `snocOL`
        MULH (OpReg W64 hi) (OpReg W64 reg_x)  (OpReg W64 reg_y) `snocOL`
        MUL  (OpReg W64 lo) (OpReg W64 reg_x)  (OpReg W64 reg_y) `snocOL`
        SRA  (OpReg W64 lo) (OpReg W64 lo)     (OpImm (ImmInt 63)) `snocOL`
        CSET NE (OpReg W64 dst) (OpReg W64 hi)  (OpReg W64 lo)
                                 )

    do_mul_may_oflo W32 x y = do
        (reg_x, _format_x, code_x) <- getSomeReg x
        (reg_y, _format_y, code_y) <- getSomeReg y
        tmp1 <- getNewRegNat II64
        tmp2 <- getNewRegNat II64
        return $ Any (intFormat W32) (\dst ->
            code_x `appOL`
            code_y `snocOL`
            MULW (OpReg W64 tmp1) (OpReg W64 reg_x) (OpReg W64 reg_y) `snocOL`
            ADD (OpReg W64 tmp2) (OpReg W32 tmp1) (OpImm (ImmInt 0)) `snocOL`
            CSET NE (OpReg W64 dst) (OpReg W64 tmp1)  (OpReg W64 tmp2)
                                     )

    -- General case
    do_mul_may_oflo w x y = do
      -- Assert: 8bit * 8bit cannot overflow 16bit, and so on.
      (reg_x, format_x, code_x) <- getSomeReg x
      (reg_y, format_y, code_y) <- getSomeReg y
      tmp1 <- getNewRegNat II64
      tmp2 <- getNewRegNat II64
      let width_x = formatToWidth format_x
          width_y = formatToWidth format_y
          extend dst src =
            case w of
              W8  -> SLL (OpReg W64 dst) (OpReg W32 src) (OpImm (ImmInt 0))
              W16 -> SLL (OpReg W64 dst) (OpReg W32 src) (OpImm (ImmInt 0))
              _   -> panic "Must be in [W8, W16, W32]!"
          extract width dst src =
            case width of
              W8  -> EXT (OpReg W64 dst) (OpReg W8 src)
              W16 -> EXT (OpReg W64 dst) (OpReg W16 src)
              W32 -> SLL (OpReg W64 dst) (OpReg W32 src) (OpImm (ImmInt 0))
              _   -> panic "Must be in [W8, W16, W32]!"

      case w of
        w | (width_x < w) && (width_y < w) ->
          return $ Any (intFormat w) ( \dst ->
            unitOL $ annExpr expr (MOV (OpReg w dst) (OpImm (ImmInt 0)))
                                     )
        w | w <= W32 && width_x <= W32 && width_y <= W32 ->
            return $ Any (intFormat W32) (\dst ->
                code_x `appOL`
                code_y `appOL`
                -- signExtend [W8, W16] register to W64 and then SLL
                -- nil for W32
                signExtend (formatToWidth format_x) W64 reg_x reg_x `appOL`
                signExtend (formatToWidth format_y) W64 reg_y reg_y `snocOL`
                extend reg_x reg_x `snocOL`
                extend reg_y reg_y `snocOL`
                -- 64-bits MUL
                MUL (OpReg W64 tmp1) (OpReg W64 reg_x) (OpReg W64 reg_y) `snocOL`
                -- extract valid result via result's width
                -- slli.w for W32, otherwise ext.w.[b, h]
                extract w tmp2 tmp1 `snocOL`
                CSET NE (OpReg W64 dst) (OpReg W64 tmp1)  (OpReg W64 tmp2)
                                        )

        -- Should it be happened?
        _ ->
          return $ Any (intFormat w) ( \dst ->
            unitOL $ annExpr expr (MOV (OpReg w dst) (OpImm (ImmInt 1))))

-- Sign-extend the value in the given register from width @w@
-- up to width @w'@.
-- TODO: Is there room for optimization?
signExtend :: Width -> Width -> Reg -> Reg -> OrdList Instr
signExtend w w' r r'
  | w > w' = pprPanic "Sign-extend Error: not a sign extension, but a truncation." $ ppr w <> text "->" <+> ppr w'
  | w > W64 || w' > W64  = pprPanic "Sign-extend Error: from/to register width greater than 64-bit." $ ppr w <> text "->" <+> ppr w'
  | w == W64 && w' == W64 && r == r' = nilOL
  | w == W32 && w' == W64 = unitOL $ SLL (OpReg W64 r') (OpReg w r) (OpImm (ImmInt 0))
  -- Sign-extend W8 and W16 to W64.
  | w `elem` [W8, W16] = unitOL $ EXT (OpReg W64 r') (OpReg w r)
  | w == w' = unitOL $ MOV (OpReg w' r') (OpReg w r)
  | otherwise = pprPanic "signExtend: Unexpected width: " $ ppr w  <> text "->" <+> ppr w'

-- | Instructions to truncate the value in the given register from width @w@
-- down to width @w'@.
truncateReg :: Width -> Width -> Reg -> OrdList Instr
truncateReg w w' r
  | w > W64 || w' > W64  = pprPanic "Tructate Error: from/to register width greater than 64-bit." $ ppr w <> text "->" <+> ppr w'
  | w == w' = nilOL
  | w /= w' = toOL
    [
      ann
        (text "truncateReg: " <+> ppr r <+> ppr w <> text "->" <> ppr w')
        (BSTRPICK II64 (OpReg w' r) (OpReg w r) (OpImm (ImmInt shift)) (OpImm (ImmInt 0)))
    ]
  | otherwise = pprPanic "truncateReg: Unexpected width: " $ ppr w  <> text "->" <+> ppr w'
  where
    shift = (min (widthInBits w) (widthInBits w')) - 1

--  The 'Amode' type: Memory addressing modes passed up the tree.
data Amode = Amode AddrMode InstrBlock

-- | Provide the value of a `CmmExpr` with an `Amode`
--  N.B. this function should be used to provide operands to load and store
--  instructions with signed 12bit wide immediates (S & I types). For other
--  immediate sizes and formats (e.g. B type uses multiples of 2) this function
--  would need to be adjusted.
getAmode :: Platform
         -> Width     -- ^ width of loaded value
         -> CmmExpr
         -> NatM Amode

-- LD/ST: Immediate can be represented with 12bits
getAmode platform w (CmmRegOff reg off)
  | w <= W64, fitsInNbits 12 (fromIntegral off)
  = return $ Amode (AddrRegImm reg' off') nilOL
    where reg' = getRegisterReg platform reg
          off' = ImmInt off

-- For Stores we often see something like this:
-- CmmStore (CmmMachOp (MO_Add w) [CmmLoad expr, CmmLit (CmmInt n w')]) (expr2)
-- E.g. a CmmStoreOff really. This can be translated to `str $expr2, [$expr, #n ]
-- for `n` in range.
getAmode _platform _ (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')])
  | fitsInNbits 12 (fromIntegral off)
  = do (reg, _format, code) <- getSomeReg expr
       return $ Amode (AddrRegImm reg (ImmInteger off)) code

getAmode _platform _ (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')])
  | fitsInNbits 12 (fromIntegral (-off))
  = do (reg, _format, code) <- getSomeReg expr
       return $ Amode (AddrRegImm reg (ImmInteger (-off))) code

-- Generic case
getAmode _platform _ expr
  = do (reg, _format, code) <- getSomeReg expr
       return $ Amode (AddrReg reg) code

-- -----------------------------------------------------------------------------
-- Generating assignments

-- Assignments are really at the heart of the whole code generation
-- business.  Almost all top-level nodes of any real importance are
-- assignments, which correspond to loads, stores, or register
-- transfers.  If we're really lucky, some of the register transfers
-- will go away, because we can use the destination register to
-- complete the code generation for the right hand side.  This only
-- fails when the right hand side is forced into a fixed register
-- (e.g. the result of a call).

assignMem_IntCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
assignReg_IntCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock

assignMem_FltCode :: Format -> CmmExpr -> CmmExpr -> NatM InstrBlock
assignReg_FltCode :: Format -> CmmReg  -> CmmExpr -> NatM InstrBlock

assignMem_IntCode rep addrE srcE
  = do
    (src_reg, _format, code) <- getSomeReg srcE
    platform <- getPlatform
    let w = formatToWidth rep
    Amode addr addr_code <- getAmode platform w addrE
    return $ COMMENT (text "CmmStore" <+> parens (text (show addrE)) <+> parens (text (show srcE)))
            `consOL` (code
            `appOL`   addr_code
            `snocOL`  ST rep (OpReg w src_reg) (OpAddr addr)
                     )

assignReg_IntCode _ reg src
  = do
    platform <- getPlatform
    let dst = getRegisterReg platform reg
    r <- getRegister src
    return $ case r of
      Any _ code              -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL` code dst
      Fixed format freg fcode -> COMMENT (text "CmmAssign" <+> parens (text (show reg)) <+> parens (text (show src))) `consOL`
                                               (fcode `snocOL`
                                                 MOV (OpReg (formatToWidth format) dst) (OpReg (formatToWidth format) freg)
                                               )

-- Let's treat Floating point stuff
-- as integer code for now. Opaque.
assignMem_FltCode = assignMem_IntCode
assignReg_FltCode = assignReg_IntCode

-- Jumps
genJump :: CmmExpr{-the branch target-} -> NatM InstrBlock
genJump expr = do
  case expr of
    (CmmLit (CmmLabel lbl)) -> do
      return $ unitOL (annExpr expr (TAIL36 (OpReg W64 tmpReg) (TLabel lbl)))
    (CmmLit (CmmBlock bid)) -> do
      return $ unitOL (annExpr expr (TAIL36 (OpReg W64 tmpReg) (TBlock bid)))
    _ -> do
      (target, _format, code) <- getSomeReg expr
      -- I'd like to do more.
      return $ COMMENT (text "genJump for unknow expr: " <+> (text (show expr))) `consOL`
        (code `appOL`
          unitOL (annExpr expr (J (TReg target)))
        )

-- -----------------------------------------------------------------------------
--  Unconditional branches
genBranch :: BlockId -> NatM InstrBlock
genBranch = return . toOL . mkJumpInstr

-- -----------------------------------------------------------------------------
-- Conditional branches
genCondJump
    :: BlockId
    -> CmmExpr
    -> NatM InstrBlock
genCondJump bid expr = do
    case expr of
      -- Optimized == 0 case.
      CmmMachOp (MO_Eq W64) [x, CmmLit (CmmInt 0 _)] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        return $
          code_x `snocOL`
          BEQZ (OpReg W64 reg_x) (TBlock bid)
      CmmMachOp (MO_Eq w) [x, CmmLit (CmmInt 0 _)]
        | w `elem` [W8, W16, W32] -> do
        (reg_x, format_x, code_x) <- getSomeReg x
        return $
          code_x `appOL`
          signExtend (formatToWidth format_x) W64 reg_x reg_x `snocOL`
          BEQZ (OpReg W64 reg_x) (TBlock bid)

      -- Optimized /= 0 case.
      CmmMachOp (MO_Ne W64) [x, CmmLit (CmmInt 0 _)] -> do
        (reg_x, _format_x, code_x) <- getSomeReg x
        return $ code_x `snocOL` (annExpr expr (BNEZ (OpReg W64 reg_x) (TBlock bid)))
      CmmMachOp (MO_Ne w) [x, CmmLit (CmmInt 0 _)]
        | w `elem` [W8, W16, W32] -> do
        (reg_x, format_x, code_x) <- getSomeReg x
        return $
          code_x `appOL`
          signExtend (formatToWidth format_x) W64 reg_x reg_x `snocOL`
          BNEZ (OpReg W64 reg_x) (TBlock bid)

      -- Generic case.
      CmmMachOp mop [x, y] -> do
        let ubcond w cmp = do
              (reg_x, format_x, code_x) <- getSomeReg x
              (reg_y, format_y, code_y) <- getSomeReg y
              return $ case w of
                w | w `elem` [W8, W16, W32] ->
                    code_x `appOL`
                    truncateReg (formatToWidth format_x) W64 reg_x  `appOL`
                    code_y `appOL`
                    truncateReg (formatToWidth format_y) W64 reg_y  `snocOL`
                    BCOND1 cmp (OpReg W64 reg_x) (OpReg W64 reg_y) (TBlock bid)
                _ ->
                    code_x `appOL`
                    code_y `snocOL`
                    BCOND1 cmp (OpReg W64 reg_x) (OpReg W64 reg_y) (TBlock bid)

            sbcond w cmp = do
              (reg_x, format_x, code_x) <- getSomeReg x
              (reg_y, format_y, code_y) <- getSomeReg y
              return $ case w of
                w | w `elem` [W8, W16, W32] ->
                  code_x `appOL`
                  signExtend (formatToWidth format_x) W64 reg_x reg_x `appOL`
                  code_y `appOL`
                  signExtend (formatToWidth format_y) W64 reg_y reg_y `snocOL`
                  BCOND1 cmp (OpReg W64 reg_x) (OpReg W64 reg_y) (TBlock bid)
                _ ->
                  code_x `appOL`
                  code_y `snocOL`
                  BCOND1 cmp (OpReg W64 reg_x) (OpReg W64 reg_y) (TBlock bid)

            fbcond w cmp = do
              (reg_fx, _format_fx, code_fx) <- getFloatReg x
              (reg_fy, _format_fy, code_fy) <- getFloatReg y
              rst <- OpReg W64 <$> getNewRegNat II64
              oneReg <- OpReg W64 <$> getNewRegNat II64
              return $
                code_fx `appOL`
                code_fy `snocOL`
                CSET cmp rst (OpReg w reg_fx) (OpReg w reg_fy) `snocOL`
                MOV oneReg (OpImm (ImmInt 1)) `snocOL`
                BCOND1 EQ rst oneReg (TBlock bid)


        case mop of
          MO_F_Eq w -> fbcond w EQ
          MO_F_Ne w -> fbcond w NE
          MO_F_Gt w -> fbcond w FGT
          MO_F_Ge w -> fbcond w FGE
          MO_F_Lt w -> fbcond w FLT
          MO_F_Le w -> fbcond w FLE
          MO_Eq w   -> sbcond w EQ
          MO_Ne w   -> sbcond w NE
          MO_S_Gt w -> sbcond w SGT
          MO_S_Ge w -> sbcond w SGE
          MO_S_Lt w -> sbcond w SLT
          MO_S_Le w -> sbcond w SLE
          MO_U_Gt w -> ubcond w UGT
          MO_U_Ge w -> ubcond w UGE
          MO_U_Lt w -> ubcond w ULT
          MO_U_Le w -> ubcond w ULE
          _ -> pprPanic "LA64.genCondJump:case mop: " (text $ show expr)

      _ -> pprPanic "LA64.genCondJump: " (text $ show expr)

-- | Generate conditional branching instructions
-- This is basically an "if with else" statement.
genCondBranch ::
  BlockId ->
  BlockId ->
  CmmExpr ->
  NatM InstrBlock
genCondBranch true false expr = do
  b1 <- genCondJump true expr
  b2 <- genBranch false
  return (b1 `appOL` b2)

-- -----------------------------------------------------------------------------
{-
Generating C calls

Generate a call to a C function:

GARs: 8 general-purpose registers $a0 - $a7, where $a0 and $a1 are also used for
integral values.
FARs: 8 floating-point registers $fa0 - $fa7, where $fa0 and $fa1 are also used
for returning values.

An argument is passed using the stack only when no appropriate argument register
is available.

Subroutines should ensure that the initial values of the general-purpose registers
$s0 - $s9 and floating-point registers $fs0 - $fs7 are preserved across the call.

At the entry of a procedure call, the return address of the call site is stored
in $ra. A branch jump to this address should be the last instruction executed in
the called procedure.

The on-stack part of the structure and scalar arguments are aligned to the greater
of the type alignment and GRLEN bits, except when this alignment is larger than
 the 16-byte stack alignment. In this case, the part of the argument should be
16-byte-aligned.

In a procedure call, GARs / FARs are generally only used for passing non-floating
-point / floating-point argument data, respectively. However, the floating-point
member of a structure or union argument, or a vector/floating-point argument
wider than FRLEN may be passed in a GAR.
-}

genCCall
    :: ForeignTarget      -- function to call
    -> [CmmFormal]        -- where to put the result
    -> [CmmActual]        -- arguments (of mixed type)
    -> NatM InstrBlock

-- TODO: Specialize where we can.
-- Generic impl
genCCall target dest_regs arg_regs = do
  case target of
    -- The target :: ForeignTarget call can either
    -- be a foreign procedure with an address expr
    -- and a calling convention.
    ForeignTarget expr _cconv -> do
      (call_target, call_target_code) <- case expr of
        -- if this is a label, let's just directly to it.
        (CmmLit (CmmLabel lbl)) -> pure (TLabel lbl, nilOL)
        -- if it's not a label, let's compute the expression into a
        -- register and jump to that.
        _ -> do
          (reg, _format, reg_code) <- getSomeReg expr
          pure (TReg reg, reg_code)
      -- compute the code and register logic for all arg_regs.
      -- this will give us the format information to match on.
      arg_regs' <- mapM getSomeReg arg_regs

      -- Now this is stupid.  Our Cmm expressions doesn't carry the proper sizes
      -- so while in Cmm we might get W64 incorrectly for an int, that is W32 in
      -- STG; this thenn breaks packing of stack arguments, if we need to pack
      -- for the pcs, e.g. darwinpcs.  Option one would be to fix the Int type
      -- in Cmm proper. Option two, which we choose here is to use extended Hint
      -- information to contain the size information and use that when packing
      -- arguments, spilled onto the stack.
      let (_res_hints, arg_hints) = foreignTargetHints target
          arg_regs'' = zipWith (\(r, f, c) h -> (r,f,h,c)) arg_regs' arg_hints

      (stackSpaceWords, passRegs, passArgumentsCode) <- passArguments allGpArgRegs allFpArgRegs arg_regs'' 0 [] nilOL

      readResultsCode <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL

      let moveStackDown 0 = toOL [ PUSH_STACK_FRAME
                                 , DELTA (-16)
                                 ]
          moveStackDown i | odd i = moveStackDown (i + 1)
          moveStackDown i = toOL [ PUSH_STACK_FRAME
                                 , SUB (OpReg W64 (spMachReg)) (OpReg W64 (spMachReg)) (OpImm (ImmInt (8 * i)))
                                 , DELTA (-8 * i - 16)
                                 ]
          moveStackUp 0 = toOL [ POP_STACK_FRAME
                               , DELTA 0
                               ]
          moveStackUp i | odd i = moveStackUp (i + 1)
          moveStackUp i = toOL [ ADD (OpReg W64 (spMachReg)) (OpReg W64 (spMachReg)) (OpImm (ImmInt (8 * i)))
                               , POP_STACK_FRAME
                               , DELTA 0
                               ]

      let code =
            call_target_code -- compute the label (possibly into a register)
              `appOL` moveStackDown (stackSpaceWords)
              `appOL` passArgumentsCode -- put the arguments into x0, ...
              `snocOL` CALL call_target passRegs -- branch and link (C calls aren't tail calls, but return)
              `appOL` readResultsCode -- parse the results into registers
              `appOL` moveStackUp (stackSpaceWords)
      return code

    PrimTarget MO_F32_Fabs
      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
        unaryFloatOp W32 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
      | otherwise -> panic "mal-formed MO_F32_Fabs"
    PrimTarget MO_F64_Fabs
      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
        unaryFloatOp W64 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
      | otherwise -> panic "mal-formed MO_F64_Fabs"

    PrimTarget MO_F32_Sqrt
      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
        unaryFloatOp W32 (\d x -> unitOL $ FSQRT d x) arg_reg dest_reg
      | otherwise -> panic "mal-formed MO_F32_Sqrt"
    PrimTarget MO_F64_Sqrt
      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
        unaryFloatOp W64 (\d x -> unitOL $ FSQRT d x) arg_reg dest_reg
      | otherwise -> panic "mal-formed MO_F64_Sqrt"

    PrimTarget (MO_Clz w)
      | w `elem` [W32, W64],
      [arg_reg] <- arg_regs,
      [dest_reg] <- dest_regs -> do
      platform <- getPlatform
      (reg_x, _format_x, code_x) <- getSomeReg arg_reg
      let dst_reg = getRegisterReg platform (CmmLocal dest_reg)
      return ( code_x `snocOL`
               CLZ (OpReg w dst_reg) (OpReg w reg_x)
             )
      | w `elem` [W8, W16],
      [arg_reg] <- arg_regs,
      [dest_reg] <- dest_regs -> do
        platform <- getPlatform
        (reg_x, _format_x, code_x) <- getSomeReg arg_reg
        let dst_reg = getRegisterReg platform (CmmLocal dest_reg)
        return ( code_x `appOL` toOL
                 [
                  MOV (OpReg W64 dst_reg) (OpImm (ImmInt 1)),
                  SLL (OpReg W64 dst_reg) (OpReg W64 dst_reg) (OpImm (ImmInt (31-shift))),
                  SLL (OpReg W64 reg_x) (OpReg W64 reg_x) (OpImm (ImmInt (32-shift))),
                  OR (OpReg W64 dst_reg) (OpReg W64 dst_reg) (OpReg W64 reg_x),
                  CLZ (OpReg W64 dst_reg) (OpReg W32 dst_reg)
                 ]
               )
      | otherwise -> unsupported (MO_Clz w)
      where
        shift = widthToInt w

    PrimTarget (MO_Ctz w)
      | w `elem` [W32, W64],
      [arg_reg] <- arg_regs,
      [dest_reg] <- dest_regs -> do
      platform <- getPlatform
      (reg_x, _format_x, code_x) <- getSomeReg arg_reg
      let dst_reg = getRegisterReg platform (CmmLocal dest_reg)
      return ( code_x `snocOL`
               CTZ (OpReg w dst_reg) (OpReg w reg_x)
             )
      | w `elem` [W8, W16],
      [arg_reg] <- arg_regs,
      [dest_reg] <- dest_regs -> do
      platform <- getPlatform
      (reg_x, _format_x, code_x) <- getSomeReg arg_reg
      let dst_reg = getRegisterReg platform (CmmLocal dest_reg)
      return ( code_x `appOL` toOL
               [
                MOV (OpReg W64 dst_reg) (OpImm (ImmInt 1)),
                SLL (OpReg W64 dst_reg) (OpReg W64 dst_reg) (OpImm (ImmInt shift)),
                BSTRPICK II64 (OpReg W64 reg_x) (OpReg W64 reg_x) (OpImm (ImmInt (shift-1))) (OpImm (ImmInt 0)),
                OR  (OpReg W64 dst_reg) (OpReg W64 dst_reg) (OpReg W64 reg_x),
                CTZ (OpReg W64 dst_reg) (OpReg W64 dst_reg)
               ]
             )
      | otherwise -> unsupported (MO_Ctz w)
      where
        shift = (widthToInt w)

    -- mop :: CallishMachOp (see GHC.Cmm.MachOp)
    PrimTarget mop -> do
      -- We'll need config to construct forien targets
      case mop of
        -- 64 bit float ops
        MO_F64_Pwr   -> mkCCall "pow"

        MO_F64_Sin   -> mkCCall "sin"
        MO_F64_Cos   -> mkCCall "cos"
        MO_F64_Tan   -> mkCCall "tan"

        MO_F64_Sinh  -> mkCCall "sinh"
        MO_F64_Cosh  -> mkCCall "cosh"
        MO_F64_Tanh  -> mkCCall "tanh"

        MO_F64_Asin  -> mkCCall "asin"
        MO_F64_Acos  -> mkCCall "acos"
        MO_F64_Atan  -> mkCCall "atan"

        MO_F64_Asinh -> mkCCall "asinh"
        MO_F64_Acosh -> mkCCall "acosh"
        MO_F64_Atanh -> mkCCall "atanh"

        MO_F64_Log   -> mkCCall "log"
        MO_F64_Log1P -> mkCCall "log1p"
        MO_F64_Exp   -> mkCCall "exp"
        MO_F64_ExpM1 -> mkCCall "expm1"

        -- 32 bit float ops
        MO_F32_Pwr   -> mkCCall "powf"

        MO_F32_Sin   -> mkCCall "sinf"
        MO_F32_Cos   -> mkCCall "cosf"
        MO_F32_Tan   -> mkCCall "tanf"
        MO_F32_Sinh  -> mkCCall "sinhf"
        MO_F32_Cosh  -> mkCCall "coshf"
        MO_F32_Tanh  -> mkCCall "tanhf"
        MO_F32_Asin  -> mkCCall "asinf"
        MO_F32_Acos  -> mkCCall "acosf"
        MO_F32_Atan  -> mkCCall "atanf"
        MO_F32_Asinh -> mkCCall "asinhf"
        MO_F32_Acosh -> mkCCall "acoshf"
        MO_F32_Atanh -> mkCCall "atanhf"
        MO_F32_Log   -> mkCCall "logf"
        MO_F32_Log1P -> mkCCall "log1pf"
        MO_F32_Exp   -> mkCCall "expf"
        MO_F32_ExpM1 -> mkCCall "expm1f"

        -- 64-bit primops
        MO_I64_ToI   -> mkCCall "hs_int64ToInt"
        MO_I64_FromI -> mkCCall "hs_intToInt64"
        MO_W64_ToW   -> mkCCall "hs_word64ToWord"
        MO_W64_FromW -> mkCCall "hs_wordToWord64"
        MO_x64_Neg   -> mkCCall "hs_neg64"
        MO_x64_Add   -> mkCCall "hs_add64"
        MO_x64_Sub   -> mkCCall "hs_sub64"
        MO_x64_Mul   -> mkCCall "hs_mul64"
        MO_I64_Quot  -> mkCCall "hs_quotInt64"
        MO_I64_Rem   -> mkCCall "hs_remInt64"
        MO_W64_Quot  -> mkCCall "hs_quotWord64"
        MO_W64_Rem   -> mkCCall "hs_remWord64"
        MO_x64_And   -> mkCCall "hs_and64"
        MO_x64_Or    -> mkCCall "hs_or64"
        MO_x64_Xor   -> mkCCall "hs_xor64"
        MO_x64_Not   -> mkCCall "hs_not64"
        MO_x64_Shl   -> mkCCall "hs_uncheckedShiftL64"
        MO_I64_Shr   -> mkCCall "hs_uncheckedIShiftRA64"
        MO_W64_Shr   -> mkCCall "hs_uncheckedShiftRL64"
        MO_x64_Eq    -> mkCCall "hs_eq64"
        MO_x64_Ne    -> mkCCall "hs_ne64"
        MO_I64_Ge    -> mkCCall "hs_geInt64"
        MO_I64_Gt    -> mkCCall "hs_gtInt64"
        MO_I64_Le    -> mkCCall "hs_leInt64"
        MO_I64_Lt    -> mkCCall "hs_ltInt64"
        MO_W64_Ge    -> mkCCall "hs_geWord64"
        MO_W64_Gt    -> mkCCall "hs_gtWord64"
        MO_W64_Le    -> mkCCall "hs_leWord64"
        MO_W64_Lt    -> mkCCall "hs_ltWord64"

        -- Conversion
        MO_UF_Conv w        -> mkCCall (word2FloatLabel w)

        -- Optional MachOps
        -- These are enabled/disabled by backend flags: GHC.StgToCmm.Config
        MO_S_Mul2     _w -> unsupported mop
        MO_S_QuotRem  _w -> unsupported mop
        MO_U_QuotRem  _w -> unsupported mop
        MO_U_QuotRem2 _w -> unsupported mop
        MO_Add2       _w -> unsupported mop
        MO_AddWordC   _w -> unsupported mop
        MO_SubWordC   _w -> unsupported mop
        MO_AddIntC    _w -> unsupported mop
        MO_SubIntC    _w -> unsupported mop
        MO_U_Mul2     _w -> unsupported mop

        MO_VS_Quot {} -> unsupported mop
        MO_VS_Rem {}  -> unsupported mop
        MO_VU_Quot {} -> unsupported mop
        MO_VU_Rem {}  -> unsupported mop
        MO_I64X2_Min -> unsupported mop
        MO_I64X2_Max -> unsupported mop
        MO_W64X2_Min -> unsupported mop
        MO_W64X2_Max -> unsupported mop

        -- Memory Ordering
        -- A hint value of 0 is mandatory by default, and it indicates a fully functional synchronization barrier.
        -- Only after all previous load/store access operations are completely executed, the DBAR 0 instruction can be executed;
        -- and only after the execution of DBAR 0 is completed, all subsequent load/store access operations can be executed.

        MO_AcquireFence -> pure (unitOL (DBAR Hint0))
        MO_ReleaseFence -> pure (unitOL (DBAR Hint0))
        MO_SeqCstFence  -> pure (unitOL (DBAR Hint0))

        MO_Touch        -> pure nilOL -- Keep variables live (when using interior pointers)
        -- Prefetch
        MO_Prefetch_Data _n -> pure nilOL -- Prefetch hint.

        -- Memory copy/set/move/cmp, with alignment for optimization

        -- TODO Optimize and use e.g. quad registers to move memory around instead
        -- of offloading this to memcpy. For small memcpys we can utilize
        -- the 128bit quad registers in NEON to move block of bytes around.
        -- Might also make sense of small memsets? Use xzr? What's the function
        -- call overhead?
        MO_Memcpy  _align   -> mkCCall "memcpy"
        MO_Memset  _align   -> mkCCall "memset"
        MO_Memmove _align   -> mkCCall "memmove"
        MO_Memcmp  _align   -> mkCCall "memcmp"

        MO_SuspendThread    -> mkCCall "suspendThread"
        MO_ResumeThread     -> mkCCall "resumeThread"

        MO_PopCnt w         -> mkCCall (popCntLabel w)
        MO_Pdep w           -> mkCCall (pdepLabel w)
        MO_Pext w           -> mkCCall (pextLabel w)
        MO_BSwap w          -> mkCCall (bSwapLabel w)
        MO_BRev w           -> mkCCall (bRevLabel w)

    -- or a possibly side-effecting machine operation
        mo@(MO_AtomicRead w ord)
          | [p_reg] <- arg_regs
          , [dst_reg] <- dest_regs -> do
              (p, _fmt_p, code_p) <- getSomeReg p_reg
              platform <- getPlatform
              let instrs = case ord of
                      MemOrderRelaxed -> unitOL $ ann moDescr (LD (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p))

                      MemOrderAcquire -> toOL [
                                                ann moDescr (LD (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p)),
                                                DBAR Hint0
                                              ]
                      MemOrderSeqCst -> toOL [
                                                ann moDescr (DBAR Hint0),
                                                LD (intFormat w) (OpReg w dst) (OpAddr $ AddrReg p),
                                                DBAR Hint0
                                              ]
                      _ -> panic $ "Unexpected MemOrderRelease on an AtomicRead: " ++ show mo
                  dst = getRegisterReg platform (CmmLocal dst_reg)
                  moDescr = (text . show) mo
                  code = code_p `appOL` instrs
              pure code
          | otherwise -> panic "mal-formed AtomicRead"

        mo@(MO_AtomicWrite w ord)
          | [p_reg, val_reg] <- arg_regs -> do
              (p, _fmt_p, code_p) <- getSomeReg p_reg
              (val, fmt_val, code_val) <- getSomeReg val_reg
              let instrs = case ord of
                      MemOrderRelaxed -> unitOL $ ann moDescr (ST fmt_val (OpReg w val) (OpAddr $ AddrReg p))
                      MemOrderRelease -> toOL [
                                                ann moDescr (DBAR Hint0),
                                                ST fmt_val (OpReg w val) (OpAddr $ AddrReg p)
                                              ]
                      MemOrderSeqCst  -> toOL [
                                                ann moDescr (DBAR Hint0),
                                                ST fmt_val (OpReg w val) (OpAddr $ AddrReg p),
                                                DBAR Hint0
                                              ]
                      _ ->  panic $ "Unexpected MemOrderAcquire on an AtomicWrite" ++ show mo
                  moDescr = (text . show) mo
                  code =
                    code_p `appOL`
                    code_val `appOL`
                    instrs
              pure code
          | otherwise -> panic "mal-formed AtomicWrite"

        MO_AtomicRMW w amop -> mkCCall (atomicRMWLabel w amop)
        MO_Cmpxchg w        -> mkCCall (cmpxchgLabel w)
        MO_Xchg w           -> mkCCall (xchgLabel w)

  where
    unsupported :: Show a => a -> b
    unsupported mop = panic ("outOfLineCmmOp: " ++ show mop
                          ++ " not supported here")

    mkCCall :: FastString -> NatM InstrBlock
    mkCCall name = do
      config <- getConfig
      target <-
        cmmMakeDynamicReference config CallReference
          $ mkForeignLabel name ForeignLabelInThisPackage IsFunction
      let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn
      genCCall (ForeignTarget target cconv) dest_regs arg_regs

    -- Implementiation of the LoongArch ABI calling convention.
    -- https://github.com/loongson/la-abi-specs/blob/release/lapcs.adoc#passing-arguments
    passArguments :: [Reg] -> [Reg] -> [(Reg, Format, ForeignHint, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock)

    -- 1. Base case: no more arguments to pass (left)
    passArguments _ _ [] stackSpaceWords accumRegs accumCode = return (stackSpaceWords, accumRegs, accumCode)

    -- 2. Still have GP regs, and we want to pass an GP argument.
    passArguments (gpReg : gpRegs) fpRegs ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isIntFormat format = do
      let w = formatToWidth format
          ext
            -- Specifically, LoongArch64's ABI requires that the caller
            -- sign-extend arguments which are smaller than 64-bits.
            | w `elem` [W8, W16, W32]
            = case w of
              W8  -> EXT (OpReg W64 gpReg) (OpReg w r)
              W16 -> EXT (OpReg W64 gpReg) (OpReg w r)
              W32 -> SLL (OpReg W64 gpReg) (OpReg w r) (OpImm (ImmInt 0))
              _ -> panic "Unexpected width(Here w < W64)!"
            | otherwise
            = MOV (OpReg w gpReg) (OpReg w r)
          accumCode' = accumCode `appOL`
                          code_r `snocOL`
                          ann (text "Pass gp argument: " <> ppr r) ext

      passArguments gpRegs fpRegs args stackSpaceWords (gpReg : accumRegs) accumCode'

    -- 3. Still have FP regs, and we want to pass an FP argument.
    passArguments gpRegs (fpReg : fpRegs) ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isFloatFormat format = do
      let w = formatToWidth format
          mov = MOV (OpReg w fpReg) (OpReg w r)
          accumCode' = accumCode `appOL`
                       code_r `snocOL`
                       ann (text "Pass fp argument: " <> ppr r) mov

      passArguments gpRegs fpRegs args stackSpaceWords (fpReg : accumRegs) accumCode'

    -- 4. No mor regs left to pass. Must pass on stack.
    passArguments [] [] ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode = do
      let w = formatToWidth format
          spOffet = 8 * stackSpaceWords
          str = ST format (OpReg w r) (OpAddr (AddrRegImm spMachReg (ImmInt spOffet)))
          stackCode =
            code_r
              `snocOL` (MOV (OpReg w tmpReg) (OpReg w r))
              `appOL` truncateReg w W64 tmpReg
              `snocOL` ann (text "Pass signed argument (size " <> ppr w <> text ") on the stack: " <> ppr tmpReg) str

      passArguments [] [] args (stackSpaceWords + 1) accumRegs (stackCode `appOL` accumCode)

    -- 5. Still have fpRegs left, but want to pass a GP argument. Must be passed on the stack then.
    passArguments [] fpRegs ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isIntFormat format = do
      let w = formatToWidth format
          spOffet = 8 * stackSpaceWords
          str = ST format (OpReg w r) (OpAddr (AddrRegImm spMachReg (ImmInt spOffet)))
          stackCode =
            code_r
              `snocOL` ann (text "Pass argument (size " <> ppr w <> text ") on the stack: " <> ppr r) str

      passArguments [] fpRegs args (stackSpaceWords + 1) accumRegs (stackCode `appOL` accumCode)

   -- 6. Still have gpRegs left, but want to pass a FP argument. Must be passed in gpReg then.
    passArguments (gpReg : gpRegs) [] ((r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode | isFloatFormat format = do
      let w = formatToWidth format
          mov = MOV (OpReg w gpReg) (OpReg w r)
          accumCode' = accumCode `appOL`
                       code_r `snocOL`
                       ann (text "Pass fp argument in gpReg: " <> ppr r) mov

      passArguments gpRegs [] args stackSpaceWords (gpReg : accumRegs) accumCode'


    passArguments _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")

    readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg] -> InstrBlock -> NatM InstrBlock
    readResults _ _ [] _ accumCode = return accumCode
    readResults [] _ _ _ _ = do
      platform <- getPlatform
      pprPanic "genCCall, out of gp registers when reading results" (pdoc platform target)
    readResults _ [] _ _ _ = do
      platform <- getPlatform
      pprPanic "genCCall, out of fp registers when reading results" (pdoc platform target)
    readResults (gpReg:gpRegs) (fpReg:fpRegs) (dst:dsts) accumRegs accumCode = do
      -- gp/fp reg -> dst
      platform <- getPlatform
      let rep = cmmRegType (CmmLocal dst)
          format = cmmTypeFormat rep
          w = cmmRegWidth (CmmLocal dst)
          r_dst = getRegisterReg platform (CmmLocal dst)
      if isFloatFormat format
        then readResults (gpReg : gpRegs) fpRegs dsts (fpReg : accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg))
        else
          readResults gpRegs (fpReg : fpRegs) dsts (gpReg : accumRegs)
            $ accumCode
            `snocOL` MOV (OpReg w r_dst) (OpReg w gpReg)
            `appOL`
            -- truncate, otherwise an unexpectedly big value might be used in upfollowing calculations
            truncateReg W64 w r_dst

    unaryFloatOp w op arg_reg dest_reg = do
      platform <- getPlatform
      (reg_fx, _format_x, code_fx) <- getFloatReg arg_reg
      let dst = getRegisterReg platform (CmmLocal dest_reg)
      let code = code_fx `appOL` op (OpReg w dst) (OpReg w reg_fx)
      pure code

data BlockInRange = InRange | NotInRange BlockId

genCondFarJump :: (MonadGetUnique m) => Cond -> Operand -> Operand -> BlockId -> m InstrBlock
genCondFarJump cond op1 op2 far_target = do
  return $ toOL [ ann (text "Conditional far jump to: " <> ppr far_target)
                $ BCOND cond op1 op2 (TBlock far_target)
                ]

makeFarBranches ::
  Platform ->
  LabelMap RawCmmStatics ->
  [NatBasicBlock Instr] ->
  UniqDSM [NatBasicBlock Instr]

makeFarBranches {- only used when debugging -} _platform statics basic_blocks = do
  -- All offsets/positions are counted in multiples of 4 bytes (the size of LoongArch64 instructions)
  -- That is an offset of 1 represents a 4-byte/one instruction offset.
  let (func_size, lblMap) = foldl' calc_lbl_positions (0, mapEmpty) basic_blocks
  if func_size < max_cond_jump_dist
    then pure basic_blocks
    else do
      (_, blocks) <- mapAccumLM (replace_blk lblMap) 0 basic_blocks
      pure $ concat blocks
  where
    max_cond_jump_dist = 2 ^ (15 :: Int) - 8 :: Int
    -- Currently all inline info tables fit into 64 bytes.
    max_info_size = 16 :: Int
    long_bc_jump_dist = 2 :: Int

    -- Replace out of range conditional jumps with unconditional jumps.
    replace_blk :: LabelMap Int -> Int -> GenBasicBlock Instr -> UniqDSM (Int, [GenBasicBlock Instr])
    replace_blk !m !pos (BasicBlock lbl instrs) = do
      -- Account for a potential info table before the label.
      let !block_pos = pos + infoTblSize_maybe lbl
      (!pos', instrs') <- mapAccumLM (replace_jump m) block_pos instrs
      let instrs'' = concat instrs'
      -- We might have introduced new labels, so split the instructions into basic blocks again if neccesary.
      let (top, split_blocks, no_data) = foldr mkBlocks ([], [], []) instrs''
      -- There should be no data in the instruction stream at this point
      massert (null no_data)

      let final_blocks = BasicBlock lbl top : split_blocks
      pure (pos', final_blocks)

    replace_jump :: LabelMap Int -> Int -> Instr -> UniqDSM (Int, [Instr])
    replace_jump !m !pos instr = do
      case instr of
        ANN ann instr -> do
          replace_jump m pos instr >>= \case
            (idx, instr' : instrs') -> pure (idx, ANN ann instr' : instrs')
            (idx, []) -> pprPanic "replace_jump" (text "empty return list for " <+> ppr idx)

        BCOND1 cond op1 op2 t ->
          case target_in_range m t pos of
            InRange -> pure (pos + 1, [instr])
            NotInRange far_target -> do
              jmp_code <- genCondFarJump cond op1 op2 far_target
              pure (pos + long_bc_jump_dist, fromOL jmp_code)

        _ -> pure (pos + instr_size instr, [instr])

    target_in_range :: LabelMap Int -> Target -> Int -> BlockInRange
    target_in_range m target src =
      case target of
        (TReg{}) -> InRange
        (TBlock bid) -> block_in_range m src bid
        (TLabel clbl)
          | Just bid <- maybeLocalBlockLabel clbl
          -> block_in_range m src bid
          | otherwise
          -> InRange

    block_in_range :: LabelMap Int -> Int -> BlockId -> BlockInRange
    block_in_range m src_pos dest_lbl =
      case mapLookup dest_lbl m of
        Nothing ->
          pprTrace "not in range" (ppr dest_lbl) $ NotInRange dest_lbl
        Just dest_pos ->
          if abs (dest_pos - src_pos) < max_cond_jump_dist
            then InRange
          else NotInRange dest_lbl

    calc_lbl_positions :: (Int, LabelMap Int) -> GenBasicBlock Instr -> (Int, LabelMap Int)
    calc_lbl_positions (pos, m) (BasicBlock lbl instrs) =
      let !pos' = pos + infoTblSize_maybe lbl
       in foldl' instr_pos (pos', mapInsert lbl pos' m) instrs

    instr_pos :: (Int, LabelMap Int) -> Instr -> (Int, LabelMap Int)
    instr_pos (pos, m) instr = (pos + instr_size instr, m)

    infoTblSize_maybe bid =
      case mapLookup bid statics of
        Nothing -> 0 :: Int
        Just _info_static -> max_info_size

    instr_size :: Instr -> Int
    instr_size i = case i of
      COMMENT {} -> 0
      MULTILINE_COMMENT {} -> 0
      ANN _ instr -> instr_size instr
      LOCATION {} -> 0
      DELTA {} -> 0
      -- At this point there should be no NEWBLOCK in the instruction stream (pos, mapInsert bid pos m)
      NEWBLOCK {} -> panic "mkFarBranched - Unexpected"
      LDATA {} -> panic "mkFarBranched - Unexpected"
      PUSH_STACK_FRAME -> 4
      POP_STACK_FRAME -> 4
      CSET {} -> 2
      LD _ _ (OpImm (ImmIndex _ _)) -> 3
      LD _ _ (OpImm (ImmCLbl _)) -> 2
      SCVTF {} -> 2
      FCVTZS {} -> 4
      BCOND {} -> long_bc_jump_dist
      CALL (TReg _) _ -> 1
      CALL {} -> 2
      CALL36 {} -> 2
      TAIL36 {} -> 2
      _ -> 1