cranelift/codegen/src/isa/riscv64/inst/emit.rs

//! Riscv64 ISA: binary code emission.

use crate::binemit::StackMap;
use crate::ir::{self, LibCall, RelSourceLoc, TrapCode};
use crate::isa::riscv64::inst::*;
use crate::isa::riscv64::lower::isle::generated_code::{
    CaOp, CbOp, CiOp, CiwOp, ClOp, CrOp, CsOp, CssOp, CsznOp, ZcbMemOp,
};
use crate::machinst::{AllocationConsumer, Reg, Writable};
use crate::trace;
use cranelift_control::ControlPlane;
use regalloc2::Allocation;

pub struct EmitInfo {
    shared_flag: settings::Flags,
    isa_flags: super::super::riscv_settings::Flags,
}

impl EmitInfo {
    pub(crate) fn new(
        shared_flag: settings::Flags,
        isa_flags: super::super::riscv_settings::Flags,
    ) -> Self {
        Self {
            shared_flag,
            isa_flags,
        }
    }
}

pub(crate) fn reg_to_gpr_num(m: Reg) -> u32 {
    u32::try_from(m.to_real_reg().unwrap().hw_enc() & 31).unwrap()
}

pub(crate) fn reg_to_compressed_gpr_num(m: Reg) -> u32 {
    let real_reg = m.to_real_reg().unwrap().hw_enc();
    debug_assert!(real_reg >= 8 && real_reg < 16);
    let compressed_reg = real_reg - 8;
    u32::try_from(compressed_reg).unwrap()
}

#[derive(Clone, Debug, PartialEq, Default)]
pub enum EmitVState {
    #[default]
    Unknown,
    Known(VState),
}

/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState {
    pub(crate) virtual_sp_offset: i64,
    pub(crate) nominal_sp_to_fp: i64,
    /// Safepoint stack map for upcoming instruction, as provided to `pre_safepoint()`.
    stack_map: Option<StackMap>,
    /// Current source-code location corresponding to instruction to be emitted.
    cur_srcloc: RelSourceLoc,
    /// Only used during fuzz-testing. Otherwise, it is a zero-sized struct and
    /// optimized away at compiletime. See [cranelift_control].
    ctrl_plane: ControlPlane,
    /// Vector State
    /// Controls the current state of the vector unit at the emission point.
    vstate: EmitVState,
}

impl EmitState {
    fn take_stack_map(&mut self) -> Option<StackMap> {
        self.stack_map.take()
    }

    fn cur_srcloc(&self) -> RelSourceLoc {
        self.cur_srcloc
    }
}

impl MachInstEmitState<Inst> for EmitState {
    fn new(
        abi: &Callee<crate::isa::riscv64::abi::Riscv64MachineDeps>,
        ctrl_plane: ControlPlane,
    ) -> Self {
        EmitState {
            virtual_sp_offset: 0,
            nominal_sp_to_fp: abi.frame_size() as i64,
            stack_map: None,
            cur_srcloc: RelSourceLoc::default(),
            ctrl_plane,
            vstate: EmitVState::Unknown,
        }
    }

    fn pre_safepoint(&mut self, stack_map: StackMap) {
        self.stack_map = Some(stack_map);
    }

    fn pre_sourceloc(&mut self, srcloc: RelSourceLoc) {
        self.cur_srcloc = srcloc;
    }

    fn ctrl_plane_mut(&mut self) -> &mut ControlPlane {
        &mut self.ctrl_plane
    }

    fn take_ctrl_plane(self) -> ControlPlane {
        self.ctrl_plane
    }

    fn on_new_block(&mut self) {
        // Reset the vector state.
        self.vstate = EmitVState::Unknown;
    }
}

impl Inst {
    /// Load int mask.
    /// If ty is int then 0xff in rd.
    pub(crate) fn load_int_mask(rd: Writable<Reg>, ty: Type) -> SmallInstVec<Inst> {
        let mut insts = SmallInstVec::new();
        assert!(ty.is_int() && ty.bits() <= 64);
        match ty {
            I64 => {
                insts.push(Inst::load_imm12(rd, Imm12::from_i16(-1)));
            }
            I32 | I16 => {
                insts.push(Inst::load_imm12(rd, Imm12::from_i16(-1)));
                insts.push(Inst::Extend {
                    rd: rd,
                    rn: rd.to_reg(),
                    signed: false,
                    from_bits: ty.bits() as u8,
                    to_bits: 64,
                });
            }
            I8 => {
                insts.push(Inst::load_imm12(rd, Imm12::from_i16(255)));
            }
            _ => unreachable!("ty:{:?}", ty),
        }
        insts
    }
    ///  inverse all bit
    pub(crate) fn construct_bit_not(rd: Writable<Reg>, rs: Reg) -> Inst {
        Inst::AluRRImm12 {
            alu_op: AluOPRRI::Xori,
            rd,
            rs,
            imm12: Imm12::from_i16(-1),
        }
    }

    // emit a float is not a nan.
    pub(crate) fn emit_not_nan(rd: Writable<Reg>, rs: Reg, ty: Type) -> Inst {
        Inst::FpuRRR {
            alu_op: if ty == F32 {
                FpuOPRRR::FeqS
            } else {
                FpuOPRRR::FeqD
            },
            frm: FRM::RDN,
            rd: rd,
            rs1: rs,
            rs2: rs,
        }
    }

    pub(crate) fn emit_fabs(rd: Writable<Reg>, rs: Reg, ty: Type) -> Inst {
        Inst::FpuRRR {
            alu_op: if ty == F32 {
                FpuOPRRR::FsgnjxS
            } else {
                FpuOPRRR::FsgnjxD
            },
            frm: FRM::RDN,
            rd: rd,
            rs1: rs,
            rs2: rs,
        }
    }

    /// Returns Some(VState) if this insturction is expecting a specific vector state
    /// before emission.
    fn expected_vstate(&self) -> Option<&VState> {
        match self {
            Inst::Nop0
            | Inst::Nop4
            | Inst::BrTable { .. }
            | Inst::Auipc { .. }
            | Inst::Lui { .. }
            | Inst::LoadInlineConst { .. }
            | Inst::AluRRR { .. }
            | Inst::FpuRRR { .. }
            | Inst::AluRRImm12 { .. }
            | Inst::CsrReg { .. }
            | Inst::CsrImm { .. }
            | Inst::Load { .. }
            | Inst::Store { .. }
            | Inst::Args { .. }
            | Inst::Rets { .. }
            | Inst::Ret { .. }
            | Inst::Extend { .. }
            | Inst::Call { .. }
            | Inst::CallInd { .. }
            | Inst::ReturnCall { .. }
            | Inst::ReturnCallInd { .. }
            | Inst::Jal { .. }
            | Inst::CondBr { .. }
            | Inst::LoadExtName { .. }
            | Inst::ElfTlsGetAddr { .. }
            | Inst::LoadAddr { .. }
            | Inst::VirtualSPOffsetAdj { .. }
            | Inst::Mov { .. }
            | Inst::MovFromPReg { .. }
            | Inst::Fence { .. }
            | Inst::EBreak
            | Inst::Udf { .. }
            | Inst::FpuRR { .. }
            | Inst::FpuRRRR { .. }
            | Inst::Jalr { .. }
            | Inst::Atomic { .. }
            | Inst::Select { .. }
            | Inst::AtomicCas { .. }
            | Inst::RawData { .. }
            | Inst::AtomicStore { .. }
            | Inst::AtomicLoad { .. }
            | Inst::AtomicRmwLoop { .. }
            | Inst::TrapIf { .. }
            | Inst::Unwind { .. }
            | Inst::DummyUse { .. }
            | Inst::FloatRound { .. }
            | Inst::Popcnt { .. }
            | Inst::Cltz { .. }
            | Inst::Brev8 { .. }
            | Inst::StackProbeLoop { .. } => None,

            // VecSetState does not expect any vstate, rather it updates it.
            Inst::VecSetState { .. } => None,

            // `vmv` instructions copy a set of registers and ignore vstate.
            Inst::VecAluRRImm5 { op: VecAluOpRRImm5::VmvrV, .. } => None,

            Inst::VecAluRR { vstate, .. } |
            Inst::VecAluRRR { vstate, .. } |
            Inst::VecAluRRRR { vstate, .. } |
            Inst::VecAluRImm5 { vstate, .. } |
            Inst::VecAluRRImm5 { vstate, .. } |
            Inst::VecAluRRRImm5 { vstate, .. } |
            // TODO: Unit-stride loads and stores only need the AVL to be correct, not
            // the full vtype. A future optimization could be to decouple these two when
            // updating vstate. This would allow us to avoid emitting a VecSetState in
            // some cases.
            Inst::VecLoad { vstate, .. }
            | Inst::VecStore { vstate, .. } => Some(vstate),
        }
    }
}

impl MachInstEmit for Inst {
    type State = EmitState;
    type Info = EmitInfo;

    fn emit(
        &self,
        allocs: &[Allocation],
        sink: &mut MachBuffer<Inst>,
        emit_info: &Self::Info,
        state: &mut EmitState,
    ) {
        // Transform this into a instruction with all the physical regs
        let mut allocs = AllocationConsumer::new(allocs);
        let inst = self.clone().allocate(&mut allocs);

        // Check if we need to update the vector state before emitting this instruction
        if let Some(expected) = inst.expected_vstate() {
            if state.vstate != EmitVState::Known(expected.clone()) {
                // Update the vector state.
                Inst::VecSetState {
                    rd: writable_zero_reg(),
                    vstate: expected.clone(),
                }
                .emit(&[], sink, emit_info, state);
            }
        }

        // N.B.: we *must* not exceed the "worst-case size" used to compute
        // where to insert islands, except when islands are explicitly triggered
        // (with an `EmitIsland`). We check this in debug builds. This is `mut`
        // to allow disabling the check for `JTSequence`, which is always
        // emitted following an `EmitIsland`.
        let mut start_off = sink.cur_offset();

        // First try to emit this as a compressed instruction
        let res = inst.try_emit_compressed(sink, emit_info, state, &mut start_off);
        if res.is_none() {
            // If we can't lets emit it as a normal instruction
            inst.emit_uncompressed(sink, emit_info, state, &mut start_off);
        }

        let end_off = sink.cur_offset();
        assert!(
            (end_off - start_off) <= Inst::worst_case_size(),
            "Inst:{:?} length:{} worst_case_size:{}",
            self,
            end_off - start_off,
            Inst::worst_case_size()
        );
    }

    fn pretty_print_inst(&self, allocs: &[Allocation], state: &mut Self::State) -> String {
        let mut allocs = AllocationConsumer::new(allocs);
        self.print_with_state(state, &mut allocs)
    }
}

impl Inst {
    /// Tries to emit an instruction as compressed, if we can't return false.
    fn try_emit_compressed(
        &self,
        sink: &mut MachBuffer<Inst>,
        emit_info: &EmitInfo,
        state: &mut EmitState,
        start_off: &mut u32,
    ) -> Option<()> {
        let has_m = emit_info.isa_flags.has_m();
        let has_zba = emit_info.isa_flags.has_zba();
        let has_zbb = emit_info.isa_flags.has_zbb();
        let has_zca = emit_info.isa_flags.has_zca();
        let has_zcb = emit_info.isa_flags.has_zcb();
        let has_zcd = emit_info.isa_flags.has_zcd();

        // Currently all compressed extensions (Zcb, Zcd, Zcmp, Zcmt, etc..) require Zca
        // to be enabled, so check it early.
        if !has_zca {
            return None;
        }

        fn reg_is_compressible(r: Reg) -> bool {
            r.to_real_reg()
                .map(|r| r.hw_enc() >= 8 && r.hw_enc() < 16)
                .unwrap_or(false)
        }

        match *self {
            // C.ADD
            Inst::AluRRR {
                alu_op: AluOPRRR::Add,
                rd,
                rs1,
                rs2,
            } if (rd.to_reg() == rs1 || rd.to_reg() == rs2)
                && rs1 != zero_reg()
                && rs2 != zero_reg() =>
            {
                // Technically `c.add rd, rs` expands to `add rd, rd, rs`, but we can
                // also swap rs1 with rs2 and we get an equivalent instruction. i.e we
                // can also compress `add rd, rs, rd` into `c.add rd, rs`.
                let src = if rd.to_reg() == rs1 { rs2 } else { rs1 };

                sink.put2(encode_cr_type(CrOp::CAdd, rd, src));
            }

            // C.MV
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi | AluOPRRI::Ori,
                rd,
                rs,
                imm12,
            } if rd.to_reg() != rs
                && rd.to_reg() != zero_reg()
                && rs != zero_reg()
                && imm12.as_i16() == 0 =>
            {
                sink.put2(encode_cr_type(CrOp::CMv, rd, rs));
            }

            // CA Ops
            Inst::AluRRR {
                alu_op:
                    alu_op @ (AluOPRRR::And
                    | AluOPRRR::Or
                    | AluOPRRR::Xor
                    | AluOPRRR::Addw
                    | AluOPRRR::Mul),
                rd,
                rs1,
                rs2,
            } if (rd.to_reg() == rs1 || rd.to_reg() == rs2)
                && reg_is_compressible(rs1)
                && reg_is_compressible(rs2) =>
            {
                let op = match alu_op {
                    AluOPRRR::And => CaOp::CAnd,
                    AluOPRRR::Or => CaOp::COr,
                    AluOPRRR::Xor => CaOp::CXor,
                    AluOPRRR::Addw => CaOp::CAddw,
                    AluOPRRR::Mul if has_zcb && has_m => CaOp::CMul,
                    _ => return None,
                };
                // The canonical expansion for these instruction has `rd == rs1`, but
                // these are all comutative operations, so we can swap the operands.
                let src = if rd.to_reg() == rs1 { rs2 } else { rs1 };

                sink.put2(encode_ca_type(op, rd, src));
            }

            // The sub instructions are non comutative, so we can't swap the operands.
            Inst::AluRRR {
                alu_op: alu_op @ (AluOPRRR::Sub | AluOPRRR::Subw),
                rd,
                rs1,
                rs2,
            } if rd.to_reg() == rs1 && reg_is_compressible(rs1) && reg_is_compressible(rs2) => {
                let op = match alu_op {
                    AluOPRRR::Sub => CaOp::CSub,
                    AluOPRRR::Subw => CaOp::CSubw,
                    _ => return None,
                };
                sink.put2(encode_ca_type(op, rd, rs2));
            }

            // c.j
            //
            // We don't have a separate JAL as that is only availabile in RV32C
            Inst::Jal { label } => {
                sink.use_label_at_offset(*start_off, label, LabelUse::RVCJump);
                sink.add_uncond_branch(*start_off, *start_off + 2, label);
                sink.put2(encode_cj_type(CjOp::CJ, Imm12::ZERO));
            }

            // c.jr
            Inst::Jalr { rd, base, offset }
                if rd.to_reg() == zero_reg() && base != zero_reg() && offset.as_i16() == 0 =>
            {
                sink.put2(encode_cr2_type(CrOp::CJr, base));
            }

            // c.jalr
            Inst::Jalr { rd, base, offset }
                if rd.to_reg() == link_reg() && base != zero_reg() && offset.as_i16() == 0 =>
            {
                sink.put2(encode_cr2_type(CrOp::CJalr, base));
            }

            // c.ebreak
            Inst::EBreak => {
                sink.put2(encode_cr_type(
                    CrOp::CEbreak,
                    writable_zero_reg(),
                    zero_reg(),
                ));
            }

            // c.unimp
            Inst::Udf { trap_code } => {
                sink.add_trap(trap_code);
                if let Some(s) = state.take_stack_map() {
                    sink.add_stack_map(StackMapExtent::UpcomingBytes(2), s);
                }
                sink.put2(0x0000);
            }

            // c.addi16sp
            //
            // c.addi16sp shares the opcode with c.lui, but has a destination field of x2.
            // c.addi16sp adds the non-zero sign-extended 6-bit immediate to the value in the stack pointer (sp=x2),
            // where the immediate is scaled to represent multiples of 16 in the range (-512,496). c.addi16sp is used
            // to adjust the stack pointer in procedure prologues and epilogues. It expands into addi x2, x2, nzimm. c.addi16sp
            // is only valid when nzimm≠0; the code point with nzimm=0 is reserved.
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs
                && rs == stack_reg()
                && imm12.as_i16() != 0
                && (imm12.as_i16() % 16) == 0
                && Imm6::maybe_from_i16(imm12.as_i16() / 16).is_some() =>
            {
                let imm6 = Imm6::maybe_from_i16(imm12.as_i16() / 16).unwrap();
                sink.put2(encode_c_addi16sp(imm6));
            }

            // c.addi4spn
            //
            // c.addi4spn is a CIW-format instruction that adds a zero-extended non-zero
            // immediate, scaled by 4, to the stack pointer, x2, and writes the result to
            // rd. This instruction is used to generate pointers to stack-allocated variables
            // and expands to addi rd, x2, nzuimm. c.addi4spn is only valid when nzuimm≠0;
            // the code points with nzuimm=0 are reserved.
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi,
                rd,
                rs,
                imm12,
            } if reg_is_compressible(rd.to_reg())
                && rs == stack_reg()
                && imm12.as_i16() != 0
                && (imm12.as_i16() % 4) == 0
                && u8::try_from(imm12.as_i16() / 4).is_ok() =>
            {
                let imm = u8::try_from(imm12.as_i16() / 4).unwrap();
                sink.put2(encode_ciw_type(CiwOp::CAddi4spn, rd, imm));
            }

            // c.li
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi,
                rd,
                rs,
                imm12,
            } if rd.to_reg() != zero_reg() && rs == zero_reg() => {
                let imm6 = Imm6::maybe_from_imm12(imm12)?;
                sink.put2(encode_ci_type(CiOp::CLi, rd, imm6));
            }

            // c.addi
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addi,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && rs != zero_reg() && imm12.as_i16() != 0 => {
                let imm6 = Imm6::maybe_from_imm12(imm12)?;
                sink.put2(encode_ci_type(CiOp::CAddi, rd, imm6));
            }

            // c.addiw
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Addiw,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && rs != zero_reg() => {
                let imm6 = Imm6::maybe_from_imm12(imm12)?;
                sink.put2(encode_ci_type(CiOp::CAddiw, rd, imm6));
            }

            // c.lui
            //
            // c.lui loads the non-zero 6-bit immediate field into bits 17–12
            // of the destination register, clears the bottom 12 bits, and
            // sign-extends bit 17 into all higher bits of the destination.
            Inst::Lui { rd, imm: imm20 }
                if rd.to_reg() != zero_reg()
                    && rd.to_reg() != stack_reg()
                    && imm20.as_i32() != 0 =>
            {
                // Check that the top bits are sign extended
                let imm = imm20.as_i32() << 14 >> 14;
                if imm != imm20.as_i32() {
                    return None;
                }
                let imm6 = Imm6::maybe_from_i32(imm)?;
                sink.put2(encode_ci_type(CiOp::CLui, rd, imm6));
            }

            // c.slli
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Slli,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && rs != zero_reg() && imm12.as_i16() != 0 => {
                // The shift amount is unsigned, but we encode it as signed.
                let shift = imm12.as_i16() & 0x3f;
                let imm6 = Imm6::maybe_from_i16(shift << 10 >> 10).unwrap();
                sink.put2(encode_ci_type(CiOp::CSlli, rd, imm6));
            }

            // c.srli / c.srai
            Inst::AluRRImm12 {
                alu_op: op @ (AluOPRRI::Srli | AluOPRRI::Srai),
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && reg_is_compressible(rs) && imm12.as_i16() != 0 => {
                let op = match op {
                    AluOPRRI::Srli => CbOp::CSrli,
                    AluOPRRI::Srai => CbOp::CSrai,
                    _ => unreachable!(),
                };

                // The shift amount is unsigned, but we encode it as signed.
                let shift = imm12.as_i16() & 0x3f;
                let imm6 = Imm6::maybe_from_i16(shift << 10 >> 10).unwrap();
                sink.put2(encode_cb_type(op, rd, imm6));
            }

            // c.zextb
            //
            // This is an alias for `andi rd, rd, 0xff`
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Andi,
                rd,
                rs,
                imm12,
            } if has_zcb
                && rd.to_reg() == rs
                && reg_is_compressible(rs)
                && imm12.as_i16() == 0xff =>
            {
                sink.put2(encode_cszn_type(CsznOp::CZextb, rd));
            }

            // c.andi
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Andi,
                rd,
                rs,
                imm12,
            } if rd.to_reg() == rs && reg_is_compressible(rs) => {
                let imm6 = Imm6::maybe_from_imm12(imm12)?;
                sink.put2(encode_cb_type(CbOp::CAndi, rd, imm6));
            }

            // Stack Based Loads
            Inst::Load {
                rd,
                op: op @ (LoadOP::Lw | LoadOP::Ld | LoadOP::Fld),
                from,
                flags,
            } if from.get_base_register() == Some(stack_reg())
                && (from.get_offset_with_state(state) % op.size()) == 0 =>
            {
                // We encode the offset in multiples of the load size.
                let offset = from.get_offset_with_state(state);
                let imm6 = u8::try_from(offset / op.size())
                    .ok()
                    .and_then(Uimm6::maybe_from_u8)?;

                // Some additional constraints on these instructions.
                //
                // Integer loads are not allowed to target x0, but floating point loads
                // are, since f0 is not a special register.
                //
                // Floating point loads are not included in the base Zca extension
                // but in a separate Zcd extension. Both of these are part of the C Extension.
                let rd_is_zero = rd.to_reg() == zero_reg();
                let op = match op {
                    LoadOP::Lw if !rd_is_zero => CiOp::CLwsp,
                    LoadOP::Ld if !rd_is_zero => CiOp::CLdsp,
                    LoadOP::Fld if has_zcd => CiOp::CFldsp,
                    _ => return None,
                };

                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }
                sink.put2(encode_ci_sp_load(op, rd, imm6));
            }

            // Regular Loads
            Inst::Load {
                rd,
                op:
                    op
                    @ (LoadOP::Lw | LoadOP::Ld | LoadOP::Fld | LoadOP::Lbu | LoadOP::Lhu | LoadOP::Lh),
                from,
                flags,
            } if reg_is_compressible(rd.to_reg())
                && from
                    .get_base_register()
                    .map(reg_is_compressible)
                    .unwrap_or(false)
                && (from.get_offset_with_state(state) % op.size()) == 0 =>
            {
                let base = from.get_base_register().unwrap();

                // We encode the offset in multiples of the store size.
                let offset = from.get_offset_with_state(state);
                let offset = u8::try_from(offset / op.size()).ok()?;

                // We mix two different formats here.
                //
                // c.lw / c.ld / c.fld instructions are available in the standard Zca
                // extension using the CL format.
                //
                // c.lbu / c.lhu / c.lh are only available in the Zcb extension and
                // are also encoded differently. Technically they each have a different
                // format, but they are similar enough that we can group them.
                let is_zcb_load = matches!(op, LoadOP::Lbu | LoadOP::Lhu | LoadOP::Lh);
                let encoded = if is_zcb_load {
                    if !has_zcb {
                        return None;
                    }

                    let op = match op {
                        LoadOP::Lbu => ZcbMemOp::CLbu,
                        LoadOP::Lhu => ZcbMemOp::CLhu,
                        LoadOP::Lh => ZcbMemOp::CLh,
                        _ => unreachable!(),
                    };

                    // Byte stores & loads have 2 bits of immediate offset. Halfword stores
                    // and loads only have 1 bit.
                    let imm2 = Uimm2::maybe_from_u8(offset)?;
                    if (offset & !((1 << op.imm_bits()) - 1)) != 0 {
                        return None;
                    }

                    encode_zcbmem_load(op, rd, base, imm2)
                } else {
                    // Floating point loads are not included in the base Zca extension
                    // but in a separate Zcd extension. Both of these are part of the C Extension.
                    let op = match op {
                        LoadOP::Lw => ClOp::CLw,
                        LoadOP::Ld => ClOp::CLd,
                        LoadOP::Fld if has_zcd => ClOp::CFld,
                        _ => return None,
                    };
                    let imm5 = Uimm5::maybe_from_u8(offset)?;

                    encode_cl_type(op, rd, base, imm5)
                };

                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }
                sink.put2(encoded);
            }

            // Stack Based Stores
            Inst::Store {
                src,
                op: op @ (StoreOP::Sw | StoreOP::Sd | StoreOP::Fsd),
                to,
                flags,
            } if to.get_base_register() == Some(stack_reg())
                && (to.get_offset_with_state(state) % op.size()) == 0 =>
            {
                // We encode the offset in multiples of the store size.
                let offset = to.get_offset_with_state(state);
                let imm6 = u8::try_from(offset / op.size())
                    .ok()
                    .and_then(Uimm6::maybe_from_u8)?;

                // Floating point stores are not included in the base Zca extension
                // but in a separate Zcd extension. Both of these are part of the C Extension.
                let op = match op {
                    StoreOP::Sw => CssOp::CSwsp,
                    StoreOP::Sd => CssOp::CSdsp,
                    StoreOP::Fsd if has_zcd => CssOp::CFsdsp,
                    _ => return None,
                };

                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }
                sink.put2(encode_css_type(op, src, imm6));
            }

            // Regular Stores
            Inst::Store {
                src,
                op: op @ (StoreOP::Sw | StoreOP::Sd | StoreOP::Fsd | StoreOP::Sh | StoreOP::Sb),
                to,
                flags,
            } if reg_is_compressible(src)
                && to
                    .get_base_register()
                    .map(reg_is_compressible)
                    .unwrap_or(false)
                && (to.get_offset_with_state(state) % op.size()) == 0 =>
            {
                let base = to.get_base_register().unwrap();

                // We encode the offset in multiples of the store size.
                let offset = to.get_offset_with_state(state);
                let offset = u8::try_from(offset / op.size()).ok()?;

                // We mix two different formats here.
                //
                // c.sw / c.sd / c.fsd instructions are available in the standard Zca
                // extension using the CL format.
                //
                // c.sb / c.sh are only available in the Zcb extension and are also
                // encoded differently.
                let is_zcb_store = matches!(op, StoreOP::Sh | StoreOP::Sb);
                let encoded = if is_zcb_store {
                    if !has_zcb {
                        return None;
                    }

                    let op = match op {
                        StoreOP::Sh => ZcbMemOp::CSh,
                        StoreOP::Sb => ZcbMemOp::CSb,
                        _ => unreachable!(),
                    };

                    // Byte stores & loads have 2 bits of immediate offset. Halfword stores
                    // and loads only have 1 bit.
                    let imm2 = Uimm2::maybe_from_u8(offset)?;
                    if (offset & !((1 << op.imm_bits()) - 1)) != 0 {
                        return None;
                    }

                    encode_zcbmem_store(op, src, base, imm2)
                } else {
                    // Floating point stores are not included in the base Zca extension
                    // but in a separate Zcd extension. Both of these are part of the C Extension.
                    let op = match op {
                        StoreOP::Sw => CsOp::CSw,
                        StoreOP::Sd => CsOp::CSd,
                        StoreOP::Fsd if has_zcd => CsOp::CFsd,
                        _ => return None,
                    };
                    let imm5 = Uimm5::maybe_from_u8(offset)?;

                    encode_cs_type(op, src, base, imm5)
                };

                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }
                sink.put2(encoded);
            }

            // c.not
            //
            // This is an alias for `xori rd, rd, -1`
            Inst::AluRRImm12 {
                alu_op: AluOPRRI::Xori,
                rd,
                rs,
                imm12,
            } if has_zcb
                && rd.to_reg() == rs
                && reg_is_compressible(rs)
                && imm12.as_i16() == -1 =>
            {
                sink.put2(encode_cszn_type(CsznOp::CNot, rd));
            }

            // c.sext.b / c.sext.h / c.zext.h
            //
            // These are all the extend instructions present in `Zcb`, they
            // also require `Zbb` since they aren't available in the base ISA.
            Inst::AluRRImm12 {
                alu_op: alu_op @ (AluOPRRI::Sextb | AluOPRRI::Sexth | AluOPRRI::Zexth),
                rd,
                rs,
                imm12,
            } if has_zcb
                && has_zbb
                && rd.to_reg() == rs
                && reg_is_compressible(rs)
                && imm12.as_i16() == 0 =>
            {
                let op = match alu_op {
                    AluOPRRI::Sextb => CsznOp::CSextb,
                    AluOPRRI::Sexth => CsznOp::CSexth,
                    AluOPRRI::Zexth => CsznOp::CZexth,
                    _ => unreachable!(),
                };
                sink.put2(encode_cszn_type(op, rd));
            }

            // c.zext.w
            //
            // This is an alias for `add.uw rd, rd, zero`
            Inst::AluRRR {
                alu_op: AluOPRRR::Adduw,
                rd,
                rs1,
                rs2,
            } if has_zcb
                && has_zba
                && rd.to_reg() == rs1
                && reg_is_compressible(rs1)
                && rs2 == zero_reg() =>
            {
                sink.put2(encode_cszn_type(CsznOp::CZextw, rd));
            }

            _ => return None,
        }

        return Some(());
    }

    fn emit_uncompressed(
        &self,
        sink: &mut MachBuffer<Inst>,
        emit_info: &EmitInfo,
        state: &mut EmitState,
        start_off: &mut u32,
    ) {
        match self {
            &Inst::Nop0 => {
                // do nothing
            }
            // Addi x0, x0, 0
            &Inst::Nop4 => {
                let x = Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Addi,
                    rd: Writable::from_reg(zero_reg()),
                    rs: zero_reg(),
                    imm12: Imm12::ZERO,
                };
                x.emit(&[], sink, emit_info, state)
            }
            &Inst::RawData { ref data } => {
                // Right now we only put a u32 or u64 in this instruction.
                // It is not very long, no need to check if need `emit_island`.
                // If data is very long , this is a bug because RawData is typecial
                // use to load some data and rely on some positon in the code stream.
                // and we may exceed `Inst::worst_case_size`.
                // for more information see https://github.com/bytecodealliance/wasmtime/pull/5612.
                sink.put_data(&data[..]);
            }
            &Inst::Lui { rd, ref imm } => {
                let x: u32 = 0b0110111 | reg_to_gpr_num(rd.to_reg()) << 7 | (imm.bits() << 12);
                sink.put4(x);
            }
            &Inst::LoadInlineConst { rd, ty, imm } => {
                let data = &imm.to_le_bytes()[..ty.bytes() as usize];

                let label_data: MachLabel = sink.get_label();
                let label_end: MachLabel = sink.get_label();

                // Load into rd
                Inst::Load {
                    rd,
                    op: LoadOP::from_type(ty),
                    flags: MemFlags::new(),
                    from: AMode::Label(label_data),
                }
                .emit(&[], sink, emit_info, state);

                // Jump over the inline pool
                Inst::gen_jump(label_end).emit(&[], sink, emit_info, state);

                // Emit the inline data
                sink.bind_label(label_data, &mut state.ctrl_plane);
                Inst::RawData { data: data.into() }.emit(&[], sink, emit_info, state);

                sink.bind_label(label_end, &mut state.ctrl_plane);
            }
            &Inst::FpuRR {
                frm,
                alu_op,
                rd,
                rs,
            } => {
                let x = alu_op.op_code()
                    | reg_to_gpr_num(rd.to_reg()) << 7
                    | frm.as_u32() << 12
                    | reg_to_gpr_num(rs) << 15
                    | alu_op.rs2_funct5() << 20
                    | alu_op.funct7() << 25;
                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && alu_op.is_convert_to_int() {
                    sink.add_trap(TrapCode::BadConversionToInteger);
                }
                sink.put4(x);
            }
            &Inst::FpuRRRR {
                alu_op,
                rd,
                rs1,
                rs2,
                rs3,
                frm,
            } => {
                let x = alu_op.op_code()
                    | reg_to_gpr_num(rd.to_reg()) << 7
                    | frm.as_u32() << 12
                    | reg_to_gpr_num(rs1) << 15
                    | reg_to_gpr_num(rs2) << 20
                    | alu_op.funct2() << 25
                    | reg_to_gpr_num(rs3) << 27;

                sink.put4(x);
            }
            &Inst::FpuRRR {
                alu_op,
                frm,
                rd,
                rs1,
                rs2,
            } => {
                let x: u32 = alu_op.op_code()
                    | reg_to_gpr_num(rd.to_reg()) << 7
                    | frm.as_u32() << 12
                    | reg_to_gpr_num(rs1) << 15
                    | reg_to_gpr_num(rs2) << 20
                    | alu_op.funct7() << 25;
                sink.put4(x);
            }
            &Inst::Unwind { ref inst } => {
                sink.add_unwind(inst.clone());
            }
            &Inst::DummyUse { .. } => {
                // This has already been handled by Inst::allocate.
            }
            &Inst::AluRRR {
                alu_op,
                rd,
                rs1,
                rs2,
            } => {
                let (rs1, rs2) = if alu_op.reverse_rs() {
                    (rs2, rs1)
                } else {
                    (rs1, rs2)
                };

                sink.put4(encode_r_type(
                    alu_op.op_code(),
                    rd,
                    alu_op.funct3(),
                    rs1,
                    rs2,
                    alu_op.funct7(),
                ));
            }
            &Inst::AluRRImm12 {
                alu_op,
                rd,
                rs,
                imm12,
            } => {
                let x = alu_op.op_code()
                    | reg_to_gpr_num(rd.to_reg()) << 7
                    | alu_op.funct3() << 12
                    | reg_to_gpr_num(rs) << 15
                    | alu_op.imm12(imm12) << 20;
                sink.put4(x);
            }
            &Inst::CsrReg { op, rd, rs, csr } => {
                sink.put4(encode_csr_reg(op, rd, rs, csr));
            }
            &Inst::CsrImm { op, rd, csr, imm } => {
                sink.put4(encode_csr_imm(op, rd, csr, imm));
            }
            &Inst::Load {
                rd,
                op,
                from,
                flags,
            } => {
                let base = from.get_base_register();
                let offset = from.get_offset_with_state(state);
                let offset_imm12 = Imm12::maybe_from_i64(offset);
                let label = from.get_label_with_sink(sink);

                let (addr, imm12) = match (base, offset_imm12, label) {
                    // When loading from a Reg+Offset, if the offset fits into an imm12 we can directly encode it.
                    (Some(base), Some(imm12), None) => (base, imm12),

                    // Otherwise, if the offset does not fit into a imm12, we need to materialize it into a
                    // register and load from that.
                    (Some(_), None, None) => {
                        let tmp = writable_spilltmp_reg();
                        Inst::LoadAddr { rd: tmp, mem: from }.emit(&[], sink, emit_info, state);
                        (tmp.to_reg(), Imm12::ZERO)
                    }

                    // If the AMode contains a label we can emit an internal relocation that gets
                    // resolved with the correct address later.
                    (None, Some(imm), Some(label)) => {
                        debug_assert_eq!(imm.as_i16(), 0);

                        // Get the current PC.
                        sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelHi20);
                        Inst::Auipc {
                            rd,
                            imm: Imm20::ZERO,
                        }
                        .emit_uncompressed(sink, emit_info, state, start_off);

                        // Emit a relocation for the load. This patches the offset into the instruction.
                        sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelLo12I);

                        // Imm12 here is meaningless since it's going to get replaced.
                        (rd.to_reg(), Imm12::ZERO)
                    }

                    // These cases are impossible with the current AModes that we have. We either
                    // always have a register, or always have a label. Never both, and never neither.
                    (None, None, None)
                    | (None, Some(_), None)
                    | (Some(_), None, Some(_))
                    | (Some(_), Some(_), Some(_))
                    | (None, None, Some(_)) => {
                        unreachable!("Invalid load address")
                    }
                };

                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }

                sink.put4(encode_i_type(op.op_code(), rd, op.funct3(), addr, imm12));
            }
            &Inst::Store { op, src, flags, to } => {
                let base = to.get_base_register();
                let offset = to.get_offset_with_state(state);
                let offset_imm12 = Imm12::maybe_from_i64(offset);

                let (addr, imm12) = match (base, offset_imm12) {
                    // If the offset fits into an imm12 we can directly encode it.
                    (Some(base), Some(imm12)) => (base, imm12),
                    // Otherwise load the address it into a reg and load from it.
                    _ => {
                        let tmp = writable_spilltmp_reg();
                        Inst::LoadAddr { rd: tmp, mem: to }.emit(&[], sink, emit_info, state);
                        (tmp.to_reg(), Imm12::ZERO)
                    }
                };

                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }

                sink.put4(encode_s_type(op.op_code(), op.funct3(), addr, src, imm12));
            }
            &Inst::Args { .. } | &Inst::Rets { .. } => {
                // Nothing: this is a pseudoinstruction that serves
                // only to constrain registers at a certain point.
            }
            &Inst::Ret {} => {
                // RISC-V does not have a dedicated ret instruction, instead we emit the equivalent
                // `jalr x0, x1, 0` that jumps to the return address.
                Inst::Jalr {
                    rd: writable_zero_reg(),
                    base: link_reg(),
                    offset: Imm12::ZERO,
                }
                .emit(&[], sink, emit_info, state);
            }

            &Inst::Extend {
                rd,
                rn,
                signed,
                from_bits,
                to_bits: _to_bits,
            } => {
                let mut insts = SmallInstVec::new();
                let shift_bits = (64 - from_bits) as i16;
                let is_u8 = || from_bits == 8 && signed == false;
                if is_u8() {
                    // special for u8.
                    insts.push(Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Andi,
                        rd,
                        rs: rn,
                        imm12: Imm12::from_i16(255),
                    });
                } else {
                    insts.push(Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Slli,
                        rd,
                        rs: rn,
                        imm12: Imm12::from_i16(shift_bits),
                    });
                    insts.push(Inst::AluRRImm12 {
                        alu_op: if signed {
                            AluOPRRI::Srai
                        } else {
                            AluOPRRI::Srli
                        },
                        rd,
                        rs: rd.to_reg(),
                        imm12: Imm12::from_i16(shift_bits),
                    });
                }
                insts
                    .into_iter()
                    .for_each(|i| i.emit(&[], sink, emit_info, state));
            }

            &Inst::Call { ref info } => {
                if info.opcode.is_call() {
                    sink.add_call_site(info.opcode);
                }
                sink.add_reloc(Reloc::RiscvCallPlt, &info.dest, 0);
                if let Some(s) = state.take_stack_map() {
                    sink.add_stack_map(StackMapExtent::UpcomingBytes(8), s);
                }
                Inst::construct_auipc_and_jalr(Some(writable_link_reg()), writable_link_reg(), 0)
                    .into_iter()
                    .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off));

                let callee_pop_size = i64::from(info.callee_pop_size);
                state.virtual_sp_offset -= callee_pop_size;
                trace!(
                    "call adjusts virtual sp offset by {callee_pop_size} -> {}",
                    state.virtual_sp_offset
                );
            }
            &Inst::CallInd { ref info } => {
                let start_offset = sink.cur_offset();

                Inst::Jalr {
                    rd: writable_link_reg(),
                    base: info.rn,
                    offset: Imm12::ZERO,
                }
                .emit(&[], sink, emit_info, state);

                if let Some(s) = state.take_stack_map() {
                    sink.add_stack_map(StackMapExtent::StartedAtOffset(start_offset), s);
                }

                if info.opcode.is_call() {
                    sink.add_call_site(info.opcode);
                }

                let callee_pop_size = i64::from(info.callee_pop_size);
                state.virtual_sp_offset -= callee_pop_size;
                trace!(
                    "call adjusts virtual sp offset by {callee_pop_size} -> {}",
                    state.virtual_sp_offset
                );
            }

            &Inst::ReturnCall {
                ref callee,
                ref info,
            } => {
                emit_return_call_common_sequence(
                    sink,
                    emit_info,
                    state,
                    info.new_stack_arg_size,
                    info.old_stack_arg_size,
                );

                sink.add_call_site(ir::Opcode::ReturnCall);
                sink.add_reloc(Reloc::RiscvCallPlt, &**callee, 0);
                Inst::construct_auipc_and_jalr(None, writable_spilltmp_reg(), 0)
                    .into_iter()
                    .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off));

                // `emit_return_call_common_sequence` emits an island if
                // necessary, so we can safely disable the worst-case-size check
                // in this case.
                *start_off = sink.cur_offset();
            }

            &Inst::ReturnCallInd { callee, ref info } => {
                emit_return_call_common_sequence(
                    sink,
                    emit_info,
                    state,
                    info.new_stack_arg_size,
                    info.old_stack_arg_size,
                );

                Inst::Jalr {
                    rd: writable_zero_reg(),
                    base: callee,
                    offset: Imm12::ZERO,
                }
                .emit(&[], sink, emit_info, state);

                // `emit_return_call_common_sequence` emits an island if
                // necessary, so we can safely disable the worst-case-size check
                // in this case.
                *start_off = sink.cur_offset();
            }
            &Inst::Jal { label } => {
                sink.use_label_at_offset(*start_off, label, LabelUse::Jal20);
                sink.add_uncond_branch(*start_off, *start_off + 4, label);
                sink.put4(0b1101111);
            }
            &Inst::CondBr {
                taken,
                not_taken,
                kind,
            } => {
                match taken {
                    CondBrTarget::Label(label) => {
                        let code = kind.emit();
                        let code_inverse = kind.inverse().emit().to_le_bytes();
                        sink.use_label_at_offset(*start_off, label, LabelUse::B12);
                        sink.add_cond_branch(*start_off, *start_off + 4, label, &code_inverse);
                        sink.put4(code);
                    }
                    CondBrTarget::Fallthrough => panic!("Cannot fallthrough in taken target"),
                }

                match not_taken {
                    CondBrTarget::Label(label) => {
                        Inst::gen_jump(label).emit(&[], sink, emit_info, state)
                    }
                    CondBrTarget::Fallthrough => {}
                };
            }

            &Inst::Mov { rd, rm, ty } => {
                debug_assert_eq!(rd.to_reg().class(), rm.class());
                if rd.to_reg() == rm {
                    return;
                }

                match rm.class() {
                    RegClass::Int => Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: rd,
                        rs: rm,
                        imm12: Imm12::ZERO,
                    },
                    RegClass::Float => Inst::FpuRRR {
                        alu_op: if ty == F32 {
                            FpuOPRRR::FsgnjS
                        } else {
                            FpuOPRRR::FsgnjD
                        },
                        frm: FRM::RNE,
                        rd: rd,
                        rs1: rm,
                        rs2: rm,
                    },
                    RegClass::Vector => Inst::VecAluRRImm5 {
                        op: VecAluOpRRImm5::VmvrV,
                        vd: rd,
                        vs2: rm,
                        // Imm 0 means copy 1 register.
                        imm: Imm5::maybe_from_i8(0).unwrap(),
                        mask: VecOpMasking::Disabled,
                        // Vstate for this instruction is ignored.
                        vstate: VState::from_type(ty),
                    },
                }
                .emit(&[], sink, emit_info, state);
            }

            &Inst::MovFromPReg { rd, rm } => {
                Inst::gen_move(rd, Reg::from(rm), I64).emit(&[], sink, emit_info, state);
            }

            &Inst::BrTable {
                index,
                tmp1,
                tmp2,
                ref targets,
            } => {
                let ext_index = writable_spilltmp_reg();

                let label_compute_target = sink.get_label();

                // The default target is passed in as the 0th element of `targets`
                // separate it here for clarity.
                let default_target = targets[0];
                let targets = &targets[1..];

                // We are going to potentially emit a large amount of instructions, so ensure that we emit an island
                // now if we need one.
                //
                // The worse case PC calculations are 12 instructions. And each entry in the jump table is 2 instructions.
                // Check if we need to emit a jump table here to support that jump.
                let inst_count = 12 + (targets.len() * 2);
                let distance = (inst_count * Inst::UNCOMPRESSED_INSTRUCTION_SIZE as usize) as u32;
                if sink.island_needed(distance) {
                    let jump_around_label = sink.get_label();
                    Inst::gen_jump(jump_around_label).emit(&[], sink, emit_info, state);
                    sink.emit_island(distance + 4, &mut state.ctrl_plane);
                    sink.bind_label(jump_around_label, &mut state.ctrl_plane);
                }

                // We emit a bounds check on the index, if the index is larger than the number of
                // jump table entries, we jump to the default block.  Otherwise we compute a jump
                // offset by multiplying the index by 8 (the size of each entry) and then jump to
                // that offset. Each jump table entry is a regular auipc+jalr which we emit sequentially.
                //
                // Build the following sequence:
                //
                // extend_index:
                //     zext.w  ext_index, index
                // bounds_check:
                //     li      tmp, n_labels
                //     bltu    ext_index, tmp, compute_target
                // jump_to_default_block:
                //     auipc   pc, 0
                //     jalr    zero, pc, default_block
                // compute_target:
                //     auipc   pc, 0
                //     slli    tmp, ext_index, 3
                //     add     pc, pc, tmp
                //     jalr    zero, pc, 0x10
                // jump_table:
                //     ; This repeats for each entry in the jumptable
                //     auipc   pc, 0
                //     jalr    zero, pc, block_target

                // Extend the index to 64 bits.
                //
                // This prevents us branching on the top 32 bits of the index, which
                // are undefined.
                Inst::Extend {
                    rd: ext_index,
                    rn: index,
                    signed: false,
                    from_bits: 32,
                    to_bits: 64,
                }
                .emit(&[], sink, emit_info, state);

                // Bounds check.
                //
                // Check if the index passed in is larger than the number of jumptable
                // entries that we have. If it is, we fallthrough to a jump into the
                // default block.
                Inst::load_constant_u32(tmp2, targets.len() as u64)
                    .iter()
                    .for_each(|i| i.emit(&[], sink, emit_info, state));
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_compute_target),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::UnsignedLessThan,
                        rs1: ext_index.to_reg(),
                        rs2: tmp2.to_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);

                sink.use_label_at_offset(sink.cur_offset(), default_target, LabelUse::PCRel32);
                Inst::construct_auipc_and_jalr(None, tmp2, 0)
                    .iter()
                    .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off));

                // Compute the jump table offset.
                // We need to emit a PC relative offset,
                sink.bind_label(label_compute_target, &mut state.ctrl_plane);

                // Get the current PC.
                Inst::Auipc {
                    rd: tmp1,
                    imm: Imm20::ZERO,
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // These instructions must be emitted as uncompressed since we
                // are manually computing the offset from the PC.

                // Multiply the index by 8, since that is the size in
                // bytes of each jump table entry
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Slli,
                    rd: tmp2,
                    rs: ext_index.to_reg(),
                    imm12: Imm12::from_i16(3),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // Calculate the base of the jump, PC + the offset from above.
                Inst::AluRRR {
                    alu_op: AluOPRRR::Add,
                    rd: tmp1,
                    rs1: tmp1.to_reg(),
                    rs2: tmp2.to_reg(),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // Jump to the middle of the jump table.
                // We add a 16 byte offset here, since we used 4 instructions
                // since the AUIPC that was used to get the PC.
                Inst::Jalr {
                    rd: writable_zero_reg(),
                    base: tmp1.to_reg(),
                    offset: Imm12::from_i16((4 * Inst::UNCOMPRESSED_INSTRUCTION_SIZE) as i16),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // Emit the jump table.
                //
                // Each entry is a auipc + jalr to the target block. We also start with a island
                // if necessary.

                // Emit the jumps back to back
                for target in targets.iter() {
                    sink.use_label_at_offset(sink.cur_offset(), *target, LabelUse::PCRel32);

                    Inst::construct_auipc_and_jalr(None, tmp2, 0)
                        .iter()
                        .for_each(|i| i.emit_uncompressed(sink, emit_info, state, start_off));
                }

                // We've just emitted an island that is safe up to *here*.
                // Mark it as such so that we don't needlessly emit additional islands.
                *start_off = sink.cur_offset();
            }

            &Inst::VirtualSPOffsetAdj { amount } => {
                crate::trace!(
                    "virtual sp offset adjusted by {} -> {}",
                    amount,
                    state.virtual_sp_offset + amount
                );
                state.virtual_sp_offset += amount;
            }
            &Inst::Atomic {
                op,
                rd,
                addr,
                src,
                amo,
            } => {
                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() {
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }
                let x = op.op_code()
                    | reg_to_gpr_num(rd.to_reg()) << 7
                    | op.funct3() << 12
                    | reg_to_gpr_num(addr) << 15
                    | reg_to_gpr_num(src) << 20
                    | op.funct7(amo) << 25;

                sink.put4(x);
            }
            &Inst::Fence { pred, succ } => {
                let x = 0b0001111
                    | 0b00000 << 7
                    | 0b000 << 12
                    | 0b00000 << 15
                    | (succ as u32) << 20
                    | (pred as u32) << 24;

                sink.put4(x);
            }
            &Inst::Auipc { rd, imm } => {
                sink.put4(enc_auipc(rd, imm));
            }

            &Inst::LoadAddr { rd, mem } => {
                let base = mem.get_base_register();
                let offset = mem.get_offset_with_state(state);
                let offset_imm12 = Imm12::maybe_from_i64(offset);

                match (mem, base, offset_imm12) {
                    (_, Some(rs), Some(imm12)) => {
                        Inst::AluRRImm12 {
                            alu_op: AluOPRRI::Addi,
                            rd,
                            rs,
                            imm12,
                        }
                        .emit(&[], sink, emit_info, state);
                    }
                    (_, Some(rs), None) => {
                        let mut insts = Inst::load_constant_u64(rd, offset as u64);
                        insts.push(Inst::AluRRR {
                            alu_op: AluOPRRR::Add,
                            rd,
                            rs1: rd.to_reg(),
                            rs2: rs,
                        });
                        insts
                            .into_iter()
                            .for_each(|inst| inst.emit(&[], sink, emit_info, state));
                    }
                    (AMode::Const(addr), None, _) => {
                        // Get an address label for the constant and recurse.
                        let label = sink.get_label_for_constant(addr);
                        Inst::LoadAddr {
                            rd,
                            mem: AMode::Label(label),
                        }
                        .emit(&[], sink, emit_info, state);
                    }
                    (AMode::Label(label), None, _) => {
                        // Get the current PC.
                        sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelHi20);
                        let inst = Inst::Auipc {
                            rd,
                            imm: Imm20::ZERO,
                        };
                        inst.emit_uncompressed(sink, emit_info, state, start_off);

                        // Emit an add to the address with a relocation.
                        // This later gets patched up with the correct offset.
                        sink.use_label_at_offset(sink.cur_offset(), label, LabelUse::PCRelLo12I);
                        Inst::AluRRImm12 {
                            alu_op: AluOPRRI::Addi,
                            rd,
                            rs: rd.to_reg(),
                            imm12: Imm12::ZERO,
                        }
                        .emit_uncompressed(sink, emit_info, state, start_off);
                    }
                    (amode, _, _) => {
                        unimplemented!("LoadAddr: {:?}", amode);
                    }
                }
            }

            &Inst::Select {
                ref dst,
                condition,
                ref x,
                ref y,
            } => {
                let label_true = sink.get_label();
                let label_false = sink.get_label();
                let label_end = sink.get_label();
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_true),
                    not_taken: CondBrTarget::Label(label_false),
                    kind: condition,
                }
                .emit(&[], sink, emit_info, state);
                sink.bind_label(label_true, &mut state.ctrl_plane);
                // here is the true
                // select the first value
                for i in gen_moves(dst.regs(), x.regs()) {
                    i.emit(&[], sink, emit_info, state);
                }
                Inst::gen_jump(label_end).emit(&[], sink, emit_info, state);

                sink.bind_label(label_false, &mut state.ctrl_plane);
                for i in gen_moves(dst.regs(), y.regs()) {
                    i.emit(&[], sink, emit_info, state);
                }

                sink.bind_label(label_end, &mut state.ctrl_plane);
            }
            &Inst::Jalr { rd, base, offset } => {
                sink.put4(enc_jalr(rd, base, offset));
            }
            &Inst::EBreak => {
                sink.put4(0x00100073);
            }
            &Inst::AtomicCas {
                offset,
                t0,
                dst,
                e,
                addr,
                v,
                ty,
            } => {
                //     # addr holds address of memory location
                //     # e holds expected value
                //     # v holds desired value
                //     # dst holds return value
                // cas:
                //     lr.w dst, (addr)       # Load original value.
                //     bne dst, e, fail       # Doesn’t match, so fail.
                //     sc.w t0, v, (addr)     # Try to update.
                //     bnez t0 , cas          # if store not ok,retry.
                // fail:
                let fail_label = sink.get_label();
                let cas_lebel = sink.get_label();
                sink.bind_label(cas_lebel, &mut state.ctrl_plane);
                Inst::Atomic {
                    op: AtomicOP::load_op(ty),
                    rd: dst,
                    addr,
                    src: zero_reg(),
                    amo: AMO::SeqCst,
                }
                .emit(&[], sink, emit_info, state);
                if ty.bits() < 32 {
                    AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                        .iter()
                        .for_each(|i| i.emit(&[], sink, emit_info, state));
                } else if ty.bits() == 32 {
                    Inst::Extend {
                        rd: dst,
                        rn: dst.to_reg(),
                        signed: false,
                        from_bits: 32,
                        to_bits: 64,
                    }
                    .emit(&[], sink, emit_info, state);
                }
                Inst::CondBr {
                    taken: CondBrTarget::Label(fail_label),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::NotEqual,
                        rs1: e,
                        rs2: dst.to_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);
                let store_value = if ty.bits() < 32 {
                    // reload value to t0.
                    Inst::Atomic {
                        op: AtomicOP::load_op(ty),
                        rd: t0,
                        addr,
                        src: zero_reg(),
                        amo: AMO::SeqCst,
                    }
                    .emit(&[], sink, emit_info, state);
                    // set reset part.
                    AtomicOP::merge(t0, writable_spilltmp_reg(), offset, v, ty)
                        .iter()
                        .for_each(|i| i.emit(&[], sink, emit_info, state));
                    t0.to_reg()
                } else {
                    v
                };
                Inst::Atomic {
                    op: AtomicOP::store_op(ty),
                    rd: t0,
                    addr,
                    src: store_value,
                    amo: AMO::SeqCst,
                }
                .emit(&[], sink, emit_info, state);
                // check is our value stored.
                Inst::CondBr {
                    taken: CondBrTarget::Label(cas_lebel),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::NotEqual,
                        rs1: t0.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);
                sink.bind_label(fail_label, &mut state.ctrl_plane);
            }
            &Inst::AtomicRmwLoop {
                offset,
                op,
                dst,
                ty,
                p,
                x,
                t0,
            } => {
                let retry = sink.get_label();
                sink.bind_label(retry, &mut state.ctrl_plane);
                // load old value.
                Inst::Atomic {
                    op: AtomicOP::load_op(ty),
                    rd: dst,
                    addr: p,
                    src: zero_reg(),
                    amo: AMO::SeqCst,
                }
                .emit(&[], sink, emit_info, state);
                //

                let store_value: Reg = match op {
                    crate::ir::AtomicRmwOp::Add
                    | crate::ir::AtomicRmwOp::Sub
                    | crate::ir::AtomicRmwOp::And
                    | crate::ir::AtomicRmwOp::Or
                    | crate::ir::AtomicRmwOp::Xor => {
                        AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                            .iter()
                            .for_each(|i| i.emit(&[], sink, emit_info, state));
                        Inst::AluRRR {
                            alu_op: match op {
                                crate::ir::AtomicRmwOp::Add => AluOPRRR::Add,
                                crate::ir::AtomicRmwOp::Sub => AluOPRRR::Sub,
                                crate::ir::AtomicRmwOp::And => AluOPRRR::And,
                                crate::ir::AtomicRmwOp::Or => AluOPRRR::Or,
                                crate::ir::AtomicRmwOp::Xor => AluOPRRR::Xor,
                                _ => unreachable!(),
                            },
                            rd: t0,
                            rs1: dst.to_reg(),
                            rs2: x,
                        }
                        .emit(&[], sink, emit_info, state);
                        Inst::Atomic {
                            op: AtomicOP::load_op(ty),
                            rd: writable_spilltmp_reg2(),
                            addr: p,
                            src: zero_reg(),
                            amo: AMO::SeqCst,
                        }
                        .emit(&[], sink, emit_info, state);
                        AtomicOP::merge(
                            writable_spilltmp_reg2(),
                            writable_spilltmp_reg(),
                            offset,
                            t0.to_reg(),
                            ty,
                        )
                        .iter()
                        .for_each(|i| i.emit(&[], sink, emit_info, state));
                        spilltmp_reg2()
                    }
                    crate::ir::AtomicRmwOp::Nand => {
                        if ty.bits() < 32 {
                            AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                                .iter()
                                .for_each(|i| i.emit(&[], sink, emit_info, state));
                        }
                        Inst::AluRRR {
                            alu_op: AluOPRRR::And,
                            rd: t0,
                            rs1: x,
                            rs2: dst.to_reg(),
                        }
                        .emit(&[], sink, emit_info, state);
                        Inst::construct_bit_not(t0, t0.to_reg()).emit(&[], sink, emit_info, state);
                        if ty.bits() < 32 {
                            Inst::Atomic {
                                op: AtomicOP::load_op(ty),
                                rd: writable_spilltmp_reg2(),
                                addr: p,
                                src: zero_reg(),
                                amo: AMO::SeqCst,
                            }
                            .emit(&[], sink, emit_info, state);
                            AtomicOP::merge(
                                writable_spilltmp_reg2(),
                                writable_spilltmp_reg(),
                                offset,
                                t0.to_reg(),
                                ty,
                            )
                            .iter()
                            .for_each(|i| i.emit(&[], sink, emit_info, state));
                            spilltmp_reg2()
                        } else {
                            t0.to_reg()
                        }
                    }

                    crate::ir::AtomicRmwOp::Umin
                    | crate::ir::AtomicRmwOp::Umax
                    | crate::ir::AtomicRmwOp::Smin
                    | crate::ir::AtomicRmwOp::Smax => {
                        let label_select_dst = sink.get_label();
                        let label_select_done = sink.get_label();
                        if op == crate::ir::AtomicRmwOp::Umin || op == crate::ir::AtomicRmwOp::Umax
                        {
                            AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                        } else {
                            AtomicOP::extract_sext(dst, offset, dst.to_reg(), ty)
                        }
                        .iter()
                        .for_each(|i| i.emit(&[], sink, emit_info, state));

                        Inst::CondBr {
                            taken: CondBrTarget::Label(label_select_dst),
                            not_taken: CondBrTarget::Fallthrough,
                            kind: IntegerCompare {
                                kind: match op {
                                    crate::ir::AtomicRmwOp::Umin => IntCC::UnsignedLessThan,
                                    crate::ir::AtomicRmwOp::Umax => IntCC::UnsignedGreaterThan,
                                    crate::ir::AtomicRmwOp::Smin => IntCC::SignedLessThan,
                                    crate::ir::AtomicRmwOp::Smax => IntCC::SignedGreaterThan,
                                    _ => unreachable!(),
                                },
                                rs1: dst.to_reg(),
                                rs2: x,
                            },
                        }
                        .emit(&[], sink, emit_info, state);
                        // here we select x.
                        Inst::gen_move(t0, x, I64).emit(&[], sink, emit_info, state);
                        Inst::gen_jump(label_select_done).emit(&[], sink, emit_info, state);
                        sink.bind_label(label_select_dst, &mut state.ctrl_plane);
                        Inst::gen_move(t0, dst.to_reg(), I64).emit(&[], sink, emit_info, state);
                        sink.bind_label(label_select_done, &mut state.ctrl_plane);
                        Inst::Atomic {
                            op: AtomicOP::load_op(ty),
                            rd: writable_spilltmp_reg2(),
                            addr: p,
                            src: zero_reg(),
                            amo: AMO::SeqCst,
                        }
                        .emit(&[], sink, emit_info, state);
                        AtomicOP::merge(
                            writable_spilltmp_reg2(),
                            writable_spilltmp_reg(),
                            offset,
                            t0.to_reg(),
                            ty,
                        )
                        .iter()
                        .for_each(|i| i.emit(&[], sink, emit_info, state));
                        spilltmp_reg2()
                    }
                    crate::ir::AtomicRmwOp::Xchg => {
                        AtomicOP::extract(dst, offset, dst.to_reg(), ty)
                            .iter()
                            .for_each(|i| i.emit(&[], sink, emit_info, state));
                        Inst::Atomic {
                            op: AtomicOP::load_op(ty),
                            rd: writable_spilltmp_reg2(),
                            addr: p,
                            src: zero_reg(),
                            amo: AMO::SeqCst,
                        }
                        .emit(&[], sink, emit_info, state);
                        AtomicOP::merge(
                            writable_spilltmp_reg2(),
                            writable_spilltmp_reg(),
                            offset,
                            x,
                            ty,
                        )
                        .iter()
                        .for_each(|i| i.emit(&[], sink, emit_info, state));
                        spilltmp_reg2()
                    }
                };

                Inst::Atomic {
                    op: AtomicOP::store_op(ty),
                    rd: t0,
                    addr: p,
                    src: store_value,
                    amo: AMO::SeqCst,
                }
                .emit(&[], sink, emit_info, state);

                // if store is not ok,retry.
                Inst::CondBr {
                    taken: CondBrTarget::Label(retry),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::NotEqual,
                        rs1: t0.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);
            }

            &Inst::LoadExtName {
                rd,
                ref name,
                offset,
            } => {
                if emit_info.shared_flag.is_pic() {
                    // Load a PC-relative address into a register.
                    // RISC-V does this slightly differently from other arches. We emit a relocation
                    // with a label, instead of the symbol itself.
                    //
                    // See: https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#pc-relative-symbol-addresses
                    //
                    // Emit the following code:
                    // label:
                    //   auipc rd, 0              # R_RISCV_GOT_HI20 (symbol_name)
                    //   ld    rd, rd, 0          # R_RISCV_PCREL_LO12_I (label)

                    // Create the lable that is going to be published to the final binary object.
                    let auipc_label = sink.get_label();
                    sink.bind_label(auipc_label, &mut state.ctrl_plane);

                    // Get the current PC.
                    sink.add_reloc(Reloc::RiscvGotHi20, &**name, 0);
                    Inst::Auipc {
                        rd: rd,
                        imm: Imm20::from_i32(0),
                    }
                    .emit_uncompressed(sink, emit_info, state, start_off);

                    // The `ld` here, points to the `auipc` label instead of directly to the symbol.
                    sink.add_reloc(Reloc::RiscvPCRelLo12I, &auipc_label, 0);
                    Inst::Load {
                        rd,
                        op: LoadOP::Ld,
                        flags: MemFlags::trusted(),
                        from: AMode::RegOffset(rd.to_reg(), 0, I64),
                    }
                    .emit_uncompressed(sink, emit_info, state, start_off);
                } else {
                    // In the non PIC sequence we relocate the absolute address into
                    // a prealocatted space, load it into a register and jump over it.
                    //
                    // Emit the following code:
                    //   ld rd, label_data
                    //   j label_end
                    // label_data:
                    //   <8 byte space>           # ABS8
                    // label_end:

                    let label_data = sink.get_label();
                    let label_end = sink.get_label();

                    // Load the value from a label
                    Inst::Load {
                        rd,
                        op: LoadOP::Ld,
                        flags: MemFlags::trusted(),
                        from: AMode::Label(label_data),
                    }
                    .emit(&[], sink, emit_info, state);

                    // Jump over the data
                    Inst::gen_jump(label_end).emit(&[], sink, emit_info, state);

                    sink.bind_label(label_data, &mut state.ctrl_plane);
                    sink.add_reloc(Reloc::Abs8, name.as_ref(), offset);
                    sink.put8(0);

                    sink.bind_label(label_end, &mut state.ctrl_plane);
                }
            }

            &Inst::ElfTlsGetAddr { rd, ref name } => {
                // RISC-V's TLS GD model is slightly different from other arches.
                //
                // We have a relocation (R_RISCV_TLS_GD_HI20) that loads the high 20 bits
                // of the address relative to the GOT entry. This relocation points to
                // the symbol as usual.
                //
                // However when loading the bottom 12bits of the address, we need to
                // use a label that points to the previous AUIPC instruction.
                //
                // label:
                //    auipc a0,0                    # R_RISCV_TLS_GD_HI20 (symbol)
                //    addi  a0,a0,0                 # R_RISCV_PCREL_LO12_I (label)
                //
                // https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc#global-dynamic

                // Create the lable that is going to be published to the final binary object.
                let auipc_label = sink.get_label();
                sink.bind_label(auipc_label, &mut state.ctrl_plane);

                // Get the current PC.
                sink.add_reloc(Reloc::RiscvTlsGdHi20, &**name, 0);
                Inst::Auipc {
                    rd: rd,
                    imm: Imm20::from_i32(0),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                // The `addi` here, points to the `auipc` label instead of directly to the symbol.
                sink.add_reloc(Reloc::RiscvPCRelLo12I, &auipc_label, 0);
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Addi,
                    rd: rd,
                    rs: rd.to_reg(),
                    imm12: Imm12::from_i16(0),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);

                Inst::Call {
                    info: Box::new(CallInfo {
                        dest: ExternalName::LibCall(LibCall::ElfTlsGetAddr),
                        uses: smallvec![],
                        defs: smallvec![],
                        opcode: crate::ir::Opcode::TlsValue,
                        caller_callconv: CallConv::SystemV,
                        callee_callconv: CallConv::SystemV,
                        callee_pop_size: 0,
                        clobbers: PRegSet::empty(),
                    }),
                }
                .emit_uncompressed(sink, emit_info, state, start_off);
            }

            &Inst::TrapIf {
                rs1,
                rs2,
                cc,
                trap_code,
            } => {
                let label_end = sink.get_label();
                let cond = IntegerCompare { kind: cc, rs1, rs2 };

                // Jump over the trap if we the condition is false.
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_end),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: cond.inverse(),
                }
                .emit(&[], sink, emit_info, state);
                Inst::Udf { trap_code }.emit(&[], sink, emit_info, state);

                sink.bind_label(label_end, &mut state.ctrl_plane);
            }
            &Inst::Udf { trap_code } => {
                sink.add_trap(trap_code);
                if let Some(s) = state.take_stack_map() {
                    sink.add_stack_map(
                        StackMapExtent::UpcomingBytes(Inst::TRAP_OPCODE.len() as u32),
                        s,
                    );
                }
                sink.put_data(Inst::TRAP_OPCODE);
            }
            &Inst::AtomicLoad { rd, ty, p } => {
                // emit the fence.
                Inst::Fence {
                    pred: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
                    succ: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
                }
                .emit(&[], sink, emit_info, state);
                // load.
                Inst::Load {
                    rd: rd,
                    op: LoadOP::from_type(ty),
                    flags: MemFlags::new(),
                    from: AMode::RegOffset(p, 0, ty),
                }
                .emit(&[], sink, emit_info, state);
                Inst::Fence {
                    pred: Inst::FENCE_REQ_R,
                    succ: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
                }
                .emit(&[], sink, emit_info, state);
            }
            &Inst::AtomicStore { src, ty, p } => {
                Inst::Fence {
                    pred: Inst::FENCE_REQ_R | Inst::FENCE_REQ_W,
                    succ: Inst::FENCE_REQ_W,
                }
                .emit(&[], sink, emit_info, state);
                Inst::Store {
                    to: AMode::RegOffset(p, 0, ty),
                    op: StoreOP::from_type(ty),
                    flags: MemFlags::new(),
                    src,
                }
                .emit(&[], sink, emit_info, state);
            }
            &Inst::FloatRound {
                op,
                rd,
                int_tmp,
                f_tmp,
                rs,
                ty,
            } => {
                // this code is port from glibc ceil floor ... implementation.
                let label_nan = sink.get_label();
                let label_x = sink.get_label();
                let label_jump_over = sink.get_label();
                // check if is nan.
                Inst::emit_not_nan(int_tmp, rs, ty).emit(&[], sink, emit_info, state);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_nan),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::Equal,
                        rs1: int_tmp.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);
                fn max_value_need_round(ty: Type) -> u64 {
                    match ty {
                        F32 => {
                            let x: u64 = 1 << f32::MANTISSA_DIGITS;
                            let x = x as f32;
                            let x = u32::from_le_bytes(x.to_le_bytes());
                            x as u64
                        }
                        F64 => {
                            let x: u64 = 1 << f64::MANTISSA_DIGITS;
                            let x = x as f64;
                            u64::from_le_bytes(x.to_le_bytes())
                        }
                        _ => unreachable!(),
                    }
                }
                // load max value need to round.
                if ty == F32 {
                    Inst::load_fp_constant32(f_tmp, max_value_need_round(ty) as u32, &mut |_| {
                        writable_spilltmp_reg()
                    })
                } else {
                    Inst::load_fp_constant64(f_tmp, max_value_need_round(ty), &mut |_| {
                        writable_spilltmp_reg()
                    })
                }
                .into_iter()
                .for_each(|i| i.emit(&[], sink, emit_info, state));

                // get abs value.
                Inst::emit_fabs(rd, rs, ty).emit(&[], sink, emit_info, state);

                // branch if f_tmp < rd
                Inst::FpuRRR {
                    frm: FRM::RTZ,
                    alu_op: if ty == F32 {
                        FpuOPRRR::FltS
                    } else {
                        FpuOPRRR::FltD
                    },
                    rd: int_tmp,
                    rs1: f_tmp.to_reg(),
                    rs2: rd.to_reg(),
                }
                .emit(&[], sink, emit_info, state);

                Inst::CondBr {
                    taken: CondBrTarget::Label(label_x),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::NotEqual,
                        rs1: int_tmp.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);

                //convert to int.
                Inst::FpuRR {
                    alu_op: FpuOPRR::float_convert_2_int_op(ty, true, I64),
                    frm: op.to_frm(),
                    rd: int_tmp,
                    rs: rs,
                }
                .emit(&[], sink, emit_info, state);
                //convert back.
                Inst::FpuRR {
                    alu_op: if ty == F32 {
                        FpuOPRR::FcvtSL
                    } else {
                        FpuOPRR::FcvtDL
                    },
                    frm: op.to_frm(),
                    rd,
                    rs: int_tmp.to_reg(),
                }
                .emit(&[], sink, emit_info, state);
                // copy sign.
                Inst::FpuRRR {
                    alu_op: if ty == F32 {
                        FpuOPRRR::FsgnjS
                    } else {
                        FpuOPRRR::FsgnjD
                    },
                    frm: FRM::RNE,
                    rd,
                    rs1: rd.to_reg(),
                    rs2: rs,
                }
                .emit(&[], sink, emit_info, state);
                // jump over.
                Inst::gen_jump(label_jump_over).emit(&[], sink, emit_info, state);
                // here is nan.
                sink.bind_label(label_nan, &mut state.ctrl_plane);
                Inst::FpuRRR {
                    alu_op: if ty == F32 {
                        FpuOPRRR::FaddS
                    } else {
                        FpuOPRRR::FaddD
                    },
                    frm: FRM::RNE,
                    rd: rd,
                    rs1: rs,
                    rs2: rs,
                }
                .emit(&[], sink, emit_info, state);
                Inst::gen_jump(label_jump_over).emit(&[], sink, emit_info, state);
                // here select origin x.
                sink.bind_label(label_x, &mut state.ctrl_plane);
                Inst::gen_move(rd, rs, ty).emit(&[], sink, emit_info, state);
                sink.bind_label(label_jump_over, &mut state.ctrl_plane);
            }

            &Inst::Popcnt {
                sum,
                tmp,
                step,
                rs,
                ty,
            } => {
                // load 0 to sum , init.
                Inst::gen_move(sum, zero_reg(), I64).emit(&[], sink, emit_info, state);
                // load
                Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16)).emit(
                    &[],
                    sink,
                    emit_info,
                    state,
                );
                //
                Inst::load_imm12(tmp, Imm12::ONE).emit(&[], sink, emit_info, state);
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Slli,
                    rd: tmp,
                    rs: tmp.to_reg(),
                    imm12: Imm12::from_i16((ty.bits() - 1) as i16),
                }
                .emit(&[], sink, emit_info, state);
                let label_done = sink.get_label();
                let label_loop = sink.get_label();
                sink.bind_label(label_loop, &mut state.ctrl_plane);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_done),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::SignedLessThanOrEqual,
                        rs1: step.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);
                // test and add sum.
                {
                    Inst::AluRRR {
                        alu_op: AluOPRRR::And,
                        rd: writable_spilltmp_reg2(),
                        rs1: tmp.to_reg(),
                        rs2: rs,
                    }
                    .emit(&[], sink, emit_info, state);
                    let label_over = sink.get_label();
                    Inst::CondBr {
                        taken: CondBrTarget::Label(label_over),
                        not_taken: CondBrTarget::Fallthrough,
                        kind: IntegerCompare {
                            kind: IntCC::Equal,
                            rs1: zero_reg(),
                            rs2: spilltmp_reg2(),
                        },
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: sum,
                        rs: sum.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(&[], sink, emit_info, state);
                    sink.bind_label(label_over, &mut state.ctrl_plane);
                }
                // set step and tmp.
                {
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: step,
                        rs: step.to_reg(),
                        imm12: Imm12::from_i16(-1),
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Srli,
                        rd: tmp,
                        rs: tmp.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::gen_jump(label_loop).emit(&[], sink, emit_info, state);
                }
                sink.bind_label(label_done, &mut state.ctrl_plane);
            }
            &Inst::Cltz {
                sum,
                tmp,
                step,
                rs,
                leading,
                ty,
            } => {
                // load 0 to sum , init.
                Inst::gen_move(sum, zero_reg(), I64).emit(&[], sink, emit_info, state);
                // load
                Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16)).emit(
                    &[],
                    sink,
                    emit_info,
                    state,
                );
                //
                Inst::load_imm12(tmp, Imm12::ONE).emit(&[], sink, emit_info, state);
                if leading {
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Slli,
                        rd: tmp,
                        rs: tmp.to_reg(),
                        imm12: Imm12::from_i16((ty.bits() - 1) as i16),
                    }
                    .emit(&[], sink, emit_info, state);
                }
                let label_done = sink.get_label();
                let label_loop = sink.get_label();
                sink.bind_label(label_loop, &mut state.ctrl_plane);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_done),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::SignedLessThanOrEqual,
                        rs1: step.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);
                // test and add sum.
                {
                    Inst::AluRRR {
                        alu_op: AluOPRRR::And,
                        rd: writable_spilltmp_reg2(),
                        rs1: tmp.to_reg(),
                        rs2: rs,
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::CondBr {
                        taken: CondBrTarget::Label(label_done),
                        not_taken: CondBrTarget::Fallthrough,
                        kind: IntegerCompare {
                            kind: IntCC::NotEqual,
                            rs1: zero_reg(),
                            rs2: spilltmp_reg2(),
                        },
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: sum,
                        rs: sum.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(&[], sink, emit_info, state);
                }
                // set step and tmp.
                {
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: step,
                        rs: step.to_reg(),
                        imm12: Imm12::from_i16(-1),
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: if leading {
                            AluOPRRI::Srli
                        } else {
                            AluOPRRI::Slli
                        },
                        rd: tmp,
                        rs: tmp.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::gen_jump(label_loop).emit(&[], sink, emit_info, state);
                }
                sink.bind_label(label_done, &mut state.ctrl_plane);
            }
            &Inst::Brev8 {
                rs,
                ty,
                step,
                tmp,
                tmp2,
                rd,
            } => {
                Inst::gen_move(rd, zero_reg(), I64).emit(&[], sink, emit_info, state);
                Inst::load_imm12(step, Imm12::from_i16(ty.bits() as i16)).emit(
                    &[],
                    sink,
                    emit_info,
                    state,
                );
                //
                Inst::load_imm12(tmp, Imm12::ONE).emit(&[], sink, emit_info, state);
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Slli,
                    rd: tmp,
                    rs: tmp.to_reg(),
                    imm12: Imm12::from_i16((ty.bits() - 1) as i16),
                }
                .emit(&[], sink, emit_info, state);
                Inst::load_imm12(tmp2, Imm12::ONE).emit(&[], sink, emit_info, state);
                Inst::AluRRImm12 {
                    alu_op: AluOPRRI::Slli,
                    rd: tmp2,
                    rs: tmp2.to_reg(),
                    imm12: Imm12::from_i16((ty.bits() - 8) as i16),
                }
                .emit(&[], sink, emit_info, state);

                let label_done = sink.get_label();
                let label_loop = sink.get_label();
                sink.bind_label(label_loop, &mut state.ctrl_plane);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_done),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::SignedLessThanOrEqual,
                        rs1: step.to_reg(),
                        rs2: zero_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);
                // test and set bit.
                {
                    Inst::AluRRR {
                        alu_op: AluOPRRR::And,
                        rd: writable_spilltmp_reg2(),
                        rs1: tmp.to_reg(),
                        rs2: rs,
                    }
                    .emit(&[], sink, emit_info, state);
                    let label_over = sink.get_label();
                    Inst::CondBr {
                        taken: CondBrTarget::Label(label_over),
                        not_taken: CondBrTarget::Fallthrough,
                        kind: IntegerCompare {
                            kind: IntCC::Equal,
                            rs1: zero_reg(),
                            rs2: spilltmp_reg2(),
                        },
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::AluRRR {
                        alu_op: AluOPRRR::Or,
                        rd: rd,
                        rs1: rd.to_reg(),
                        rs2: tmp2.to_reg(),
                    }
                    .emit(&[], sink, emit_info, state);
                    sink.bind_label(label_over, &mut state.ctrl_plane);
                }
                // set step and tmp.
                {
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Addi,
                        rd: step,
                        rs: step.to_reg(),
                        imm12: Imm12::from_i16(-1),
                    }
                    .emit(&[], sink, emit_info, state);
                    Inst::AluRRImm12 {
                        alu_op: AluOPRRI::Srli,
                        rd: tmp,
                        rs: tmp.to_reg(),
                        imm12: Imm12::ONE,
                    }
                    .emit(&[], sink, emit_info, state);
                    {
                        // reset tmp2
                        // if (step %=8 == 0) then tmp2 = tmp2 >> 15
                        // if (step %=8 != 0) then tmp2 = tmp2 << 1
                        let label_over = sink.get_label();
                        let label_sll_1 = sink.get_label();
                        Inst::load_imm12(writable_spilltmp_reg2(), Imm12::from_i16(8)).emit(
                            &[],
                            sink,
                            emit_info,
                            state,
                        );
                        Inst::AluRRR {
                            alu_op: AluOPRRR::Rem,
                            rd: writable_spilltmp_reg2(),
                            rs1: step.to_reg(),
                            rs2: spilltmp_reg2(),
                        }
                        .emit(&[], sink, emit_info, state);
                        Inst::CondBr {
                            taken: CondBrTarget::Label(label_sll_1),
                            not_taken: CondBrTarget::Fallthrough,
                            kind: IntegerCompare {
                                kind: IntCC::NotEqual,
                                rs1: spilltmp_reg2(),
                                rs2: zero_reg(),
                            },
                        }
                        .emit(&[], sink, emit_info, state);
                        Inst::AluRRImm12 {
                            alu_op: AluOPRRI::Srli,
                            rd: tmp2,
                            rs: tmp2.to_reg(),
                            imm12: Imm12::from_i16(15),
                        }
                        .emit(&[], sink, emit_info, state);
                        Inst::gen_jump(label_over).emit(&[], sink, emit_info, state);
                        sink.bind_label(label_sll_1, &mut state.ctrl_plane);
                        Inst::AluRRImm12 {
                            alu_op: AluOPRRI::Slli,
                            rd: tmp2,
                            rs: tmp2.to_reg(),
                            imm12: Imm12::ONE,
                        }
                        .emit(&[], sink, emit_info, state);
                        sink.bind_label(label_over, &mut state.ctrl_plane);
                    }
                    Inst::gen_jump(label_loop).emit(&[], sink, emit_info, state);
                }
                sink.bind_label(label_done, &mut state.ctrl_plane);
            }
            &Inst::StackProbeLoop {
                guard_size,
                probe_count,
                tmp: guard_size_tmp,
            } => {
                let step = writable_spilltmp_reg();
                Inst::load_constant_u64(step, (guard_size as u64) * (probe_count as u64))
                    .iter()
                    .for_each(|i| i.emit(&[], sink, emit_info, state));
                Inst::load_constant_u64(guard_size_tmp, guard_size as u64)
                    .iter()
                    .for_each(|i| i.emit(&[], sink, emit_info, state));

                let loop_start = sink.get_label();
                let label_done = sink.get_label();
                sink.bind_label(loop_start, &mut state.ctrl_plane);
                Inst::CondBr {
                    taken: CondBrTarget::Label(label_done),
                    not_taken: CondBrTarget::Fallthrough,
                    kind: IntegerCompare {
                        kind: IntCC::UnsignedLessThanOrEqual,
                        rs1: step.to_reg(),
                        rs2: guard_size_tmp.to_reg(),
                    },
                }
                .emit(&[], sink, emit_info, state);
                // compute address.
                Inst::AluRRR {
                    alu_op: AluOPRRR::Sub,
                    rd: writable_spilltmp_reg2(),
                    rs1: stack_reg(),
                    rs2: step.to_reg(),
                }
                .emit(&[], sink, emit_info, state);
                Inst::Store {
                    to: AMode::RegOffset(spilltmp_reg2(), 0, I8),
                    op: StoreOP::Sb,
                    flags: MemFlags::new(),
                    src: zero_reg(),
                }
                .emit(&[], sink, emit_info, state);
                // reset step.
                Inst::AluRRR {
                    alu_op: AluOPRRR::Sub,
                    rd: step,
                    rs1: step.to_reg(),
                    rs2: guard_size_tmp.to_reg(),
                }
                .emit(&[], sink, emit_info, state);
                Inst::gen_jump(loop_start).emit(&[], sink, emit_info, state);
                sink.bind_label(label_done, &mut state.ctrl_plane);
            }
            &Inst::VecAluRRRImm5 {
                op,
                vd,
                vd_src,
                imm,
                vs2,
                ref mask,
                ..
            } => {
                debug_assert_eq!(vd.to_reg(), vd_src);

                sink.put4(encode_valu_rrr_imm(op, vd, imm, vs2, *mask));
            }
            &Inst::VecAluRRRR {
                op,
                vd,
                vd_src,
                vs1,
                vs2,
                ref mask,
                ..
            } => {
                debug_assert_eq!(vd.to_reg(), vd_src);

                sink.put4(encode_valu_rrrr(op, vd, vs2, vs1, *mask));
            }
            &Inst::VecAluRRR {
                op,
                vd,
                vs1,
                vs2,
                ref mask,
                ..
            } => {
                sink.put4(encode_valu(op, vd, vs1, vs2, *mask));
            }
            &Inst::VecAluRRImm5 {
                op,
                vd,
                imm,
                vs2,
                ref mask,
                ..
            } => {
                sink.put4(encode_valu_rr_imm(op, vd, imm, vs2, *mask));
            }
            &Inst::VecAluRR {
                op,
                vd,
                vs,
                ref mask,
                ..
            } => {
                sink.put4(encode_valu_rr(op, vd, vs, *mask));
            }
            &Inst::VecAluRImm5 {
                op,
                vd,
                imm,
                ref mask,
                ..
            } => {
                sink.put4(encode_valu_r_imm(op, vd, imm, *mask));
            }
            &Inst::VecSetState { rd, ref vstate } => {
                sink.put4(encode_vcfg_imm(
                    0x57,
                    rd.to_reg(),
                    vstate.avl.unwrap_static(),
                    &vstate.vtype,
                ));

                // Update the current vector emit state.
                state.vstate = EmitVState::Known(vstate.clone());
            }

            &Inst::VecLoad {
                eew,
                to,
                ref from,
                ref mask,
                flags,
                ..
            } => {
                // Vector Loads don't support immediate offsets, so we need to load it into a register.
                let addr = match from {
                    VecAMode::UnitStride { base } => {
                        let base_reg = base.get_base_register();
                        let offset = base.get_offset_with_state(state);

                        // Reg+0 Offset can be directly encoded
                        if let (Some(base_reg), 0) = (base_reg, offset) {
                            base_reg
                        } else {
                            // Otherwise load the address it into a reg and load from it.
                            let tmp = writable_spilltmp_reg();
                            Inst::LoadAddr {
                                rd: tmp,
                                mem: base.clone(),
                            }
                            .emit(&[], sink, emit_info, state);
                            tmp.to_reg()
                        }
                    }
                };

                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }

                sink.put4(encode_vmem_load(
                    0x07,
                    to.to_reg(),
                    eew,
                    addr,
                    from.lumop(),
                    *mask,
                    from.mop(),
                    from.nf(),
                ));
            }

            &Inst::VecStore {
                eew,
                ref to,
                from,
                ref mask,
                flags,
                ..
            } => {
                // Vector Stores don't support immediate offsets, so we need to load it into a register.
                let addr = match to {
                    VecAMode::UnitStride { base } => {
                        let base_reg = base.get_base_register();
                        let offset = base.get_offset_with_state(state);

                        // Reg+0 Offset can be directly encoded
                        if let (Some(base_reg), 0) = (base_reg, offset) {
                            base_reg
                        } else {
                            // Otherwise load the address it into a reg and load from it.
                            let tmp = writable_spilltmp_reg();
                            Inst::LoadAddr {
                                rd: tmp,
                                mem: base.clone(),
                            }
                            .emit(&[], sink, emit_info, state);
                            tmp.to_reg()
                        }
                    }
                };

                let srcloc = state.cur_srcloc();
                if !srcloc.is_default() && !flags.notrap() {
                    // Register the offset at which the actual load instruction starts.
                    sink.add_trap(TrapCode::HeapOutOfBounds);
                }

                sink.put4(encode_vmem_store(
                    0x27,
                    from,
                    eew,
                    addr,
                    to.sumop(),
                    *mask,
                    to.mop(),
                    to.nf(),
                ));
            }
        };
    }

    fn allocate(self, allocs: &mut AllocationConsumer) -> Self {
        fn alloc_value_regs(
            orgin: &ValueRegs<Reg>,
            alloc: &mut AllocationConsumer,
        ) -> ValueRegs<Reg> {
            match orgin.regs().len() {
                1 => ValueRegs::one(alloc.next(orgin.regs()[0])),
                2 => ValueRegs::two(alloc.next(orgin.regs()[0]), alloc.next(orgin.regs()[1])),
                _ => unreachable!(),
            }
        }

        fn alloc_writable_value_regs(
            origin: &ValueRegs<Writable<Reg>>,
            alloc: &mut AllocationConsumer,
        ) -> ValueRegs<Writable<Reg>> {
            alloc_value_regs(&origin.map(|r| r.to_reg()), alloc).map(Writable::from_reg)
        }

        match self {
            Inst::Nop0 => self,
            Inst::Nop4 => self,
            Inst::RawData { .. } => self,
            Inst::Lui { rd, imm } => Inst::Lui {
                rd: allocs.next_writable(rd),
                imm,
            },
            Inst::LoadInlineConst { rd, ty, imm } => Inst::LoadInlineConst {
                rd: allocs.next_writable(rd),
                ty,
                imm,
            },
            Inst::FpuRR {
                frm,
                alu_op,
                rd,
                rs,
            } => Inst::FpuRR {
                rs: allocs.next(rs),
                rd: allocs.next_writable(rd),
                frm,
                alu_op,
            },
            Inst::FpuRRRR {
                alu_op,
                rd,
                rs1,
                rs2,
                rs3,
                frm,
            } => Inst::FpuRRRR {
                rs1: allocs.next(rs1),
                rs2: allocs.next(rs2),
                rs3: allocs.next(rs3),
                rd: allocs.next_writable(rd),
                alu_op,
                frm,
            },
            Inst::FpuRRR {
                alu_op,
                frm,
                rd,
                rs1,
                rs2,
            } => Inst::FpuRRR {
                alu_op,
                frm,
                rs1: allocs.next(rs1),
                rs2: allocs.next(rs2),
                rd: allocs.next_writable(rd),
            },
            Inst::Unwind { .. } => self,
            Inst::DummyUse { reg } => Inst::DummyUse {
                reg: allocs.next(reg),
            },
            Inst::AluRRR {
                alu_op,
                rd,
                rs1,
                rs2,
            } => Inst::AluRRR {
                alu_op,
                rs1: allocs.next(rs1),
                rs2: allocs.next(rs2),
                rd: allocs.next_writable(rd),
            },
            Inst::AluRRImm12 {
                alu_op,
                rd,
                rs,
                imm12,
            } => Inst::AluRRImm12 {
                alu_op,
                rs: allocs.next(rs),
                rd: allocs.next_writable(rd),
                imm12,
            },
            Inst::CsrReg { op, rd, rs, csr } => Inst::CsrReg {
                op,
                rs: allocs.next(rs),
                rd: allocs.next_writable(rd),
                csr,
            },
            Inst::CsrImm { op, rd, csr, imm } => Inst::CsrImm {
                op,
                rd: allocs.next_writable(rd),
                csr,
                imm,
            },
            Inst::Load {
                rd,
                op,
                from,
                flags,
            } => Inst::Load {
                from: from.clone().with_allocs(allocs),
                rd: allocs.next_writable(rd),
                op,
                flags,
            },
            Inst::Store { op, src, flags, to } => Inst::Store {
                op,
                flags,
                to: to.clone().with_allocs(allocs),
                src: allocs.next(src),
            },

            Inst::Args { .. } => self,
            Inst::Rets { .. } => self,
            Inst::Ret { .. } => self,

            Inst::Extend {
                rd,
                rn,
                signed,
                from_bits,
                to_bits,
            } => Inst::Extend {
                rn: allocs.next(rn),
                rd: allocs.next_writable(rd),
                signed,
                from_bits,
                to_bits,
            },

            Inst::Call { .. } => self,
            Inst::CallInd { mut info } => {
                info.rn = allocs.next(info.rn);
                Inst::CallInd { info }
            }

            Inst::ReturnCall { callee, info } => {
                for u in &info.uses {
                    let _ = allocs.next(u.vreg);
                }

                Inst::ReturnCall { callee, info }
            }

            Inst::ReturnCallInd { callee, info } => {
                let callee = allocs.next(callee);

                for u in &info.uses {
                    let _ = allocs.next(u.vreg);
                }

                Inst::ReturnCallInd { callee, info }
            }

            Inst::Jal { .. } => self,

            Inst::CondBr {
                taken,
                not_taken,
                mut kind,
            } => {
                kind.rs1 = allocs.next(kind.rs1);
                kind.rs2 = allocs.next(kind.rs2);
                Inst::CondBr {
                    taken,
                    not_taken,
                    kind,
                }
            }

            Inst::Mov { rd, rm, ty } => Inst::Mov {
                ty,
                rm: allocs.next(rm),
                rd: allocs.next_writable(rd),
            },

            Inst::MovFromPReg { rd, rm } => {
                debug_assert!([px_reg(2), px_reg(8)].contains(&rm));
                let rd = allocs.next_writable(rd);
                Inst::MovFromPReg { rd, rm }
            }

            Inst::BrTable {
                index,
                tmp1,
                tmp2,
                targets,
            } => Inst::BrTable {
                index: allocs.next(index),
                tmp1: allocs.next_writable(tmp1),
                tmp2: allocs.next_writable(tmp2),
                targets,
            },

            Inst::VirtualSPOffsetAdj { .. } => self,
            Inst::Atomic {
                op,
                rd,
                addr,
                src,
                amo,
            } => Inst::Atomic {
                op,
                amo,
                addr: allocs.next(addr),
                src: allocs.next(src),
                rd: allocs.next_writable(rd),
            },
            Inst::Fence { .. } => self,
            Inst::Auipc { rd, imm } => Inst::Auipc {
                rd: allocs.next_writable(rd),
                imm,
            },

            Inst::LoadAddr { rd, mem } => Inst::LoadAddr {
                mem: mem.with_allocs(allocs),
                rd: allocs.next_writable(rd),
            },

            Inst::Select {
                ref dst,
                condition,
                ref x,
                ref y,
            } => {
                let mut condition: IntegerCompare = condition.clone();
                condition.rs1 = allocs.next(condition.rs1);
                condition.rs2 = allocs.next(condition.rs2);
                let x = alloc_value_regs(x, allocs);
                let y = alloc_value_regs(y, allocs);
                let dst = alloc_writable_value_regs(dst, allocs);

                Inst::Select {
                    dst,
                    condition,
                    x,
                    y,
                }
            }
            Inst::Jalr { rd, base, offset } => {
                // For some reason this does not use base?
                debug_assert!(base.is_real());
                Inst::Jalr {
                    rd: allocs.next_writable(rd),
                    base,
                    offset,
                }
            }

            Inst::EBreak => self,

            Inst::AtomicCas {
                offset,
                t0,
                dst,
                e,
                addr,
                v,
                ty,
            } => Inst::AtomicCas {
                ty,
                offset: allocs.next(offset),
                e: allocs.next(e),
                addr: allocs.next(addr),
                v: allocs.next(v),
                t0: allocs.next_writable(t0),
                dst: allocs.next_writable(dst),
            },

            Inst::AtomicRmwLoop {
                offset,
                op,
                dst,
                ty,
                p,
                x,
                t0,
            } => Inst::AtomicRmwLoop {
                op,
                ty,
                offset: allocs.next(offset),
                p: allocs.next(p),
                x: allocs.next(x),
                t0: allocs.next_writable(t0),
                dst: allocs.next_writable(dst),
            },

            Inst::LoadExtName { rd, name, offset } => Inst::LoadExtName {
                rd: allocs.next_writable(rd),
                name,
                offset,
            },

            Inst::ElfTlsGetAddr { rd, name } => {
                let rd = allocs.next_writable(rd);
                debug_assert_eq!(a0(), rd.to_reg());
                Inst::ElfTlsGetAddr { rd, name }
            }

            Inst::TrapIf {
                rs1,
                rs2,
                cc,
                trap_code,
            } => Inst::TrapIf {
                rs1: allocs.next(rs1),
                rs2: allocs.next(rs2),
                cc,
                trap_code,
            },

            Inst::Udf { .. } => self,

            Inst::AtomicLoad { rd, ty, p } => Inst::AtomicLoad {
                ty,
                p: allocs.next(p),
                rd: allocs.next_writable(rd),
            },

            Inst::AtomicStore { src, ty, p } => Inst::AtomicStore {
                ty,
                src: allocs.next(src),
                p: allocs.next(p),
            },

            Inst::FloatRound {
                op,
                rd,
                int_tmp,
                f_tmp,
                rs,
                ty,
            } => Inst::FloatRound {
                op,
                ty,
                rs: allocs.next(rs),
                int_tmp: allocs.next_writable(int_tmp),
                f_tmp: allocs.next_writable(f_tmp),
                rd: allocs.next_writable(rd),
            },

            Inst::Popcnt {
                sum,
                tmp,
                step,
                rs,
                ty,
            } => Inst::Popcnt {
                rs: allocs.next(rs),
                tmp: allocs.next_writable(tmp),
                step: allocs.next_writable(step),
                sum: allocs.next_writable(sum),
                ty,
            },
            Inst::Cltz {
                sum,
                tmp,
                step,
                rs,
                leading,
                ty,
            } => Inst::Cltz {
                rs: allocs.next(rs),
                tmp: allocs.next_writable(tmp),
                step: allocs.next_writable(step),
                sum: allocs.next_writable(sum),
                leading,
                ty,
            },

            Inst::Brev8 {
                rs,
                ty,
                step,
                tmp,
                tmp2,
                rd,
            } => Inst::Brev8 {
                rs: allocs.next(rs),
                step: allocs.next_writable(step),
                tmp: allocs.next_writable(tmp),
                tmp2: allocs.next_writable(tmp2),
                rd: allocs.next_writable(rd),
                ty,
            },

            Inst::StackProbeLoop { .. } => self,

            Inst::VecAluRRRImm5 {
                op,
                vd,
                vd_src,
                imm,
                vs2,
                mask,
                vstate,
            } => Inst::VecAluRRRImm5 {
                op,
                vs2: allocs.next(vs2),
                vd_src: allocs.next(vd_src),
                vd: allocs.next_writable(vd),
                mask: mask.with_allocs(allocs),
                imm,
                vstate,
            },

            Inst::VecAluRRRR {
                op,
                vd,
                vd_src,
                vs1,
                vs2,
                mask,
                vstate,
            } => Inst::VecAluRRRR {
                op,
                vs1: allocs.next(vs1),
                vs2: allocs.next(vs2),
                vd_src: allocs.next(vd_src),
                vd: allocs.next_writable(vd),
                mask: mask.with_allocs(allocs),
                vstate,
            },

            Inst::VecAluRRR {
                op,
                vd,
                vs1,
                vs2,
                mask,
                vstate,
            } => Inst::VecAluRRR {
                op,
                vs1: allocs.next(vs1),
                vs2: allocs.next(vs2),
                vd: allocs.next_writable(vd),
                mask: mask.with_allocs(allocs),
                vstate,
            },

            Inst::VecAluRRImm5 {
                op,
                vd,
                imm,
                vs2,
                mask,
                vstate,
            } => Inst::VecAluRRImm5 {
                op,
                imm,
                vs2: allocs.next(vs2),
                vd: allocs.next_writable(vd),
                mask: mask.with_allocs(allocs),
                vstate,
            },

            Inst::VecAluRR {
                op,
                vd,
                vs,
                mask,
                vstate,
            } => Inst::VecAluRR {
                op,
                vs: allocs.next(vs),
                vd: allocs.next_writable(vd),
                mask: mask.with_allocs(allocs),
                vstate,
            },

            Inst::VecAluRImm5 {
                op,
                vd,
                imm,
                mask,
                vstate,
            } => Inst::VecAluRImm5 {
                vd: allocs.next_writable(vd),
                mask: mask.with_allocs(allocs),
                op,
                imm,
                vstate,
            },

            Inst::VecSetState { rd, vstate } => Inst::VecSetState {
                rd: allocs.next_writable(rd),
                vstate,
            },

            Inst::VecLoad {
                eew,
                to,
                from,
                mask,
                flags,
                vstate,
            } => Inst::VecLoad {
                eew,
                from: from.clone().with_allocs(allocs),
                to: allocs.next_writable(to),
                mask: mask.with_allocs(allocs),
                flags,
                vstate,
            },

            Inst::VecStore {
                eew,
                to,
                from,
                mask,
                flags,
                vstate,
            } => Inst::VecStore {
                eew,
                to: to.clone().with_allocs(allocs),
                from: allocs.next(from),
                mask: mask.with_allocs(allocs),
                flags,
                vstate,
            },
        }
    }
}

fn emit_return_call_common_sequence(
    sink: &mut MachBuffer<Inst>,
    emit_info: &EmitInfo,
    state: &mut EmitState,
    new_stack_arg_size: u32,
    old_stack_arg_size: u32,
) {
    // We are emitting a dynamic number of instructions and might need an
    // island. We emit four instructions regardless of how many stack arguments
    // we have, up to two instructions for the actual call, and then two
    // instructions per word of stack argument space.
    let new_stack_words = new_stack_arg_size / 8;
    let insts = 4 + 2 + 2 * new_stack_words;
    let space_needed = insts * u32::try_from(Inst::UNCOMPRESSED_INSTRUCTION_SIZE).unwrap();
    if sink.island_needed(space_needed) {
        let jump_around_label = sink.get_label();
        Inst::gen_jump(jump_around_label).emit(&[], sink, emit_info, state);
        sink.emit_island(space_needed + 4, &mut state.ctrl_plane);
        sink.bind_label(jump_around_label, &mut state.ctrl_plane);
    }

    // Copy the new frame on top of our current frame.
    //
    // The current stack layout is the following:
    //
    //            | ...                 |
    //            +---------------------+
    //            | ...                 |
    //            | stack arguments     |
    //            | ...                 |
    //    current | return address      |
    //    frame   | old FP              | <-- FP
    //            | ...                 |
    //            | old stack slots     |
    //            | ...                 |
    //            +---------------------+
    //            | ...                 |
    //    new     | new stack arguments |
    //    frame   | ...                 | <-- SP
    //            +---------------------+
    //
    // We need to restore the old FP, restore the return address from the stack
    // to the link register, copy the new stack arguments over the old stack
    // arguments, adjust SP to point to the new stack arguments, and then jump
    // to the callee (which will push the old FP and RA again). Note that the
    // actual jump happens outside this helper function.

    assert_eq!(
        new_stack_arg_size % 8,
        0,
        "size of new stack arguments must be 8-byte aligned"
    );

    // The delta from our frame pointer to the (eventual) stack pointer value
    // when we jump to the tail callee. This is the difference in size of stack
    // arguments as well as accounting for the two words we pushed onto the
    // stack upon entry to this function (the return address and old frame
    // pointer).
    let fp_to_callee_sp = i64::from(old_stack_arg_size) - i64::from(new_stack_arg_size) + 16;

    let tmp1 = regs::writable_spilltmp_reg();
    let tmp2 = regs::writable_spilltmp_reg2();

    // Restore the return address to the link register, and load the old FP into
    // a temporary register.
    //
    // We can't put the old FP into the FP register until after we copy the
    // stack arguments into place, since that uses address modes that are
    // relative to our current FP.
    //
    // Note that the FP is saved in the function prologue for all non-leaf
    // functions, even when `preserve_frame_pointers=false`. Note also that
    // `return_call` instructions make it so that a function is considered
    // non-leaf. Therefore we always have an FP to restore here.

    Inst::gen_load(
        writable_link_reg(),
        AMode::FPOffset(8, I64),
        I64,
        MemFlags::trusted(),
    )
    .emit(&[], sink, emit_info, state);
    Inst::gen_load(tmp1, AMode::FPOffset(0, I64), I64, MemFlags::trusted()).emit(
        &[],
        sink,
        emit_info,
        state,
    );

    // Copy the new stack arguments over the old stack arguments.
    for i in (0..new_stack_words).rev() {
        // Load the `i`th new stack argument word from the temporary stack
        // space.
        Inst::gen_load(
            tmp2,
            AMode::SPOffset(i64::from(i * 8), types::I64),
            types::I64,
            ir::MemFlags::trusted(),
        )
        .emit(&[], sink, emit_info, state);

        // Store it to its final destination on the stack, overwriting our
        // current frame.
        Inst::gen_store(
            AMode::FPOffset(fp_to_callee_sp + i64::from(i * 8), types::I64),
            tmp2.to_reg(),
            types::I64,
            ir::MemFlags::trusted(),
        )
        .emit(&[], sink, emit_info, state);
    }

    // Initialize the SP for the tail callee, deallocating the temporary stack
    // argument space and our current frame at the same time.
    Inst::AluRRImm12 {
        alu_op: AluOPRRI::Addi,
        rd: regs::writable_stack_reg(),
        rs: regs::fp_reg(),
        imm12: Imm12::maybe_from_i64(fp_to_callee_sp).unwrap(),
    }
    .emit(&[], sink, emit_info, state);

    // Move the old FP value from the temporary into the FP register.
    Inst::Mov {
        ty: types::I64,
        rd: regs::writable_fp_reg(),
        rm: tmp1.to_reg(),
    }
    .emit(&[], sink, emit_info, state);

    state.virtual_sp_offset -= i64::from(new_stack_arg_size);
    trace!(
        "return_call[_ind] adjusts virtual sp offset by {} -> {}",
        new_stack_arg_size,
        state.virtual_sp_offset
    );
}