diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs index 671a17b19448..3698a449c597 100644 --- a/cranelift/codegen/src/isa/aarch64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -1069,7 +1069,11 @@ impl ABIMachineSpec for AArch64MachineDeps { insts } - fn get_number_of_spillslots_for_value(rc: RegClass, vector_size: u32) -> u32 { + fn get_number_of_spillslots_for_value( + rc: RegClass, + vector_size: u32, + _isa_flags: &Self::F, + ) -> u32 { assert_eq!(vector_size % 8, 0); // We allocate in terms of 8-byte slots. match rc { diff --git a/cranelift/codegen/src/isa/riscv64/abi.rs b/cranelift/codegen/src/isa/riscv64/abi.rs index 1dc9a6d9fc4e..7a1f1f9b0be6 100644 --- a/cranelift/codegen/src/isa/riscv64/abi.rs +++ b/cranelift/codegen/src/isa/riscv64/abi.rs @@ -43,6 +43,40 @@ pub struct Riscv64MachineDeps; impl IsaFlags for RiscvFlags {} +impl RiscvFlags { + pub(crate) fn min_vec_reg_size(&self) -> u64 { + let entries = [ + (self.has_zvl65536b(), 65536), + (self.has_zvl32768b(), 32768), + (self.has_zvl16384b(), 16384), + (self.has_zvl8192b(), 8192), + (self.has_zvl4096b(), 4096), + (self.has_zvl2048b(), 2048), + (self.has_zvl1024b(), 1024), + (self.has_zvl512b(), 512), + (self.has_zvl256b(), 256), + // In order to claim the Application Profile V extension, a minimum + // register size of 128 is required. i.e. V implies Zvl128b. + (self.has_v(), 128), + (self.has_zvl128b(), 128), + (self.has_zvl64b(), 64), + (self.has_zvl32b(), 32), + ]; + + for (has_flag, size) in entries.into_iter() { + if !has_flag { + continue; + } + + // Due to a limitation in regalloc2, we can't support types + // larger than 1024 bytes. So limit that here. + return std::cmp::min(size, 1024); + } + + return 0; + } +} + impl ABIMachineSpec for Riscv64MachineDeps { type I = Inst; type F = RiscvFlags; @@ -415,9 +449,9 @@ impl ABIMachineSpec for Riscv64MachineDeps { for reg in clobbered_callee_saves { let r_reg = reg.to_reg(); let ty = match r_reg.class() { - regalloc2::RegClass::Int => I64, - regalloc2::RegClass::Float => F64, - RegClass::Vector => unreachable!(), + RegClass::Int => I64, + RegClass::Float => F64, + RegClass::Vector => unimplemented!("Vector Clobber Saves"), }; if flags.unwind_info() { insts.push(Inst::Unwind { @@ -462,9 +496,9 @@ impl ABIMachineSpec for Riscv64MachineDeps { for reg in &clobbered_callee_saves { let rreg = reg.to_reg(); let ty = match rreg.class() { - regalloc2::RegClass::Int => I64, - regalloc2::RegClass::Float => F64, - RegClass::Vector => unreachable!(), + RegClass::Int => I64, + RegClass::Float => F64, + RegClass::Vector => unimplemented!("Vector Clobber Restores"), }; insts.push(Self::gen_load_stack( StackAMode::SPOffset(-cur_offset, ty), @@ -572,12 +606,16 @@ impl ABIMachineSpec for Riscv64MachineDeps { insts } - fn get_number_of_spillslots_for_value(rc: RegClass, _target_vector_bytes: u32) -> u32 { + fn get_number_of_spillslots_for_value( + rc: RegClass, + _target_vector_bytes: u32, + isa_flags: &RiscvFlags, + ) -> u32 { // We allocate in terms of 8-byte slots. 
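        // For vectors, the slot count comes from `min_vec_reg_size()`, which walks
        // the Zvl*/V flags from largest to smallest and is capped at 1024 due to the
        // regalloc2 limitation noted above. For instance, with `has_zvl4096b` the
        // Vector arm below yields min(4096, 1024) / 8 = 128 eight-byte slots, the
        // "large spill slot size" exercised by simd-abi-large-spill.clif.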
match rc { RegClass::Int => 1, RegClass::Float => 1, - RegClass::Vector => unreachable!(), + RegClass::Vector => (isa_flags.min_vec_reg_size() / 8) as u32, } } @@ -592,20 +630,7 @@ impl ABIMachineSpec for Riscv64MachineDeps { } fn get_regs_clobbered_by_call(_call_conv_of_callee: isa::CallConv) -> PRegSet { - let mut v = PRegSet::empty(); - for (k, need_save) in CALLER_SAVE_X_REG.iter().enumerate() { - if !*need_save { - continue; - } - v.add(px_reg(k)); - } - for (k, need_save) in CALLER_SAVE_F_REG.iter().enumerate() { - if !*need_save { - continue; - } - v.add(pf_reg(k)); - } - v + CLOBBERS } fn get_clobbered_callee_saves( @@ -652,24 +677,12 @@ impl ABIMachineSpec for Riscv64MachineDeps { } } -const CALLER_SAVE_X_REG: [bool; 32] = [ - false, true, false, false, false, true, true, true, // 0-7 - false, false, true, true, true, true, true, true, // 8-15 - true, true, false, false, false, false, false, false, // 16-23 - false, false, false, false, true, true, true, true, // 24-31 -]; const CALLEE_SAVE_X_REG: [bool; 32] = [ false, false, true, false, false, false, false, false, // 0-7 true, true, false, false, false, false, false, false, // 8-15 false, false, true, true, true, true, true, true, // 16-23 true, true, true, true, false, false, false, false, // 24-31 ]; -const CALLER_SAVE_F_REG: [bool; 32] = [ - true, true, true, true, true, true, true, true, // 0-7 - false, true, true, true, true, true, true, true, // 8-15 - true, true, false, false, false, false, false, false, // 16-23 - false, false, false, false, true, true, true, true, // 24-31 -]; const CALLEE_SAVE_F_REG: [bool; 32] = [ false, false, false, false, false, false, false, false, // 0-7 true, false, false, false, false, false, false, false, // 8-15 @@ -680,10 +693,11 @@ const CALLEE_SAVE_F_REG: [bool; 32] = [ /// This should be the registers that must be saved by callee. #[inline] fn is_reg_saved_in_prologue(_conv: CallConv, reg: RealReg) -> bool { - if reg.class() == RegClass::Int { - CALLEE_SAVE_X_REG[reg.hw_enc() as usize] - } else { - CALLEE_SAVE_F_REG[reg.hw_enc() as usize] + match reg.class() { + RegClass::Int => CALLEE_SAVE_X_REG[reg.hw_enc() as usize], + RegClass::Float => CALLEE_SAVE_F_REG[reg.hw_enc() as usize], + // All vector registers are caller saved. 
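+        // They are all part of the caller-saved CLOBBERS set below, so none of
+        // them ever needs a prologue save here.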
+ RegClass::Vector => false, } } @@ -697,12 +711,89 @@ fn compute_clobber_size(clobbers: &[Writable]) -> u32 { RegClass::Float => { clobbered_size += 8; } - RegClass::Vector => unreachable!(), + RegClass::Vector => unimplemented!("Vector Size Clobbered"), } } align_to(clobbered_size, 16) } +const fn clobbers() -> PRegSet { + PRegSet::empty() + .with(px_reg(1)) + .with(px_reg(5)) + .with(px_reg(6)) + .with(px_reg(7)) + .with(px_reg(10)) + .with(px_reg(11)) + .with(px_reg(12)) + .with(px_reg(13)) + .with(px_reg(14)) + .with(px_reg(15)) + .with(px_reg(16)) + .with(px_reg(17)) + .with(px_reg(28)) + .with(px_reg(29)) + .with(px_reg(30)) + .with(px_reg(31)) + // F Regs + .with(pf_reg(0)) + .with(pf_reg(1)) + .with(pf_reg(2)) + .with(pf_reg(3)) + .with(pf_reg(4)) + .with(pf_reg(5)) + .with(pf_reg(6)) + .with(pf_reg(7)) + .with(pf_reg(9)) + .with(pf_reg(10)) + .with(pf_reg(11)) + .with(pf_reg(12)) + .with(pf_reg(13)) + .with(pf_reg(14)) + .with(pf_reg(15)) + .with(pf_reg(16)) + .with(pf_reg(17)) + .with(pf_reg(28)) + .with(pf_reg(29)) + .with(pf_reg(30)) + .with(pf_reg(31)) + // V Regs - All vector regs get clobbered + .with(pv_reg(0)) + .with(pv_reg(1)) + .with(pv_reg(2)) + .with(pv_reg(3)) + .with(pv_reg(4)) + .with(pv_reg(5)) + .with(pv_reg(6)) + .with(pv_reg(7)) + .with(pv_reg(8)) + .with(pv_reg(9)) + .with(pv_reg(10)) + .with(pv_reg(11)) + .with(pv_reg(12)) + .with(pv_reg(13)) + .with(pv_reg(14)) + .with(pv_reg(15)) + .with(pv_reg(16)) + .with(pv_reg(17)) + .with(pv_reg(18)) + .with(pv_reg(19)) + .with(pv_reg(20)) + .with(pv_reg(21)) + .with(pv_reg(22)) + .with(pv_reg(23)) + .with(pv_reg(24)) + .with(pv_reg(25)) + .with(pv_reg(26)) + .with(pv_reg(27)) + .with(pv_reg(28)) + .with(pv_reg(29)) + .with(pv_reg(30)) + .with(pv_reg(31)) +} + +const CLOBBERS: PRegSet = clobbers(); + impl Riscv64MachineDeps { fn gen_probestack_unroll(insts: &mut SmallInstVec, guard_size: u32, probe_count: u32) { insts.reserve(probe_count as usize); diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs index c4f28f409ca7..df7567918bf4 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/emit.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/emit.rs @@ -927,6 +927,8 @@ impl MachInstEmit for Inst { } &Inst::Mov { rd, rm, ty } => { + debug_assert_ne!(rd.to_reg().class(), RegClass::Vector); + debug_assert_ne!(rm.class(), RegClass::Vector); if rd.to_reg() != rm { let rm = allocs.next(rm); let rd = allocs.next_writable(rd); diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index a556949b242a..0e662d8054bc 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -7,7 +7,7 @@ use super::lower::isle::generated_code::{VecAMode, VecElementWidth}; use crate::binemit::{Addend, CodeOffset, Reloc}; pub use crate::ir::condcodes::IntCC; -use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, R32, R64}; +use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, I8X16, R32, R64}; pub use crate::ir::{ExternalName, MemFlags, Opcode, SourceLoc, Type, ValueLabel}; use crate::isa::{CallConv, FunctionAlignment}; @@ -667,7 +667,7 @@ impl MachInst for Inst { match rc { regalloc2::RegClass::Int => I64, regalloc2::RegClass::Float => F64, - regalloc2::RegClass::Vector => unreachable!(), + regalloc2::RegClass::Vector => I8X16, } } @@ -770,7 +770,7 @@ impl MachInst for Inst { let idx = (ty.bytes().ilog2() - 1) as usize; let ty = &SIMD_TYPES[idx][..]; - 
Ok((&[RegClass::Float], ty)) + Ok((&[RegClass::Vector], ty)) } _ => Err(CodegenError::Unsupported(format!( "Unexpected SSA-value type: {}", @@ -830,24 +830,13 @@ pub fn reg_name(reg: Reg) -> String { 28..=31 => format!("ft{}", real.hw_enc() - 20), _ => unreachable!(), }, - RegClass::Vector => unreachable!(), + RegClass::Vector => format!("v{}", real.hw_enc()), }, None => { format!("{:?}", reg) } } } -pub fn vec_reg_name(reg: Reg) -> String { - match reg.to_real_reg() { - Some(real) => { - assert_eq!(real.class(), RegClass::Float); - format!("v{}", real.hw_enc()) - } - None => { - format!("{:?}", reg) - } - } -} impl Inst { fn print_with_state( @@ -859,10 +848,6 @@ impl Inst { let reg = allocs.next(reg); reg_name(reg) }; - let format_vec_reg = |reg: Reg, allocs: &mut AllocationConsumer<'_>| -> String { - let reg = allocs.next(reg); - vec_reg_name(reg) - }; let format_vec_amode = |amode: &VecAMode, allocs: &mut AllocationConsumer<'_>| -> String { match amode { @@ -1568,9 +1553,9 @@ impl Inst { vs2, ref vstate, } => { - let vs1_s = format_vec_reg(vs1, allocs); - let vs2_s = format_vec_reg(vs2, allocs); - let vd_s = format_vec_reg(vd.to_reg(), allocs); + let vs1_s = format_reg(vs1, allocs); + let vs2_s = format_reg(vs2, allocs); + let vd_s = format_reg(vd.to_reg(), allocs); // Note: vs2 and vs1 here are opposite to the standard scalar ordering. // This is noted in Section 10.1 of the RISC-V Vector spec. @@ -1583,8 +1568,8 @@ impl Inst { vs2, ref vstate, } => { - let vs2_s = format_vec_reg(vs2, allocs); - let vd_s = format_vec_reg(vd.to_reg(), allocs); + let vs2_s = format_reg(vs2, allocs); + let vd_s = format_reg(vd.to_reg(), allocs); format!("{} {},{},{} {}", op, vd_s, vs2_s, imm, vstate) } @@ -1601,7 +1586,7 @@ impl Inst { .. } => { let base = format_vec_amode(from, allocs); - let vd = format_vec_reg(to.to_reg(), allocs); + let vd = format_reg(to.to_reg(), allocs); format!("vl{}.v {},{} {}", eew, vd, base, vstate) } Inst::VecStore { @@ -1612,7 +1597,7 @@ impl Inst { .. 
} => { let dst = format_vec_amode(to, allocs); - let vs3 = format_vec_reg(*from, allocs); + let vs3 = format_reg(*from, allocs); format!("vs{}.v {},{} {}", eew, vs3, dst, vstate) } } diff --git a/cranelift/codegen/src/isa/riscv64/inst/regs.rs b/cranelift/codegen/src/isa/riscv64/inst/regs.rs index 2bb623dfbff8..429625e22ede 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/regs.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/regs.rs @@ -137,45 +137,37 @@ pub fn writable_spilltmp_reg2() -> Writable { pub fn crate_reg_eviroment(_flags: &settings::Flags) -> MachineEnv { let preferred_regs_by_class: [Vec; 3] = { - let mut x_register: Vec = vec![]; - x_register.push(PReg::new(5, RegClass::Int)); - for i in 6..=7 { - x_register.push(PReg::new(i, RegClass::Int)); - } - for i in 10..=17 { - x_register.push(PReg::new(i, RegClass::Int)); - } - for i in 28..=29 { - x_register.push(PReg::new(i, RegClass::Int)); - } - - let mut f_register: Vec = vec![]; - for i in 0..=7 { - f_register.push(PReg::new(i, RegClass::Float)); - } - for i in 10..=17 { - f_register.push(PReg::new(i, RegClass::Float)); - } - for i in 28..=31 { - f_register.push(PReg::new(i, RegClass::Float)); - } - [x_register, f_register, vec![]] + let x_registers: Vec = (5..=7) + .chain(10..=17) + .chain(28..=29) + .map(|i| PReg::new(i, RegClass::Int)) + .collect(); + + let f_registers: Vec = (0..=7) + .chain(10..=17) + .chain(28..=31) + .map(|i| PReg::new(i, RegClass::Float)) + .collect(); + + let v_registers: Vec = (0..=31).map(|i| PReg::new(i, RegClass::Vector)).collect(); + + [x_registers, f_registers, v_registers] }; let non_preferred_regs_by_class: [Vec; 3] = { - let mut x_register: Vec = vec![]; - x_register.push(PReg::new(9, RegClass::Int)); - for i in 18..=27 { - x_register.push(PReg::new(i, RegClass::Int)); - } - let mut f_register: Vec = vec![]; - for i in 8..=9 { - f_register.push(PReg::new(i, RegClass::Float)); - } - for i in 18..=27 { - f_register.push(PReg::new(i, RegClass::Float)); - } - [x_register, f_register, vec![]] + let x_registers: Vec = (9..=9) + .chain(18..=27) + .map(|i| PReg::new(i, RegClass::Int)) + .collect(); + + let f_registers: Vec = (8..=9) + .chain(18..=27) + .map(|i| PReg::new(i, RegClass::Float)) + .collect(); + + let v_registers = vec![]; + + [x_registers, f_registers, v_registers] }; MachineEnv { @@ -192,7 +184,7 @@ pub fn x_reg(enc: usize) -> Reg { let v_reg = VReg::new(p_reg.index(), p_reg.class()); Reg::from(v_reg) } -pub fn px_reg(enc: usize) -> PReg { +pub const fn px_reg(enc: usize) -> PReg { PReg::new(enc, RegClass::Int) } @@ -222,10 +214,10 @@ pub(crate) fn x_reg_range(start: usize, end: usize) -> Vec> { #[inline] pub fn v_reg(enc: usize) -> Reg { - let p_reg = PReg::new(enc, RegClass::Float); + let p_reg = PReg::new(enc, RegClass::Vector); let v_reg = VReg::new(p_reg.index(), p_reg.class()); Reg::from(v_reg) } -pub fn vx_reg(enc: usize) -> PReg { - PReg::new(enc, RegClass::Float) +pub const fn pv_reg(enc: usize) -> PReg { + PReg::new(enc, RegClass::Vector) } diff --git a/cranelift/codegen/src/isa/riscv64/inst/unwind/systemv.rs b/cranelift/codegen/src/isa/riscv64/inst/unwind/systemv.rs index 4dc84dae80ff..601b194becb2 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/unwind/systemv.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/unwind/systemv.rs @@ -30,17 +30,14 @@ pub fn create_cie() -> CommonInformationEntry { /// Map Cranelift registers to their corresponding Gimli registers. 
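/// Integer registers map to DWARF register numbers 0..=31, floats to 32..=63,
/// and vector registers to 64..=95.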
pub fn map_reg(reg: Reg) -> Result { - match reg.class() { - RegClass::Int => { - let reg = reg.to_real_reg().unwrap().hw_enc() as u16; - Ok(Register(reg)) - } - RegClass::Float => { - let reg = reg.to_real_reg().unwrap().hw_enc() as u16; - Ok(Register(32 + reg)) - } - RegClass::Vector => unreachable!(), - } + let reg_offset = match reg.class() { + RegClass::Int => 0, + RegClass::Float => 32, + RegClass::Vector => 64, + }; + + let reg = reg.to_real_reg().unwrap().hw_enc() as u16; + Ok(Register(reg_offset + reg)) } pub(crate) struct RegisterMapper; diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs index b18a5049266c..1094dfce2113 100644 --- a/cranelift/codegen/src/isa/riscv64/lower/isle.rs +++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs @@ -52,39 +52,10 @@ impl<'a, 'b> RV64IsleContext<'a, 'b, MInst, Riscv64Backend> { Self { lower_ctx, backend, - min_vec_reg_size: Self::compute_min_vec_reg_size(backend), + min_vec_reg_size: backend.isa_flags.min_vec_reg_size(), } } - fn compute_min_vec_reg_size(backend: &Riscv64Backend) -> u64 { - let flags = &backend.isa_flags; - let entries = [ - (flags.has_zvl65536b(), 65536), - (flags.has_zvl32768b(), 32768), - (flags.has_zvl16384b(), 16384), - (flags.has_zvl8192b(), 8192), - (flags.has_zvl4096b(), 4096), - (flags.has_zvl2048b(), 2048), - (flags.has_zvl1024b(), 1024), - (flags.has_zvl512b(), 512), - (flags.has_zvl256b(), 256), - // In order to claim the Application Profile V extension, a minimum - // register size of 128 is required. i.e. V implies Zvl128b. - (flags.has_v(), 128), - (flags.has_zvl128b(), 128), - (flags.has_zvl64b(), 64), - (flags.has_zvl32b(), 32), - ]; - - for (has_flag, size) in entries.into_iter() { - if has_flag { - return size; - } - } - - return 0; - } - #[inline] fn emit_list(&mut self, list: &SmallInstVec) { for i in list { diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs index c3858ef5bc25..856e6137ba3c 100644 --- a/cranelift/codegen/src/isa/s390x/abi.rs +++ b/cranelift/codegen/src/isa/s390x/abi.rs @@ -761,7 +761,11 @@ impl ABIMachineSpec for S390xMachineDeps { unimplemented!("StructArgs not implemented for S390X yet"); } - fn get_number_of_spillslots_for_value(rc: RegClass, _vector_scale: u32) -> u32 { + fn get_number_of_spillslots_for_value( + rc: RegClass, + _vector_scale: u32, + _isa_flags: &Self::F, + ) -> u32 { // We allocate in terms of 8-byte slots. match rc { RegClass::Int => 1, diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs index 9999320cada8..81c323ec5541 100644 --- a/cranelift/codegen/src/isa/x64/abi.rs +++ b/cranelift/codegen/src/isa/x64/abi.rs @@ -672,7 +672,11 @@ impl ABIMachineSpec for X64ABIMachineSpec { insts } - fn get_number_of_spillslots_for_value(rc: RegClass, vector_scale: u32) -> u32 { + fn get_number_of_spillslots_for_value( + rc: RegClass, + vector_scale: u32, + _isa_flags: &Self::F, + ) -> u32 { // We allocate in terms of 8-byte slots. match rc { RegClass::Int => 1, diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index 530bfbb4a57f..e0939112929b 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -578,7 +578,11 @@ pub trait ABIMachineSpec { ) -> SmallVec<[Self::I; 8]>; /// Get the number of spillslots required for the given register-class. 
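    /// The `isa_flags` parameter lets a backend consult target features when
    /// sizing spillslots, e.g. the RISC-V Zvl*/V flags that determine the
    /// minimum vector register width.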
- fn get_number_of_spillslots_for_value(rc: RegClass, target_vector_bytes: u32) -> u32; + fn get_number_of_spillslots_for_value( + rc: RegClass, + target_vector_bytes: u32, + isa_flags: &Self::F, + ) -> u32; /// Get the current virtual-SP offset from an instruction-emission state. fn get_virtual_sp_offset_from_state(s: &::State) -> i64; @@ -1937,7 +1941,7 @@ impl Callee { .map(|(_k, v)| v) .unwrap() }; - M::get_number_of_spillslots_for_value(rc, max) + M::get_number_of_spillslots_for_value(rc, max, &self.isa_flags) } /// Generate a spill. diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-abi-large-spill.clif b/cranelift/filetests/filetests/isa/riscv64/simd-abi-large-spill.clif new file mode 100644 index 000000000000..ec63b72898fe --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-abi-large-spill.clif @@ -0,0 +1,71 @@ +test compile precise-output +set unwind_info=false +target riscv64gc has_v has_zvl4096b + +;; This test causes a spill for v3, and exposed an issue with regalloc2 +;; where it is not correctly computing the spill size due to the +;; large spill slot size. + +function u1:0() -> i16x8 system_v { + fn0 = colocated u2:0() -> i32 system_v + +block0: + v3 = vconst.i16x8 0x000000000000000000000000020a0402 + v10 = call fn0() + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; sd s7,-8(sp) +; add sp,-1040 +; block0: +; mv s7,a0 +; vle16.v v3,[const(0)] #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v3,0(nominal_sp) #avl=16, #vtype=(e8, m1, ta, ma) +; call userextname0 +; mv a0,s7 +; vle8.v v3,0(nominal_sp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v3,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; add sp,+1040 +; ld s7,-8(sp) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; sd s7, -8(sp) +; addi sp, sp, -0x410 +; block1: ; offset 0x18 +; ori s7, a0, 0 +; .byte 0x57, 0x70, 0x84, 0xcc +; auipc t6, 0 +; addi t6, t6, 0x40 +; .byte 0x87, 0xd1, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x01, 0x01, 0x02 +; auipc ra, 0 ; reloc_external RiscvCall u2:0 0 +; jalr ra +; ori a0, s7, 0 +; .byte 0x87, 0x01, 0x01, 0x02 +; .byte 0xa7, 0x01, 0x05, 0x02 +; addi sp, sp, 0x410 +; ld s7, -8(sp) +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret +; .byte 0x02, 0x04, 0x0a, 0x02 +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 +; .byte 0x00, 0x00, 0x00, 0x00 + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-abi.clif b/cranelift/filetests/filetests/isa/riscv64/simd-abi.clif index daf0eda889ff..4f2f0a8dcfe1 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-abi.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-abi.clif @@ -37,93 +37,81 @@ block0( ; sd ra,8(sp) ; sd fp,0(sp) ; mv fp,sp -; fsd fs0,-8(sp) -; fsd fs2,-16(sp) -; fsd fs3,-24(sp) -; fsd fs4,-32(sp) -; fsd fs5,-40(sp) -; fsd fs6,-48(sp) -; fsd fs7,-56(sp) -; fsd fs8,-64(sp) -; fsd fs9,-72(sp) -; fsd fs10,-80(sp) -; fsd fs11,-88(sp) -; add sp,-112 +; add sp,-256 ; block0: -; fsd fa0,0(nominal_sp) -; fsd fa1,8(nominal_sp) -; vle8.v v28,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v29,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v30,48(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v31,64(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v0,80(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v1,96(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v2,112(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v3,128(fp) 
#avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v5,144(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v7,160(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v4,176(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v6,192(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v3,0(nominal_sp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v5,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,128(nominal_sp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v7,64(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v9,80(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v11,96(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v13,112(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v15,128(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v17,144(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v19,160(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v21,176(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v23,192(fp) #avl=16, #vtype=(e8, m1, ta, ma) ; vle8.v v25,208(fp) #avl=16, #vtype=(e8, m1, ta, ma) ; vle8.v v27,224(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v9,240(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v19,256(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v21,272(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v23,288(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v26,304(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v29,240(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v31,256(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v2,272(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v4,288(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v6,304(fp) #avl=16, #vtype=(e8, m1, ta, ma) ; vle8.v v8,320(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v18,336(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v20,352(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v22,368(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v24,384(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v11,400(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vle8.v v10,416(fp) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v24,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v22,16(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v20,32(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v18,48(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v8,64(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v10,336(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v12,352(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v14,368(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v16,384(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v18,400(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v20,416(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v22,432(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v24,448(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v26,464(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v28,480(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v30,496(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v0,512(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,528(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,544(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v3,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v1,16(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v0,32(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v30,48(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v28,64(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; vse8.v v26,80(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v23,96(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v21,112(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v19,128(a0) #avl=16, 
#vtype=(e8, m1, ta, ma) -; vse8.v v9,144(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v27,160(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v25,176(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v6,192(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v4,208(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v7,224(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v5,240(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v3,256(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v24,96(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v22,112(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v20,128(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v18,144(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v16,160(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v14,176(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v12,192(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v10,208(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v8,224(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,240(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,256(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; vse8.v v2,272(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v1,288(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v0,304(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v31,320(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v30,336(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v29,352(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v28,368(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v17,384(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v16,400(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v31,288(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v29,304(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v27,320(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v25,336(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v23,352(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v21,368(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v19,384(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v17,400(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; vse8.v v15,416(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v14,432(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v13,448(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; vse8.v v12,464(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; fld fa4,8(nominal_sp) -; vse8.v v14,480(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; fld fa7,0(nominal_sp) -; vse8.v v17,496(a0) #avl=16, #vtype=(e8, m1, ta, ma) -; add sp,+112 -; fld fs0,-8(sp) -; fld fs2,-16(sp) -; fld fs3,-24(sp) -; fld fs4,-32(sp) -; fld fs5,-40(sp) -; fld fs6,-48(sp) -; fld fs7,-56(sp) -; fld fs8,-64(sp) -; fld fs9,-72(sp) -; fld fs10,-80(sp) -; fld fs11,-88(sp) +; vse8.v v13,432(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v11,448(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v9,464(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v7,480(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,496(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v5,128(nominal_sp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,512(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,0(nominal_sp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v3,528(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; add sp,+256 ; ld ra,8(sp) ; ld fp,0(sp) ; add sp,+16 @@ -135,151 +123,151 @@ block0( ; sd ra, 8(sp) ; sd s0, 0(sp) ; ori s0, sp, 0 -; fsd fs0, -8(sp) -; fsd fs2, -0x10(sp) -; fsd fs3, -0x18(sp) -; fsd fs4, -0x20(sp) -; fsd fs5, -0x28(sp) -; fsd fs6, -0x30(sp) -; fsd fs7, -0x38(sp) -; fsd fs8, -0x40(sp) -; fsd fs9, -0x48(sp) -; fsd fs10, -0x50(sp) -; fsd fs11, -0x58(sp) -; addi sp, sp, -0x70 -; block1: ; offset 0x40 -; fsd fa0, 0(sp) -; fsd fa1, 8(sp) 
+; addi sp, sp, -0x100 +; block1: ; offset 0x14 ; .byte 0x57, 0x70, 0x08, 0xcc ; addi t6, s0, 0x10 -; .byte 0x07, 0x8e, 0x0f, 0x02 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0xa7, 0x01, 0x01, 0x02 ; addi t6, s0, 0x20 -; .byte 0x87, 0x8e, 0x0f, 0x02 +; .byte 0x87, 0x82, 0x0f, 0x02 +; addi t6, sp, 0x80 +; .byte 0xa7, 0x82, 0x0f, 0x02 ; addi t6, s0, 0x30 -; .byte 0x07, 0x8f, 0x0f, 0x02 +; .byte 0x87, 0x82, 0x0f, 0x02 ; addi t6, s0, 0x40 -; .byte 0x87, 0x8f, 0x0f, 0x02 +; .byte 0x87, 0x83, 0x0f, 0x02 ; addi t6, s0, 0x50 -; .byte 0x07, 0x80, 0x0f, 0x02 +; .byte 0x87, 0x84, 0x0f, 0x02 ; addi t6, s0, 0x60 -; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x87, 0x85, 0x0f, 0x02 ; addi t6, s0, 0x70 -; .byte 0x07, 0x81, 0x0f, 0x02 +; .byte 0x87, 0x86, 0x0f, 0x02 ; addi t6, s0, 0x80 -; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x87, 0x87, 0x0f, 0x02 ; addi t6, s0, 0x90 -; .byte 0x87, 0x82, 0x0f, 0x02 +; .byte 0x87, 0x88, 0x0f, 0x02 ; addi t6, s0, 0xa0 -; .byte 0x87, 0x83, 0x0f, 0x02 +; .byte 0x87, 0x89, 0x0f, 0x02 ; addi t6, s0, 0xb0 -; .byte 0x07, 0x82, 0x0f, 0x02 +; .byte 0x87, 0x8a, 0x0f, 0x02 ; addi t6, s0, 0xc0 -; .byte 0x07, 0x83, 0x0f, 0x02 +; .byte 0x87, 0x8b, 0x0f, 0x02 ; addi t6, s0, 0xd0 ; .byte 0x87, 0x8c, 0x0f, 0x02 ; addi t6, s0, 0xe0 ; .byte 0x87, 0x8d, 0x0f, 0x02 ; addi t6, s0, 0xf0 -; .byte 0x87, 0x84, 0x0f, 0x02 +; .byte 0x87, 0x8e, 0x0f, 0x02 ; addi t6, s0, 0x100 -; .byte 0x87, 0x89, 0x0f, 0x02 +; .byte 0x87, 0x8f, 0x0f, 0x02 ; addi t6, s0, 0x110 -; .byte 0x87, 0x8a, 0x0f, 0x02 +; .byte 0x07, 0x81, 0x0f, 0x02 ; addi t6, s0, 0x120 -; .byte 0x87, 0x8b, 0x0f, 0x02 +; .byte 0x07, 0x82, 0x0f, 0x02 ; addi t6, s0, 0x130 -; .byte 0x07, 0x8d, 0x0f, 0x02 +; .byte 0x07, 0x83, 0x0f, 0x02 ; addi t6, s0, 0x140 ; .byte 0x07, 0x84, 0x0f, 0x02 ; addi t6, s0, 0x150 -; .byte 0x07, 0x89, 0x0f, 0x02 +; .byte 0x07, 0x85, 0x0f, 0x02 ; addi t6, s0, 0x160 -; .byte 0x07, 0x8a, 0x0f, 0x02 +; .byte 0x07, 0x86, 0x0f, 0x02 ; addi t6, s0, 0x170 -; .byte 0x07, 0x8b, 0x0f, 0x02 +; .byte 0x07, 0x87, 0x0f, 0x02 ; addi t6, s0, 0x180 -; .byte 0x07, 0x8c, 0x0f, 0x02 +; .byte 0x07, 0x88, 0x0f, 0x02 ; addi t6, s0, 0x190 -; .byte 0x87, 0x85, 0x0f, 0x02 +; .byte 0x07, 0x89, 0x0f, 0x02 ; addi t6, s0, 0x1a0 -; .byte 0x07, 0x85, 0x0f, 0x02 -; .byte 0x27, 0x0c, 0x05, 0x02 +; .byte 0x07, 0x8a, 0x0f, 0x02 +; addi t6, s0, 0x1b0 +; .byte 0x07, 0x8b, 0x0f, 0x02 +; addi t6, s0, 0x1c0 +; .byte 0x07, 0x8c, 0x0f, 0x02 +; addi t6, s0, 0x1d0 +; .byte 0x07, 0x8d, 0x0f, 0x02 +; addi t6, s0, 0x1e0 +; .byte 0x07, 0x8e, 0x0f, 0x02 +; addi t6, s0, 0x1f0 +; .byte 0x07, 0x8f, 0x0f, 0x02 +; addi t6, s0, 0x200 +; .byte 0x07, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x210 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x220 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0xa7, 0x01, 0x05, 0x02 ; addi t6, a0, 0x10 -; .byte 0x27, 0x8b, 0x0f, 0x02 +; .byte 0xa7, 0x80, 0x0f, 0x02 ; addi t6, a0, 0x20 -; .byte 0x27, 0x8a, 0x0f, 0x02 +; .byte 0x27, 0x80, 0x0f, 0x02 ; addi t6, a0, 0x30 -; .byte 0x27, 0x89, 0x0f, 0x02 +; .byte 0x27, 0x8f, 0x0f, 0x02 ; addi t6, a0, 0x40 -; .byte 0x27, 0x84, 0x0f, 0x02 +; .byte 0x27, 0x8e, 0x0f, 0x02 ; addi t6, a0, 0x50 ; .byte 0x27, 0x8d, 0x0f, 0x02 ; addi t6, a0, 0x60 -; .byte 0xa7, 0x8b, 0x0f, 0x02 +; .byte 0x27, 0x8c, 0x0f, 0x02 ; addi t6, a0, 0x70 -; .byte 0xa7, 0x8a, 0x0f, 0x02 +; .byte 0x27, 0x8b, 0x0f, 0x02 ; addi t6, a0, 0x80 -; .byte 0xa7, 0x89, 0x0f, 0x02 +; .byte 0x27, 0x8a, 0x0f, 0x02 ; addi t6, a0, 0x90 -; .byte 0xa7, 0x84, 0x0f, 0x02 +; .byte 0x27, 0x89, 0x0f, 0x02 ; addi t6, a0, 0xa0 -; .byte 0xa7, 0x8d, 0x0f, 0x02 +; .byte 0x27, 0x88, 
0x0f, 0x02 ; addi t6, a0, 0xb0 -; .byte 0xa7, 0x8c, 0x0f, 0x02 +; .byte 0x27, 0x87, 0x0f, 0x02 ; addi t6, a0, 0xc0 -; .byte 0x27, 0x83, 0x0f, 0x02 +; .byte 0x27, 0x86, 0x0f, 0x02 ; addi t6, a0, 0xd0 -; .byte 0x27, 0x82, 0x0f, 0x02 +; .byte 0x27, 0x85, 0x0f, 0x02 ; addi t6, a0, 0xe0 -; .byte 0xa7, 0x83, 0x0f, 0x02 +; .byte 0x27, 0x84, 0x0f, 0x02 ; addi t6, a0, 0xf0 -; .byte 0xa7, 0x82, 0x0f, 0x02 +; .byte 0x27, 0x83, 0x0f, 0x02 ; addi t6, a0, 0x100 -; .byte 0xa7, 0x81, 0x0f, 0x02 +; .byte 0x27, 0x82, 0x0f, 0x02 ; addi t6, a0, 0x110 ; .byte 0x27, 0x81, 0x0f, 0x02 ; addi t6, a0, 0x120 -; .byte 0xa7, 0x80, 0x0f, 0x02 +; .byte 0xa7, 0x8f, 0x0f, 0x02 ; addi t6, a0, 0x130 -; .byte 0x27, 0x80, 0x0f, 0x02 +; .byte 0xa7, 0x8e, 0x0f, 0x02 ; addi t6, a0, 0x140 -; .byte 0xa7, 0x8f, 0x0f, 0x02 +; .byte 0xa7, 0x8d, 0x0f, 0x02 ; addi t6, a0, 0x150 -; .byte 0x27, 0x8f, 0x0f, 0x02 +; .byte 0xa7, 0x8c, 0x0f, 0x02 ; addi t6, a0, 0x160 -; .byte 0xa7, 0x8e, 0x0f, 0x02 +; .byte 0xa7, 0x8b, 0x0f, 0x02 ; addi t6, a0, 0x170 -; .byte 0x27, 0x8e, 0x0f, 0x02 +; .byte 0xa7, 0x8a, 0x0f, 0x02 ; addi t6, a0, 0x180 -; .byte 0xa7, 0x88, 0x0f, 0x02 +; .byte 0xa7, 0x89, 0x0f, 0x02 ; addi t6, a0, 0x190 -; .byte 0x27, 0x88, 0x0f, 0x02 +; .byte 0xa7, 0x88, 0x0f, 0x02 ; addi t6, a0, 0x1a0 ; .byte 0xa7, 0x87, 0x0f, 0x02 ; addi t6, a0, 0x1b0 -; .byte 0x27, 0x87, 0x0f, 0x02 -; addi t6, a0, 0x1c0 ; .byte 0xa7, 0x86, 0x0f, 0x02 +; addi t6, a0, 0x1c0 +; .byte 0xa7, 0x85, 0x0f, 0x02 ; addi t6, a0, 0x1d0 -; .byte 0x27, 0x86, 0x0f, 0x02 -; fld fa4, 8(sp) +; .byte 0xa7, 0x84, 0x0f, 0x02 ; addi t6, a0, 0x1e0 -; .byte 0x27, 0x87, 0x0f, 0x02 -; fld fa7, 0(sp) +; .byte 0xa7, 0x83, 0x0f, 0x02 ; addi t6, a0, 0x1f0 -; .byte 0xa7, 0x88, 0x0f, 0x02 -; addi sp, sp, 0x70 -; fld fs0, -8(sp) -; fld fs2, -0x10(sp) -; fld fs3, -0x18(sp) -; fld fs4, -0x20(sp) -; fld fs5, -0x28(sp) -; fld fs6, -0x30(sp) -; fld fs7, -0x38(sp) -; fld fs8, -0x40(sp) -; fld fs9, -0x48(sp) -; fld fs10, -0x50(sp) -; fld fs11, -0x58(sp) +; .byte 0xa7, 0x82, 0x0f, 0x02 +; addi t6, sp, 0x80 +; .byte 0x87, 0x82, 0x0f, 0x02 +; addi t6, a0, 0x200 +; .byte 0xa7, 0x82, 0x0f, 0x02 +; .byte 0x87, 0x01, 0x01, 0x02 +; addi t6, a0, 0x210 +; .byte 0xa7, 0x81, 0x0f, 0x02 +; addi sp, sp, 0x100 ; ld ra, 8(sp) ; ld s0, 0(sp) ; addi sp, sp, 0x10 diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif index 6726b53d831f..c4c6a3530482 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif @@ -10,14 +10,37 @@ block0(v0: i8x16, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vand.vv v10,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x26 +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x83, 0x11, 0x26 +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %band_i16x8(i16x8, i16x8) -> i16x8 { @@ -27,14 +50,39 @@ block0(v0: i16x8, v1: i16x8): } ; VCode: 
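;; Vector arguments are now passed on the stack, so both operands are loaded
;; with vle8.v before the vand.vv and the result is stored through the
;; pointer in a0.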
+; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vand.vv v10,v10,v11 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x26 +; .byte 0x57, 0x83, 0x11, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %band_i32x4(i32x4, i32x4) -> i32x4 { @@ -44,14 +92,39 @@ block0(v0: i32x4, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vand.vv v10,v10,v11 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x26 +; .byte 0x57, 0x83, 0x11, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %band_i64x2(i64x2, i64x2) -> i64x2 { @@ -61,13 +134,38 @@ block0(v0: i64x2, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vand.vv v10,v10,v11 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x26 +; .byte 0x57, 0x83, 0x11, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif index 6f6a191b62ce..7f8beb629f50 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif @@ -10,14 +10,37 @@ block0(v0: i8x16, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vor.vv v10,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld 
fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x2a +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x83, 0x11, 0x2a +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %bor_i16x8(i16x8, i16x8) -> i16x8 { @@ -27,14 +50,39 @@ block0(v0: i16x8, v1: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vor.vv v10,v10,v11 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x2a +; .byte 0x57, 0x83, 0x11, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %bor_i32x4(i32x4, i32x4) -> i32x4 { @@ -44,14 +92,39 @@ block0(v0: i32x4, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vor.vv v10,v10,v11 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x2a +; .byte 0x57, 0x83, 0x11, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %bor_i64x2(i64x2, i64x2) -> i64x2 { @@ -61,13 +134,38 @@ block0(v0: i64x2, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vor.vv v10,v10,v11 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x2a +; .byte 0x57, 0x83, 0x11, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif index 
de505094fd8f..0c8cc8f1ad4c 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif @@ -10,14 +10,37 @@ block0(v0: i8x16, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vxor.vv v10,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x2e +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x83, 0x11, 0x2e +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %bxor_i16x8(i16x8, i16x8) -> i16x8 { @@ -27,14 +50,39 @@ block0(v0: i16x8, v1: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vxor.vv v10,v10,v11 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x2e +; .byte 0x57, 0x83, 0x11, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %bxor_i32x4(i32x4, i32x4) -> i32x4 { @@ -44,14 +92,39 @@ block0(v0: i32x4, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vxor.vv v10,v10,v11 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x2e +; .byte 0x57, 0x83, 0x11, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %bxor_i64x2(i64x2, i64x2) -> i64x2 { @@ -61,13 +134,38 @@ block0(v0: i64x2, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vxor.vv v10,v10,v11 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 
+; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x2e +; .byte 0x57, 0x83, 0x11, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iadd-big.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iadd-big.clif index b026de76984a..3a4f4c3abbfd 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-iadd-big.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-iadd-big.clif @@ -11,14 +11,39 @@ block0(v0:i64x4, v1:i64x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=4, #vtype=(e64, m1, ta, ma) +; vle16.v v1,16(fp) #avl=16, #vtype=(e16, m1, ta, ma) +; vle16.v v3,48(fp) #avl=16, #vtype=(e16, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=4, #vtype=(e64, m1, ta, ma) +; vse16.v v6,0(a0) #avl=16, #vtype=(e16, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x88, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0xd0, 0x0f, 0x02 +; addi t6, s0, 0x30 +; .byte 0x87, 0xd1, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x82, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x57, 0x70, 0x88, 0xcc +; .byte 0x27, 0x53, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_i64x8(i64x8, i64x8) -> i64x8 { @@ -28,13 +53,38 @@ block0(v0:i64x8, v1:i64x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=8, #vtype=(e64, m1, ta, ma) +; vle32.v v1,16(fp) #avl=16, #vtype=(e32, m1, ta, ma) +; vle32.v v3,80(fp) #avl=16, #vtype=(e32, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=8, #vtype=(e64, m1, ta, ma) +; vse32.v v6,0(a0) #avl=16, #vtype=(e32, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcd +; addi t6, s0, 0x10 +; .byte 0x87, 0xe0, 0x0f, 0x02 +; addi t6, s0, 0x50 +; .byte 0x87, 0xe1, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcd +; .byte 0x27, 0x63, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iadd-small.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iadd-small.clif index edab35d60a1c..00077f3e4a69 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-iadd-small.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-iadd-small.clif @@ -10,14 +10,37 @@ block0(v0:i8x8, v1:i8x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=8, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=8, #vtype=(e8, m1, ta, ma) +; vle8.v v3,24(fp) #avl=8, #vtype=(e8, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=8, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=8, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 
0x10 ; .byte 0x57, 0x70, 0x04, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x02 +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x18 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_i16x4(i16x4, i16x4) -> i16x4 { @@ -27,14 +50,39 @@ block0(v0:i16x4, v1:i16x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=4, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=8, #vtype=(e8, m1, ta, ma) +; vle8.v v3,24(fp) #avl=8, #vtype=(e8, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=4, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=8, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x04, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x18 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x82, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x57, 0x70, 0x04, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_i32x2(i32x2, i32x2) -> i32x2 { @@ -44,13 +92,38 @@ block0(v0:i32x2, v1:i32x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=2, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=8, #vtype=(e8, m1, ta, ma) +; vle8.v v3,24(fp) #avl=8, #vtype=(e8, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=2, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=8, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x04, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x18 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x01, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x57, 0x70, 0x04, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif index 2ebc6933f6c8..f37d39bc19bf 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif @@ -10,14 +10,37 @@ block0(v0: i8x16, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x02 +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_i16x8(i16x8, i16x8) -> i16x8 { @@ -27,14 +50,39 @@ block0(v0: i16x8, v1: i16x8): } ; VCode: +; add sp,-16 +; sd 
ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_i32x4(i32x4, i32x4) -> i32x4 { @@ -44,14 +92,39 @@ block0(v0: i32x4, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_i64x2(i64x2, i64x2) -> i64x2 { @@ -61,14 +134,39 @@ block0(v0: i64x2, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v11 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x02 +; .byte 0x57, 0x83, 0x11, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_const_i8x16(i8x16) -> i8x16 { @@ -80,14 +178,34 @@ block0(v0: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vi v10,v10,5 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vi v4,v1,5 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0xb5, 0xa2, 0x02 +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0xb2, 0x12, 0x02 +; .byte 0x27, 0x02, 
0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_const_i16x8(i16x8) -> i16x8 { @@ -99,14 +217,36 @@ block0(v0: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vi v10,v10,-16 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vi v4,v1,-16 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0x35, 0xa8, 0x02 +; .byte 0x57, 0x32, 0x18, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_const_i32x4(i32x4) -> i32x4 { @@ -118,14 +258,36 @@ block0(v0: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vi v10,v10,15 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vi v4,v1,15 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0xb5, 0xa7, 0x02 +; .byte 0x57, 0xb2, 0x17, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %iadd_const_i64x2(i64x2) -> i64x2 { @@ -137,13 +299,35 @@ block0(v0: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vi v10,v10,-5 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vi v4,v1,-5 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0xb5, 0xad, 0x02 +; .byte 0x57, 0xb2, 0x1d, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif b/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif index 04b85462cf38..4ffd982866b6 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-imul.clif @@ -10,14 +10,37 @@ block0(v0: i8x16, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmul.vv v10,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 
0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0xa5, 0xa5, 0x96 +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0xa3, 0x11, 0x96 +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %imul_i16x8(i16x8, i16x8) -> i16x8 { @@ -27,14 +50,39 @@ block0(v0: i16x8, v1: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmul.vv v10,v10,v11 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0xa5, 0xa5, 0x96 +; .byte 0x57, 0xa3, 0x11, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %imul_i32x4(i32x4, i32x4) -> i32x4 { @@ -44,14 +92,39 @@ block0(v0: i32x4, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmul.vv v10,v10,v11 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0xa5, 0xa5, 0x96 +; .byte 0x57, 0xa3, 0x11, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %imul_i64x2(i64x2, i64x2) -> i64x2 { @@ -61,13 +134,38 @@ block0(v0: i64x2, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmul.vv v10,v10,v11 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmul.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0xa5, 0xa5, 0x96 +; .byte 0x57, 0xa3, 0x11, 0x96 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif b/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif index 536045967709..550cddb7dd09 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif @@ -10,14 +10,37 @@ 
block0(v0: i8x16, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vsub.vv v10,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsub.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x0a +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x83, 0x11, 0x0a +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %isub_i16x8(i16x8, i16x8) -> i16x8 { @@ -27,14 +50,39 @@ block0(v0: i16x8, v1: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vsub.vv v10,v10,v11 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsub.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0x85, 0xa5, 0x0a +; .byte 0x57, 0x83, 0x11, 0x0a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %isub_i32x4(i32x4, i32x4) -> i32x4 { @@ -44,14 +92,39 @@ block0(v0: i32x4, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vsub.vv v10,v10,v11 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsub.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x0a +; .byte 0x57, 0x83, 0x11, 0x0a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %isub_i64x2(i64x2, i64x2) -> i64x2 { @@ -61,13 +134,38 @@ block0(v0: i64x2, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vsub.vv v10,v10,v11 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsub.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 
+; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0x85, 0xa5, 0x0a +; .byte 0x57, 0x83, 0x11, 0x0a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-loads.clif b/cranelift/filetests/filetests/isa/riscv64/simd-loads.clif index 54988d46260c..04bf79d36f29 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-loads.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-loads.clif @@ -11,13 +11,15 @@ block0(v0: i64): ; VCode: ; block0: -; vle8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v3,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: ; block0: ; offset 0x0 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x07, 0x05, 0x05, 0x02 +; .byte 0x87, 0x01, 0x05, 0x02 +; .byte 0xa7, 0x81, 0x05, 0x02 ; ret function %load_i16x8(i64) -> i16x8 { @@ -28,13 +30,16 @@ block0(v0: i64): ; VCode: ; block0: -; vle16.v v10,0(a0) #avl=8, #vtype=(e16, m1, ta, ma) +; vle16.v v3,0(a0) #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v3,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: ; block0: ; offset 0x0 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x07, 0x55, 0x05, 0x02 +; .byte 0x87, 0x51, 0x05, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x81, 0x05, 0x02 ; ret function %load_i32x4(i64) -> i32x4 { @@ -45,13 +50,16 @@ block0(v0: i64): ; VCode: ; block0: -; vle32.v v10,0(a0) #avl=4, #vtype=(e32, m1, ta, ma) +; vle32.v v3,0(a0) #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v3,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: ; block0: ; offset 0x0 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x07, 0x65, 0x05, 0x02 +; .byte 0x87, 0x61, 0x05, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x81, 0x05, 0x02 ; ret function %load_i64x2(i64) -> i64x2 { @@ -62,12 +70,15 @@ block0(v0: i64): ; VCode: ; block0: -; vle64.v v10,0(a0) #avl=2, #vtype=(e64, m1, ta, ma) +; vle64.v v3,0(a0) #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v3,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: ; block0: ; offset 0x0 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x07, 0x75, 0x05, 0x02 +; .byte 0x87, 0x71, 0x05, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x81, 0x05, 0x02 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif b/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif index 87e9b3716cbc..2ce6f351b7c6 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-smulhi.clif @@ -10,14 +10,37 @@ block0(v0: i8x16, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmulh.vv v10,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0xa5, 0xa5, 0x9e +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0xa3, 0x11, 0x9e +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %smulhi_i16x8(i16x8, i16x8) -> 
i16x8 { @@ -27,14 +50,39 @@ block0(v0: i16x8, v1: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmulh.vv v10,v10,v11 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0xa5, 0xa5, 0x9e +; .byte 0x57, 0xa3, 0x11, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %smulhi_i32x4(i32x4, i32x4) -> i32x4 { @@ -44,14 +92,39 @@ block0(v0: i32x4, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmulh.vv v10,v10,v11 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0xa5, 0xa5, 0x9e +; .byte 0x57, 0xa3, 0x11, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %smulhi_i64x2(i64x2, i64x2) -> i64x2 { @@ -61,13 +134,38 @@ block0(v0: i64x2, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmulh.vv v10,v10,v11 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulh.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0xa5, 0xa5, 0x9e +; .byte 0x57, 0xa3, 0x11, 0x9e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-stores.clif b/cranelift/filetests/filetests/isa/riscv64/simd-stores.clif index 4c93773d290e..630498c0ff82 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-stores.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-stores.clif @@ -10,14 +10,32 @@ block0(v0: i64, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vse8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; 
ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x27, 0x05, 0x05, 0x02 +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 +; .byte 0x27, 0x01, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %store_i16x8(i64, i16x8) { @@ -27,14 +45,33 @@ block0(v0: i64, v1: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vse16.v v10,0(a0) #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse16.v v2,0(a0) #avl=8, #vtype=(e16, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x27, 0x55, 0x05, 0x02 +; .byte 0x27, 0x51, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %store_i32x4(i64, i32x4) { @@ -44,14 +81,33 @@ block0(v0: i64, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vse32.v v10,0(a0) #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse32.v v2,0(a0) #avl=4, #vtype=(e32, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x27, 0x65, 0x05, 0x02 +; .byte 0x27, 0x61, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %store_i64x2(i64, i64x2) { @@ -61,13 +117,32 @@ block0(v0: i64, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vse64.v v10,0(a0) #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v2,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vse64.v v2,0(a0) #avl=2, #vtype=(e64, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x07, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x27, 0x75, 0x05, 0x02 +; .byte 0x27, 0x71, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif b/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif index b013f71c7921..be224b3f2eb3 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-umulhi.clif @@ -10,14 +10,37 @@ block0(v0: i8x16, v1: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmulhu.vv v10,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0xa5, 0xa5, 0x92 +; addi t6, 
s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0xa3, 0x11, 0x92 +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %umulhi_i16x8(i16x8, i16x8) -> i16x8 { @@ -27,14 +50,39 @@ block0(v0: i16x8, v1: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmulhu.vv v10,v10,v11 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vv v6,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0x57, 0xa5, 0xa5, 0x92 +; .byte 0x57, 0xa3, 0x11, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %umulhi_i32x4(i32x4, i32x4) -> i32x4 { @@ -44,14 +92,39 @@ block0(v0: i32x4, v1: i32x4): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmulhu.vv v10,v10,v11 #avl=4, #vtype=(e32, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vv v6,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x02, 0xcd -; .byte 0x57, 0xa5, 0xa5, 0x92 +; .byte 0x57, 0xa3, 0x11, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret function %umulhi_i64x2(i64x2, i64x2) -> i64x2 { @@ -61,13 +134,38 @@ block0(v0: i64x2, v1: i64x2): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vmulhu.vv v10,v10,v11 #avl=2, #vtype=(e64, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vmulhu.vv v6,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 ; .byte 0x57, 0x70, 0x81, 0xcd -; .byte 0x57, 0xa5, 0xa5, 0x92 +; .byte 0x57, 0xa3, 0x11, 0x92 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x03, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-vconst-64bit.clif b/cranelift/filetests/filetests/isa/riscv64/simd-vconst-64bit.clif index 7bf8b5b70c6b..3c02749e7d26 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-vconst-64bit.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-vconst-64bit.clif @@ -10,7 +10,8 @@ block0: ; VCode: ; 
block0: -; vle8.v v10,[const(0)] #avl=8, #vtype=(e8, m1, ta, ma) +; vle8.v v2,[const(0)] #avl=8, #vtype=(e8, m1, ta, ma) +; vse8.v v2,0(a0) #avl=8, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -18,11 +19,11 @@ block0: ; .byte 0x57, 0x70, 0x04, 0xcc ; auipc t6, 0 ; addi t6, t6, 0x14 -; .byte 0x07, 0x85, 0x0f, 0x02 +; .byte 0x07, 0x81, 0x0f, 0x02 +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 function %vconst_ones() -> i8x8 { block0: @@ -32,7 +33,8 @@ block0: ; VCode: ; block0: -; vle8.v v10,[const(0)] #avl=8, #vtype=(e8, m1, ta, ma) +; vle8.v v2,[const(0)] #avl=8, #vtype=(e8, m1, ta, ma) +; vse8.v v2,0(a0) #avl=8, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -40,9 +42,9 @@ block0: ; .byte 0x57, 0x70, 0x04, 0xcc ; auipc t6, 0 ; addi t6, t6, 0x14 -; .byte 0x07, 0x85, 0x0f, 0x02 +; .byte 0x07, 0x81, 0x0f, 0x02 +; .byte 0x27, 0x01, 0x05, 0x02 ; ret -; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0xff, 0xff, 0xff, 0xff ; .byte 0xff, 0xff, 0xff, 0xff @@ -54,7 +56,8 @@ block0: ; VCode: ; block0: -; vle8.v v10,[const(0)] #avl=8, #vtype=(e8, m1, ta, ma) +; vle8.v v2,[const(0)] #avl=8, #vtype=(e8, m1, ta, ma) +; vse8.v v2,0(a0) #avl=8, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -62,9 +65,9 @@ block0: ; .byte 0x57, 0x70, 0x04, 0xcc ; auipc t6, 0 ; addi t6, t6, 0x14 -; .byte 0x07, 0x85, 0x0f, 0x02 +; .byte 0x07, 0x81, 0x0f, 0x02 +; .byte 0x27, 0x01, 0x05, 0x02 ; ret -; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x1f, 0x3f, 0x5f ; .byte 0x7f, 0x9f, 0xbf, 0xff @@ -76,15 +79,18 @@ block0: ; VCode: ; block0: -; vle16.v v10,[const(0)] #avl=4, #vtype=(e16, m1, ta, ma) +; vle16.v v2,[const(0)] #avl=4, #vtype=(e16, m1, ta, ma) +; vse8.v v2,0(a0) #avl=8, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: ; block0: ; offset 0x0 ; .byte 0x57, 0x70, 0x82, 0xcc ; auipc t6, 0 -; addi t6, t6, 0x14 -; .byte 0x07, 0xd5, 0x0f, 0x02 +; addi t6, t6, 0x1c +; .byte 0x07, 0xd1, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x04, 0xcc +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0xff, 0x00 @@ -98,15 +104,18 @@ block0: ; VCode: ; block0: -; vle32.v v10,[const(0)] #avl=2, #vtype=(e32, m1, ta, ma) +; vle32.v v2,[const(0)] #avl=2, #vtype=(e32, m1, ta, ma) +; vse8.v v2,0(a0) #avl=8, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: ; block0: ; offset 0x0 ; .byte 0x57, 0x70, 0x01, 0xcd ; auipc t6, 0 -; addi t6, t6, 0x14 -; .byte 0x07, 0xe5, 0x0f, 0x02 +; addi t6, t6, 0x1c +; .byte 0x07, 0xe1, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x04, 0xcc +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-vconst.clif b/cranelift/filetests/filetests/isa/riscv64/simd-vconst.clif index 0919aefed737..33d2441e561f 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-vconst.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-vconst.clif @@ -10,7 +10,8 @@ block0: ; VCode: ; block0: -; vle8.v v10,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v2,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -18,7 +19,8 @@ block0: ; .byte 0x57, 0x70, 0x08, 0xcc ; auipc t6, 0 ; addi t6, t6, 0x1c -; .byte 0x07, 0x85, 0x0f, 0x02 +; .byte 0x07, 0x81, 0x0f, 0x02 +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 @@ -26,7 +28,6 @@ block0: ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 
0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 function %vconst_ones_i8x16() -> i8x16 { block0: @@ -36,7 +37,8 @@ block0: ; VCode: ; block0: -; vle8.v v10,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v2,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -44,11 +46,11 @@ block0: ; .byte 0x57, 0x70, 0x08, 0xcc ; auipc t6, 0 ; addi t6, t6, 0x1c -; .byte 0x07, 0x85, 0x0f, 0x02 +; .byte 0x07, 0x81, 0x0f, 0x02 +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0xff, 0xff, 0xff, 0xff ; .byte 0xff, 0xff, 0xff, 0xff ; .byte 0xff, 0xff, 0xff, 0xff @@ -62,7 +64,8 @@ block0: ; VCode: ; block0: -; vle8.v v10,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v2,[const(0)] #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -70,11 +73,11 @@ block0: ; .byte 0x57, 0x70, 0x08, 0xcc ; auipc t6, 0 ; addi t6, t6, 0x1c -; .byte 0x07, 0x85, 0x0f, 0x02 +; .byte 0x07, 0x81, 0x0f, 0x02 +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x1f, 0x3f, 0x5f ; .byte 0x7f, 0x9f, 0xbf, 0xff ; .byte 0x01, 0x02, 0x03, 0x04 @@ -88,7 +91,8 @@ block0: ; VCode: ; block0: -; vle16.v v10,[const(0)] #avl=8, #vtype=(e16, m1, ta, ma) +; vle16.v v2,[const(0)] #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -96,11 +100,11 @@ block0: ; .byte 0x57, 0x70, 0x84, 0xcc ; auipc t6, 0 ; addi t6, t6, 0x1c -; .byte 0x07, 0xd5, 0x0f, 0x02 +; .byte 0x07, 0xd1, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0xff, 0x00 ; .byte 0xff, 0x7f, 0xff, 0xff ; .byte 0x01, 0x00, 0x02, 0x00 @@ -114,7 +118,8 @@ block0: ; VCode: ; block0: -; vle32.v v10,[const(0)] #avl=4, #vtype=(e32, m1, ta, ma) +; vle32.v v2,[const(0)] #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -122,12 +127,12 @@ block0: ; .byte 0x57, 0x70, 0x02, 0xcd ; auipc t6, 0 ; addi t6, t6, 0x1c -; .byte 0x07, 0xe5, 0x0f, 0x02 +; .byte 0x07, 0xe1, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0xff, 0xff, 0xff, 0xff ; .byte 0x01, 0x00, 0x00, 0x00 ; .byte 0x02, 0x00, 0x00, 0x00 @@ -140,7 +145,8 @@ block0: ; VCode: ; block0: -; vle64.v v10,[const(0)] #avl=2, #vtype=(e64, m1, ta, ma) +; vle64.v v2,[const(0)] #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v2,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) ; ret ; ; Disassembled: @@ -148,13 +154,13 @@ block0: ; .byte 0x57, 0x70, 0x81, 0xcd ; auipc t6, 0 ; addi t6, t6, 0x1c -; .byte 0x07, 0xf5, 0x0f, 0x02 +; .byte 0x07, 0xf1, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x01, 0x05, 0x02 ; ret ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 -; .byte 0x00, 0x00, 0x00, 0x00 ; .byte 0xff, 0xff, 0xff, 0xff ; .byte 0xff, 0xff, 0xff, 0xff diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif b/cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif index 0eff3a179724..f6412fb3e148 100644 --- 
a/cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-vstate.clif @@ -12,19 +12,46 @@ block0(v0: i8x16, v1: i16x8): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v10,v10,v10 #avl=16, #vtype=(e8, m1, ta, ma) -; vadd.vv v5,v11,v11 #avl=8, #vtype=(e16, m1, ta, ma) -; vadd.vv v11,v5,v5 #avl=8, #vtype=(e16, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v8,v1,v1 #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v9,v3,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vadd.vv v9,v9,v9 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v8,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v9,16(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0x05, 0xa5, 0x02 +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x84, 0x10, 0x02 ; .byte 0x57, 0x70, 0x84, 0xcc -; .byte 0xd7, 0x82, 0xb5, 0x02 -; .byte 0xd7, 0x85, 0x52, 0x02 +; .byte 0xd7, 0x84, 0x31, 0x02 +; .byte 0xd7, 0x84, 0x94, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x04, 0x05, 0x02 +; addi t6, a0, 0x10 +; .byte 0xa7, 0x84, 0x0f, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret ;; When the block changes, we need to reemit the vector state instruction @@ -44,25 +71,48 @@ block2(v6: i8x16, v7: i8x16): } ; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp ; block0: -; vadd.vv v5,v10,v11 #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v8,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) ; j label1 ; block1: -; vadd.vv v6,v11,v5 #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v9,v3,v8 #avl=16, #vtype=(e8, m1, ta, ma) ; j label2 ; block2: -; vadd.vv v10,v5,v6 #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vv v10,v8,v9 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v10,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 ; ret ; ; Disassembled: ; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0xd7, 0x82, 0xa5, 0x02 -; block1: ; offset 0x8 +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; .byte 0x57, 0x84, 0x11, 0x02 +; block2: ; offset 0x28 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0x83, 0xb2, 0x02 -; block2: ; offset 0x10 +; .byte 0xd7, 0x04, 0x34, 0x02 +; block3: ; offset 0x30 ; .byte 0x57, 0x70, 0x08, 0xcc -; .byte 0x57, 0x05, 0x53, 0x02 +; .byte 0x57, 0x85, 0x84, 0x02 +; .byte 0x27, 0x05, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 ; ret diff --git a/cranelift/filetests/filetests/runtests/simd-vconst-large.clif b/cranelift/filetests/filetests/runtests/simd-vconst-large.clif new file mode 100644 index 000000000000..dbcc64763de8 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-vconst-large.clif @@ -0,0 +1,143 @@ +test run +target s390x +target aarch64 +set enable_simd +target x86_64 has_sse3 has_ssse3 has_sse41 +target x86_64 has_sse3 has_ssse3 has_sse41 has_avx +target riscv64 has_v + + +;; This tests that vconst correctly loads large offsets into the constant 
pool +function %vconst_iadd_large() -> i32x4 { +block0: + v0 = vconst.i32x4 [0 0 0 0] + v1 = vconst.i32x4 [1 1 1 1] + v2 = vconst.i32x4 [2 2 2 2] + v3 = vconst.i32x4 [3 3 3 3] + v4 = vconst.i32x4 [4 4 4 4] + v5 = vconst.i32x4 [5 5 5 5] + v6 = vconst.i32x4 [6 6 6 6] + v7 = vconst.i32x4 [7 7 7 7] + v8 = vconst.i32x4 [8 8 8 8] + v9 = vconst.i32x4 [9 9 9 9] + v10 = vconst.i32x4 [10 10 10 10] + v11 = vconst.i32x4 [11 11 11 11] + v12 = vconst.i32x4 [12 12 12 12] + v13 = vconst.i32x4 [13 13 13 13] + v14 = vconst.i32x4 [14 14 14 14] + v15 = vconst.i32x4 [15 15 15 15] + v16 = vconst.i32x4 [16 16 16 16] + v17 = vconst.i32x4 [17 17 17 17] + v18 = vconst.i32x4 [18 18 18 18] + v19 = vconst.i32x4 [19 19 19 19] + v20 = vconst.i32x4 [20 20 20 20] + v21 = vconst.i32x4 [21 21 21 21] + v22 = vconst.i32x4 [22 22 22 22] + v23 = vconst.i32x4 [23 23 23 23] + v24 = vconst.i32x4 [24 24 24 24] + v25 = vconst.i32x4 [25 25 25 25] + v26 = vconst.i32x4 [26 26 26 26] + v27 = vconst.i32x4 [27 27 27 27] + v28 = vconst.i32x4 [28 28 28 28] + v29 = vconst.i32x4 [29 29 29 29] + v30 = vconst.i32x4 [30 30 30 30] + v31 = vconst.i32x4 [31 31 31 31] + v32 = vconst.i32x4 [32 32 32 32] + v33 = vconst.i32x4 [33 33 33 33] + v34 = vconst.i32x4 [34 34 34 34] + v35 = vconst.i32x4 [35 35 35 35] + v36 = vconst.i32x4 [36 36 36 36] + v37 = vconst.i32x4 [37 37 37 37] + v38 = vconst.i32x4 [38 38 38 38] + v39 = vconst.i32x4 [39 39 39 39] + v40 = vconst.i32x4 [40 40 40 40] + v41 = vconst.i32x4 [41 41 41 41] + v42 = vconst.i32x4 [42 42 42 42] + v43 = vconst.i32x4 [43 43 43 43] + v44 = vconst.i32x4 [44 44 44 44] + v45 = vconst.i32x4 [45 45 45 45] + v46 = vconst.i32x4 [46 46 46 46] + v47 = vconst.i32x4 [47 47 47 47] + v48 = vconst.i32x4 [48 48 48 48] + v49 = vconst.i32x4 [49 49 49 49] + v50 = vconst.i32x4 [50 50 50 50] + v51 = vconst.i32x4 [51 51 51 51] + v52 = vconst.i32x4 [52 52 52 52] + v53 = vconst.i32x4 [53 53 53 53] + v54 = vconst.i32x4 [54 54 54 54] + v55 = vconst.i32x4 [55 55 55 55] + v56 = vconst.i32x4 [56 56 56 56] + v57 = vconst.i32x4 [57 57 57 57] + v58 = vconst.i32x4 [58 58 58 58] + v59 = vconst.i32x4 [59 59 59 59] + v60 = vconst.i32x4 [60 60 60 60] + v61 = vconst.i32x4 [61 61 61 61] + v62 = vconst.i32x4 [62 62 62 62] + v63 = vconst.i32x4 [63 63 63 63] + + v64 = iadd v0, v1 + v65 = iadd v64, v2 + v66 = iadd v65, v3 + v67 = iadd v66, v4 + v68 = iadd v67, v5 + v69 = iadd v68, v6 + v70 = iadd v69, v7 + v71 = iadd v70, v8 + v72 = iadd v71, v9 + v73 = iadd v72, v10 + v74 = iadd v73, v11 + v75 = iadd v74, v12 + v76 = iadd v75, v13 + v77 = iadd v76, v14 + v78 = iadd v77, v15 + v79 = iadd v78, v16 + v80 = iadd v79, v17 + v81 = iadd v80, v18 + v82 = iadd v81, v19 + v83 = iadd v82, v20 + v84 = iadd v83, v21 + v85 = iadd v84, v22 + v86 = iadd v85, v23 + v87 = iadd v86, v24 + v88 = iadd v87, v25 + v89 = iadd v88, v26 + v90 = iadd v89, v27 + v91 = iadd v90, v28 + v92 = iadd v91, v29 + v93 = iadd v92, v30 + v94 = iadd v93, v31 + v95 = iadd v94, v32 + v96 = iadd v95, v33 + v97 = iadd v96, v34 + v98 = iadd v97, v35 + v99 = iadd v98, v36 + v100 = iadd v99, v37 + v101 = iadd v100, v38 + v102 = iadd v101, v39 + v103 = iadd v102, v40 + v104 = iadd v103, v41 + v105 = iadd v104, v42 + v106 = iadd v105, v43 + v107 = iadd v106, v44 + v108 = iadd v107, v45 + v109 = iadd v108, v46 + v110 = iadd v109, v47 + v111 = iadd v110, v48 + v112 = iadd v111, v49 + v113 = iadd v112, v50 + v114 = iadd v113, v51 + v115 = iadd v114, v52 + v116 = iadd v115, v53 + v117 = iadd v116, v54 + v118 = iadd v117, v55 + v119 = iadd v118, v56 + v120 = iadd v119, v57 + v121 = iadd v120, 
v58 + v122 = iadd v121, v59 + v123 = iadd v122, v60 + v124 = iadd v123, v61 + v125 = iadd v124, v62 + v126 = iadd v125, v63 + return v126 +} +; run: %vconst_iadd_large() == [0x7e0 0x7e0 0x7e0 0x7e0] \ No newline at end of file
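;; For reference, a quick arithmetic check of the run expectation above: each
;; i32 lane accumulates the vconst lane values 0 through 63, so the expected
;; per-lane result is
;;   0 + 1 + ... + 63 = 63 * 64 / 2 = 2016 = 0x7e0
;; which matches the asserted result [0x7e0 0x7e0 0x7e0 0x7e0].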