diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs index 0e662d8054bc..d654e77765a6 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs @@ -17,7 +17,7 @@ use crate::{settings, CodegenError, CodegenResult}; pub use crate::ir::condcodes::FloatCC; use alloc::vec::Vec; -use regalloc2::{PRegSet, VReg}; +use regalloc2::{PRegSet, RegClass, VReg}; use smallvec::{smallvec, SmallVec}; use std::boxed::Box; use std::string::{String, ToString}; @@ -53,11 +53,11 @@ pub(crate) type VecWritableReg = Vec<Writable<Reg>>; //============================================================================= // Instructions (top level): definition -use crate::isa::riscv64::lower::isle::generated_code::MInst; pub use crate::isa::riscv64::lower::isle::generated_code::{ AluOPRRI, AluOPRRR, AtomicOP, FClassResult, FFlagsException, FloatRoundOP, FloatSelectOP, FpuOPRR, FpuOPRRR, FpuOPRRRR, IntSelectOP, LoadOP, MInst as Inst, StoreOP, FRM, }; +use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRR}; type BoxCallInfo = Box<CallInfo>; type BoxCallIndInfo = Box<CallIndInfo>; @@ -624,12 +624,21 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan // gen_prologue is called at emit stage. // no need let reg alloc know. } - &Inst::VecAluRRR { vd, vs1, vs2, .. } => { + &Inst::VecAluRRR { + op, vd, vs1, vs2, .. + } => { + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert_eq!(vs2.class(), RegClass::Vector); + debug_assert_eq!(vs1.class(), op.vs1_regclass()); + collector.reg_use(vs1); collector.reg_use(vs2); collector.reg_def(vd); } &Inst::VecAluRRImm5 { vd, vs2, .. } => { + debug_assert_eq!(vd.to_reg().class(), RegClass::Vector); + debug_assert_eq!(vs2.class(), RegClass::Vector); + collector.reg_use(vs2); collector.reg_def(vd); } @@ -1559,7 +1568,12 @@ impl Inst { // Note: vs2 and vs1 here are opposite to the standard scalar ordering. // This is noted in Section 10.1 of the RISC-V Vector spec. 
- format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate) + match (op, vs1) { + (VecAluOpRRR::VrsubVX, vs1) if vs1 == zero_reg() => { + format!("vneg.v {},{} {}", vd_s, vs2_s, vstate) + } + _ => format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate), + } } &Inst::VecAluRRImm5 { op, diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index c6029fb2fdeb..603da8690ea4 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -4,6 +4,7 @@ use crate::isa::riscv64::lower::isle::generated_code::{ VecAMode, VecAluOpRRImm5, VecAluOpRRR, VecAvl, VecElementWidth, VecLmul, VecMaskMode, VecOpCategory, VecOpMasking, VecTailMode, }; +use crate::machinst::RegClass; use crate::Reg; use core::fmt; @@ -245,27 +246,46 @@ impl VecAluOpRRR { 0x57 } pub fn funct3(&self) -> u32 { - match self { - VecAluOpRRR::Vadd - | VecAluOpRRR::Vsub - | VecAluOpRRR::Vand - | VecAluOpRRR::Vor - | VecAluOpRRR::Vxor => VecOpCategory::OPIVV, - VecAluOpRRR::Vmul | VecAluOpRRR::Vmulh | VecAluOpRRR::Vmulhu => VecOpCategory::OPMVV, - } - .encode() + self.category().encode() } pub fn funct6(&self) -> u32 { // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc match self { - VecAluOpRRR::Vadd => 0b000000, - VecAluOpRRR::Vsub => 0b000010, - VecAluOpRRR::Vmul => 0b100101, - VecAluOpRRR::Vmulh => 0b100111, - VecAluOpRRR::Vmulhu => 0b100100, - VecAluOpRRR::Vand => 0b001001, - VecAluOpRRR::Vor => 0b001010, - VecAluOpRRR::Vxor => 0b001011, + VecAluOpRRR::VaddVV | VecAluOpRRR::VaddVX => 0b000000, + VecAluOpRRR::VsubVV | VecAluOpRRR::VsubVX => 0b000010, + VecAluOpRRR::VrsubVX => 0b000011, + VecAluOpRRR::VmulVV => 0b100101, + VecAluOpRRR::VmulhVV => 0b100111, + VecAluOpRRR::VmulhuVV => 0b100100, + VecAluOpRRR::VandVV => 0b001001, + VecAluOpRRR::VorVV => 0b001010, + VecAluOpRRR::VxorVV => 0b001011, + } + } + + pub fn category(&self) -> VecOpCategory { + match self { + VecAluOpRRR::VaddVV + | VecAluOpRRR::VsubVV + | VecAluOpRRR::VandVV + | VecAluOpRRR::VorVV + | VecAluOpRRR::VxorVV => VecOpCategory::OPIVV, + VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => { + VecOpCategory::OPMVV + } + VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX => { + VecOpCategory::OPIVX + } + } + } + + // vs1 is the only variable source, vs2 is fixed. 
+ pub fn vs1_regclass(&self) -> RegClass { + match self.category() { + VecOpCategory::OPIVV | VecOpCategory::OPFVV | VecOpCategory::OPMVV => RegClass::Vector, + VecOpCategory::OPIVX | VecOpCategory::OPMVX => RegClass::Int, + VecOpCategory::OPFVF => RegClass::Float, + _ => unreachable!(), } } } @@ -274,8 +294,8 @@ impl fmt::Display for VecAluOpRRR { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut s = format!("{self:?}"); s.make_ascii_lowercase(); - s.push_str(".vv"); - f.write_str(&s) + let (opcode, category) = s.split_at(s.len() - 2); + f.write_str(&format!("{}.{}", opcode, category)) } } @@ -290,7 +310,8 @@ impl VecAluOpRRImm5 { pub fn funct6(&self) -> u32 { // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc match self { - VecAluOpRRImm5::Vadd => 0b000000, + VecAluOpRRImm5::VaddVI => 0b000000, + VecAluOpRRImm5::VrsubVI => 0b000011, } } } @@ -299,8 +320,8 @@ impl fmt::Display for VecAluOpRRImm5 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut s = format!("{self:?}"); s.make_ascii_lowercase(); - s.push_str(".vi"); - f.write_str(&s) + let (opcode, category) = s.split_at(s.len() - 2); + f.write_str(&format!("{}.{}", opcode, category)) } } diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 01a60f80abfa..ee02f7b7c503 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -83,19 +83,26 @@ ;; Register to Register ALU Ops (type VecAluOpRRR (enum - (Vadd) - (Vsub) - (Vmul) - (Vmulh) - (Vmulhu) - (Vand) - (Vor) - (Vxor) + ;; Vector-Vector Opcodes + (VaddVV) + (VsubVV) + (VmulVV) + (VmulhVV) + (VmulhuVV) + (VandVV) + (VorVV) + (VxorVV) + + ;; Vector-Scalar Opcodes + (VaddVX) + (VsubVX) + (VrsubVX) )) ;; Register-Imm ALU Ops (type VecAluOpRRImm5 (enum - (Vadd) + (VaddVI) + (VrsubVI) )) @@ -181,44 +188,69 @@ ;; Helper for emitting the `vadd.vv` instruction. (decl rv_vadd_vv (Reg Reg VState) Reg) (rule (rv_vadd_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.Vadd) vs2 vs1 vstate)) + (vec_alu_rrr (VecAluOpRRR.VaddVV) vs2 vs1 vstate)) + +;; Helper for emitting the `vadd.vx` instruction. +(decl rv_vadd_vx (Reg Reg VState) Reg) +(rule (rv_vadd_vx vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VaddVX) vs2 vs1 vstate)) ;; Helper for emitting the `vadd.vi` instruction. (decl rv_vadd_vi (Reg Imm5 VState) Reg) (rule (rv_vadd_vi vs2 imm vstate) - (vec_alu_rr_imm5 (VecAluOpRRImm5.Vadd) vs2 imm vstate)) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VaddVI) vs2 imm vstate)) ;; Helper for emitting the `vsub.vv` instruction. (decl rv_vsub_vv (Reg Reg VState) Reg) (rule (rv_vsub_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.Vsub) vs2 vs1 vstate)) + (vec_alu_rrr (VecAluOpRRR.VsubVV) vs2 vs1 vstate)) + +;; Helper for emitting the `vsub.vx` instruction. +(decl rv_vsub_vx (Reg Reg VState) Reg) +(rule (rv_vsub_vx vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VsubVX) vs2 vs1 vstate)) + +;; Helper for emitting the `vrsub.vx` instruction. +(decl rv_vrsub_vx (Reg Reg VState) Reg) +(rule (rv_vrsub_vx vs2 vs1 vstate) + (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 vstate)) + +;; Helper for emitting the `vneg.v` pseudo-instruction. +(decl rv_vneg_v (Reg VState) Reg) +(rule (rv_vneg_v vs2 vstate) + (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 (zero_reg) vstate)) + +;; Helper for emitting the `vrsub.vi` instruction. 
+(decl rv_vrsub_vi (Reg Imm5 VState) Reg) +(rule (rv_vrsub_vi vs2 imm vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VrsubVI) vs2 imm vstate)) ;; Helper for emitting the `vmul.vv` instruction. (decl rv_vmul_vv (Reg Reg VState) Reg) (rule (rv_vmul_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.Vmul) vs2 vs1 vstate)) + (vec_alu_rrr (VecAluOpRRR.VmulVV) vs2 vs1 vstate)) ;; Helper for emitting the `vmulh.vv` instruction. (decl rv_vmulh_vv (Reg Reg VState) Reg) (rule (rv_vmulh_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.Vmulh) vs2 vs1 vstate)) + (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 vstate)) ;; Helper for emitting the `vmulhu.vv` instruction. (decl rv_vmulhu_vv (Reg Reg VState) Reg) (rule (rv_vmulhu_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.Vmulhu) vs2 vs1 vstate)) + (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 vstate)) ;; Helper for emitting the `vand.vv` instruction. (decl rv_vand_vv (Reg Reg VState) Reg) (rule (rv_vand_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.Vand) vs2 vs1 vstate)) + (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 vstate)) ;; Helper for emitting the `vor.vv` instruction. (decl rv_vor_vv (Reg Reg VState) Reg) (rule (rv_vor_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.Vor) vs2 vs1 vstate)) + (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 vstate)) ;; Helper for emitting the `vxor.vv` instruction. (decl rv_vxor_vv (Reg Reg VState) Reg) (rule (rv_vxor_vv vs2 vs1 vstate) - (vec_alu_rrr (VecAluOpRRR.Vxor) vs2 vs1 vstate)) + (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 vstate)) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 207ea631d6b4..cf30e6e89b0f 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -110,10 +110,16 @@ (rule 8 (lower (has_type (ty_vec_fits_in_register ty) (iadd x y))) (rv_vadd_vv x y ty)) -(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (replicated_imm5 y)))) +(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (splat y)))) + (rv_vadd_vx x y ty)) + +(rule 10 (lower (has_type (ty_vec_fits_in_register ty) (iadd (splat x) y))) + (rv_vadd_vx y x ty)) + +(rule 11 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (replicated_imm5 y)))) (rv_vadd_vi x y ty)) -(rule 10 (lower (has_type (ty_vec_fits_in_register ty) (iadd (replicated_imm5 x) y))) +(rule 12 (lower (has_type (ty_vec_fits_in_register ty) (iadd (replicated_imm5 x) y))) (rv_vadd_vi y x ty)) ;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;; @@ -140,12 +146,25 @@ (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (isub x y))) (rv_vsub_vv x y ty)) +(rule 4 (lower (has_type (ty_vec_fits_in_register ty) (isub x (splat y)))) + (rv_vsub_vx x y ty)) + +(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (isub (splat x) y))) + (rv_vrsub_vx y x ty)) + +(rule 6 (lower (has_type (ty_vec_fits_in_register ty) (isub (replicated_imm5 x) y))) + (rv_vrsub_vi y x ty)) + + ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; `i64` and smaller. 
-(rule (lower (has_type ty (ineg val))) +(rule (lower (has_type (ty_int ty) (ineg val))) (neg ty val)) +(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (ineg x))) + (rv_vneg_v x ty)) + + ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (imul x y))) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif index f37d39bc19bf..dc5790303ced 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif @@ -331,3 +331,161 @@ block0(v0: i64x2): ; addi sp, sp, 0x10 ; ret +function %iadd_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = iadd v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x42, 0x15, 0x02 +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iadd_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = iadd v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x42, 0x15, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iadd_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = iadd v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %iadd_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = iadd v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vadd.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) 
+; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x02 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-ineg.clif b/cranelift/filetests/filetests/isa/riscv64/simd-ineg.clif new file mode 100644 index 000000000000..36aba8eb32a8 --- /dev/null +++ b/cranelift/filetests/filetests/isa/riscv64/simd-ineg.clif @@ -0,0 +1,159 @@ +test compile precise-output +set unwind_info=false +target riscv64 has_v + + +function %ineg_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = ineg v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %ineg_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = ineg v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %ineg_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = ineg v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %ineg_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = ineg v0 + return v1 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vneg.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; 
sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0x42, 0x10, 0x0e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif b/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif index 550cddb7dd09..9c7f08e2fa7e 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif @@ -169,3 +169,83 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %isub_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = isub v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vsub.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x0a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %isub_splat_reverse_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = isub v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vrsub.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x0e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-iadd-splat.clif b/cranelift/filetests/filetests/runtests/simd-iadd-splat.clif index 2fa55bc142aa..bebad9eb6383 100644 --- a/cranelift/filetests/filetests/runtests/simd-iadd-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-iadd-splat.clif @@ -8,38 +8,71 @@ target x86_64 target x86_64 skylake target riscv64 has_v -function %iadd_splat_i8x16(i8x16) -> i8x16 { +function %iadd_splat_const_i8x16(i8x16) -> i8x16 { block0(v0: i8x16): v1 = iconst.i8 5 v2 = splat.i8x16 v1 v3 = iadd v0, v2 return v3 } -; run: %iadd_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] +; run: %iadd_splat_const_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21] -function %iadd_splat_i16x8(i16x8) -> i16x8 { +function %iadd_splat_const_i16x8(i16x8) -> i16x8 { block0(v0: i16x8): v1 = iconst.i16 -16 v2 = splat.i16x8 v1 v3 = iadd v0, v2 return v3 } -; run: %iadd_splat_i16x8([1 2 3 4 5 6 7 8]) == [-15 -14 -13 -12 -11 -10 -9 -8] +; run: %iadd_splat_const_i16x8([1 2 3 4 5 6 7 8]) == [-15 -14 -13 -12 -11 -10 -9 -8] 
-function %iadd_splat_i32x4(i32x4) -> i32x4 { +function %iadd_splat_const_i32x4(i32x4) -> i32x4 { block0(v0: i32x4): v1 = iconst.i32 15 v2 = splat.i32x4 v1 v3 = iadd v0, v2 return v3 } -; run: %iadd_splat_i32x4([1 2 3 4]) == [16 17 18 19] +; run: %iadd_splat_const_i32x4([1 2 3 4]) == [16 17 18 19] -function %iadd_splat_i64x2(i64x2) -> i64x2 { +function %iadd_splat_const_i64x2(i64x2) -> i64x2 { block0(v0: i64x2): v1 = iconst.i64 -5 v2 = splat.i64x2 v1 v3 = iadd v2, v0 return v3 } -; run: %iadd_splat_i64x2([1 2]) == [-4 -3] +; run: %iadd_splat_const_i64x2([1 2]) == [-4 -3] + + +function %iadd_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = iadd v0, v2 + return v3 +} +; run: %iadd_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], -15) == [-14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1] + +function %iadd_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = iadd v0, v2 + return v3 +} +; run: %iadd_splat_i16x8([1 2 3 4 5 6 7 8], -10) == [-9 -8 -7 -6 -5 -4 -3 -2] + +function %iadd_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = iadd v0, v2 + return v3 +} +; run: %iadd_splat_i32x4([1 2 3 4], 22) == [23 24 25 26] + +function %iadd_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = iadd v2, v0 + return v3 +} +; run: %iadd_splat_i64x2([1 2], 10) == [11 12] diff --git a/cranelift/filetests/filetests/runtests/simd-ineg.clif b/cranelift/filetests/filetests/runtests/simd-ineg.clif index 4cc78bdf795b..ff26ea5c6521 100644 --- a/cranelift/filetests/filetests/runtests/simd-ineg.clif +++ b/cranelift/filetests/filetests/runtests/simd-ineg.clif @@ -4,6 +4,21 @@ target s390x set enable_simd target x86_64 target x86_64 skylake +target riscv64 has_v + +function %ineg_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = ineg v0 + return v1 +} +; run: %ineg_i8x16([-1 10 2 4 5 6 7 8 9 10 -11 -12 -13 -14 -15 -16]) == [1 -10 -2 -4 -5 -6 -7 -8 -9 -10 11 12 13 14 15 16] + +function %ineg_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = ineg v0 + return v1 +} +; run: %ineg_i16x8([1 2 -3 -4 5 6 -7 -8]) == [-1 -2 3 4 -5 -6 7 8] function %ineg_i32x4(i32x4) -> i32x4 { block0(v0: i32x4): @@ -11,3 +26,11 @@ block0(v0: i32x4): return v1 } ; run: %ineg_i32x4([1 1 1 1]) == [-1 -1 -1 -1] +; run: %ineg_i32x4([1 -9 1 -10]) == [-1 9 -1 10] + +function %ineg_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = ineg v0 + return v1 +} +; run: %ineg_i64x2([99 -10]) == [-99 10] diff --git a/cranelift/filetests/filetests/runtests/simd-isub-splat.clif b/cranelift/filetests/filetests/runtests/simd-isub-splat.clif new file mode 100644 index 000000000000..975c2fadb006 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-isub-splat.clif @@ -0,0 +1,153 @@ +test interpret +test run +target aarch64 +target s390x +target x86_64 has_sse41=false +set enable_simd +target x86_64 +target x86_64 skylake +target riscv64 has_v + + +function %isub_splat_reverse_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = isub v2, v0 + return v3 +} +; run: %isub_splat_reverse_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], 22) == [21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6] + +function %isub_splat_reverse_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = isub v2, v0 + return v3 +} +; run: %isub_splat_reverse_i16x8([1 2 3 4 5 6 7 8], 22) == [21 20 19 18 17 16 15 14] + +function %isub_splat_reverse_i32x4(i32x4, i32) 
-> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = isub v2, v0 + return v3 +} +; run: %isub_splat_reverse_i32x4([1 2 3 4], 22) == [21 20 19 18] + +function %isub_splat_reverse_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = isub v2, v0 + return v3 +} +; run: %isub_splat_reverse_i64x2([1 2], 22) == [21 20] + + + +function %isub_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = isub v0, v2 + return v3 +} +; run: %isub_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], 22) == [-21 -20 -19 -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6] + +function %isub_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = isub v0, v2 + return v3 +} +; run: %isub_splat_i16x8([1 2 3 4 5 6 7 8], 22) == [-21 -20 -19 -18 -17 -16 -15 -14] + +function %isub_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = isub v0, v2 + return v3 +} +; run: %isub_splat_i32x4([1 2 3 4], 22) == [-21 -20 -19 -18] + +function %isub_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = isub v0, v2 + return v3 +} +; run: %isub_splat_i64x2([1 2], 22) == [-21 -20] + + + +function %isub_splat_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = isub v0, v2 + return v3 +} +; run: %isub_splat_const_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [-4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11] + +function %isub_splat_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = isub v0, v2 + return v3 +} +; run: %isub_splat_const_i16x8([1 2 3 4 5 6 7 8]) == [17 18 19 20 21 22 23 24] + +function %isub_splat_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = isub v0, v2 + return v3 +} +; run: %isub_splat_const_i32x4([1 2 3 4]) == [-14 -13 -12 -11] + +function %isub_splat_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = isub v0, v2 + return v3 +} +; run: %isub_splat_const_i64x2([1 2]) == [6 7] + + + +function %isub_splat_const_reverse_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = isub v2, v0 + return v3 +} +; run: %isub_splat_const_reverse_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [4 3 2 1 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11] + +function %isub_splat_const_reverse_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = isub v2, v0 + return v3 +} +; run: %isub_splat_const_reverse_i16x8([1 2 3 4 5 6 7 8]) == [-17 -18 -19 -20 -21 -22 -23 -24] + +function %isub_splat_const_reverse_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = isub v2, v0 + return v3 +} +; run: %isub_splat_const_reverse_i32x4([1 2 3 4]) == [14 13 12 11] + +function %isub_splat_const_reverse_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = isub v2, v0 + return v3 +} +; run: %isub_splat_const_reverse_i64x2([1 2]) == [-6 -7] +
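
Appended illustration, not part of the patch: a minimal, self-contained Rust sketch of the two facts the `vector.rs` changes rely on. The RVV operand category determines what occupies the `vs1` slot (a vector register for `.vv` forms, an integer scalar register for `.vx` forms), and `vneg.v vd,vs2` is only an assembler alias for `vrsub.vx vd,vs2,x0`, i.e. `0 - vs2`. All names below are illustrative stand-ins, not the cranelift-codegen types.

// Illustrative stand-ins only; these are not the cranelift-codegen definitions.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Category {
    OpIVV, // vector-vector: the vs1 slot holds a vector register
    OpIVX, // vector-scalar: the vs1 slot holds an integer (x) register
}

#[derive(Debug, Clone, Copy, PartialEq)]
enum RegClass {
    Int,
    Vector,
}

#[derive(Debug, Clone, Copy, PartialEq)]
enum Op {
    VaddVV,
    VaddVX,
    VsubVX,
    VrsubVX,
}

impl Op {
    fn category(self) -> Category {
        match self {
            Op::VaddVV => Category::OpIVV,
            Op::VaddVX | Op::VsubVX | Op::VrsubVX => Category::OpIVX,
        }
    }

    // The register class expected in the vs1 slot follows from the category alone.
    fn vs1_regclass(self) -> RegClass {
        match self.category() {
            Category::OpIVV => RegClass::Vector,
            Category::OpIVX => RegClass::Int,
        }
    }
}

fn main() {
    assert_eq!(Op::VaddVX.vs1_regclass(), RegClass::Int);
    assert_eq!(Op::VaddVV.vs1_regclass(), RegClass::Vector);

    // `vneg.v vd,vs2` prints for `vrsub.vx vd,vs2,x0` because 0 - x == -x per lane.
    let lane: i8 = 7;
    assert_eq!(0i8.wrapping_sub(lane), lane.wrapping_neg());
}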
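
One more appended sketch, again with illustrative names only: why the `isub` lowerings need two different instructions while `iadd` needs one. Addition commutes, so `iadd x (splat y)` and `iadd (splat x) y` both become `vadd.vx`; subtraction does not, so `isub x (splat y)` uses `vsub.vx` (`vd[i] = vs2[i] - rs1`) while `isub (splat x) y` uses `vrsub.vx` (`vd[i] = rs1 - vs2[i]`). The lane-wise model below reproduces the expectations in the new `simd-isub-splat` run tests.

// Lane-wise models of the two RVV subtract forms the lowering rules pick between
// (illustrative only; the real backend emits instructions instead of computing lanes).
fn vsub_vx(vs2: &[i32], rs1: i32) -> Vec<i32> {
    vs2.iter().map(|&e| e.wrapping_sub(rs1)).collect() // vd[i] = vs2[i] - rs1
}

fn vrsub_vx(vs2: &[i32], rs1: i32) -> Vec<i32> {
    vs2.iter().map(|&e| rs1.wrapping_sub(e)).collect() // vd[i] = rs1 - vs2[i]
}

fn main() {
    let y = [1, 2, 3, 4]; // vector operand
    let x = 22; // splatted scalar

    // `isub y, (splat x)` -> vsub.vx
    assert_eq!(vsub_vx(&y, x), vec![-21, -20, -19, -18]);

    // `isub (splat x), y` -> vrsub.vx (operands reversed, hence "reverse subtract")
    assert_eq!(vrsub_vx(&y, x), vec![21, 20, 19, 18]);
}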