From dc01a9f7308ce50ab98ca9b08bc849bdb8c0ff92 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sat, 20 May 2023 12:42:14 +0100
Subject: [PATCH 1/8] riscv64: Implement SIMD `bitselect`

---
 cranelift/codegen/src/isa/riscv64/inst.isle   | 17 +-
 cranelift/codegen/src/isa/riscv64/inst/mod.rs |  9 +-
 .../codegen/src/isa/riscv64/inst/vector.rs    | 14 +-
 .../codegen/src/isa/riscv64/inst_vector.isle  | 14 ++
 cranelift/codegen/src/isa/riscv64/lower.isle  | 20 +-
 .../codegen/src/isa/riscv64/lower/isle.rs     |  4 +
 .../filetests/isa/riscv64/simd-bitselect.clif | 206 ++++++++++++++++++
 .../filetests/runtests/simd-bitselect.clif    | 21 ++
 8 files changed, 284 insertions(+), 21 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif

diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle
index 6b639ad311b9..7b12a36a049e 100644
--- a/cranelift/codegen/src/isa/riscv64/inst.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst.isle
@@ -1365,6 +1365,10 @@
 (decl imm5_from_u64 (Imm5) u64)
 (extern extractor imm5_from_u64 imm5_from_u64)

+;; Construct an Imm5 from an i8
+(decl pure partial imm5_from_i8 (i8) Imm5)
+(extern constructor imm5_from_i8 imm5_from_i8)
+
 ;; Extractor that matches a `Value` equivalent to a replicated Imm5 on all lanes.
 ;; TODO: Try matching vconst here as well
 (decl replicated_imm5 (Imm5) Value)
@@ -2215,19 +2219,6 @@
 (decl alloc_vec_writable (Type) VecWritableReg)
 (extern constructor alloc_vec_writable alloc_vec_writable)

-(decl gen_bitselect (Type Reg Reg Reg) Reg)
-(rule
-  (gen_bitselect ty c x y)
-  (let
-    ((tmp_x Reg (rv_and c x))
-     ;;;inverse condition
-     (c_inverse Reg (rv_not c))
-     ;;;get all y part.
-     (tmp_y Reg (rv_and c_inverse y))
-     ;;;get reuslt.
-     (result Reg (rv_or tmp_x tmp_y)))
-    result))
-
 (decl gen_int_select (Type IntSelectOP ValueRegs ValueRegs) ValueRegs)
 (rule
   (gen_int_select ty op x y)
diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
index 659dcb0fa0cf..21b259abd562 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -57,7 +57,7 @@ pub use crate::isa::riscv64::lower::isle::generated_code::{
     AluOPRRI, AluOPRRR, AtomicOP, FClassResult, FFlagsException, FloatRoundOP, FloatSelectOP,
     FpuOPRR, FpuOPRRR, FpuOPRRRR, IntSelectOP, LoadOP, MInst as Inst, StoreOP, FRM,
 };
-use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRR};
+use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRImm5, VecAluOpRRR};

 type BoxCallInfo = Box<CallInfo>;
 type BoxCallIndInfo = Box<CallIndInfo>;
@@ -1663,7 +1663,12 @@ impl Inst {
                     format!("{}", imm)
                 };

-                format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}")
+                match (op, imm) {
+                    (VecAluOpRRImm5::VxorVI, imm) if imm == Imm5::maybe_from_i8(-1).unwrap() => {
+                        format!("vnot.v {vd_s},{vs2_s}{mask} {vstate}")
+                    }
+                    _ => format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}"),
+                }
             }
             &Inst::VecAluRR {
                 op,
diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index 48d5192efaaf..874b6a015323 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -30,6 +30,9 @@ impl VecAvl {
     }
 }

+// TODO: Can we tell ISLE to derive this?
+impl Copy for VecAvl {}
+
 // TODO: Can we tell ISLE to derive this?
impl PartialEq for VecAvl { fn eq(&self, other: &Self) -> bool { @@ -154,7 +157,7 @@ impl fmt::Display for VecMaskMode { /// Vector Type (VType) /// /// vtype provides the default type used to interpret the contents of the vector register file. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub struct VType { pub sew: VecElementWidth, pub lmul: VecLmul, @@ -189,7 +192,7 @@ impl fmt::Display for VType { /// VState represents the state of the vector unit that each instruction expects before execution. /// Unlike VType or any of the other types here, VState is not a part of the RISC-V ISA. It is /// used by our instruction emission code to ensure that the vector unit is in the correct state. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub struct VState { pub avl: VecAvl, pub vtype: VType, @@ -354,6 +357,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, + VecAluOpRRImm5::VxorVI => 0b001011, VecAluOpRRImm5::VslidedownVI => 0b001111, VecAluOpRRImm5::VmergeVIM => 0b010111, } @@ -363,6 +367,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VslidedownVI | VecAluOpRRImm5::VmergeVIM => VecOpCategory::OPIVI, } @@ -371,7 +376,10 @@ impl VecAluOpRRImm5 { pub fn imm_is_unsigned(&self) -> bool { match self { VecAluOpRRImm5::VslidedownVI => true, - VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VmergeVIM => false, + VecAluOpRRImm5::VaddVI + | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VxorVI + | VecAluOpRRImm5::VmergeVIM => false, } } } diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 64294d2276fa..41b09e469970 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -125,6 +125,7 @@ ;; Regular VI Opcodes (VaddVI) (VrsubVI) + (VxorVI) (VslidedownVI) (VmergeVIM) )) @@ -329,6 +330,19 @@ (rule (rv_vxor_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vxor.vi` instruction. +;; Unlike other `vi` instructions the immediate is zero extended. +(decl rv_vxor_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vxor_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vnot.v` instruction. +;; This is just a mnemonic for `vxor.vi vd, vs, -1` +(decl rv_vnot_v (Reg VecOpMasking VState) Reg) +(rule (rv_vnot_v vs2 mask vstate) + (if-let neg1 (imm5_from_i8 -1)) + (rv_vxor_vi vs2 neg1 mask vstate)) + ;; Helper for emitting the `vfadd.vv` instruction. (decl rv_vfadd_vv (Reg Reg VecOpMasking VState) Reg) (rule (rv_vfadd_vv vs2 vs1 mask vstate) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index ef72b3568dd1..b6be299c2ad9 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -817,9 +817,23 @@ ;;;;; Rules for `bitselect`;;;;;;;;; -(rule - (lower (has_type ty (bitselect c x y))) - (gen_bitselect ty c x y)) +;; Do a (c & x) | (~c & y) operation. +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (bitselect c x y))) + (let ((tmp_x Reg (rv_and c x)) + (c_inverse Reg (rv_not c)) + (tmp_y Reg (rv_and c_inverse y))) + (rv_or tmp_x tmp_y))) + +;; For vectors, we also do the same operation. 
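+;; As a quick sanity check (a sketch with 4-bit values): if c=0b1100,
+;; x=0b1010, and y=0b0101, then (c & x) = 0b1000 and (~c & y) = 0b0001,
+;; so the result is 0b1001: the two high bits come from x and the two
+;; low bits come from y.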
+;; We can technically use any type in the bitwise operations, but prefer
+;; using the type of the inputs so that we avoid emitting unnecessary
+;; `vsetvl` instructions. It's likely that the vector unit is already
+;; configured for that type.
+(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (bitselect c x y)))
+  (let ((tmp_x Reg (rv_vand_vv c x (unmasked) ty))
+        (c_inverse Reg (rv_vnot_v c (unmasked) ty))
+        (tmp_y Reg (rv_vand_vv c_inverse y (unmasked) ty)))
+    (rv_vor_vv tmp_x tmp_y (unmasked) ty)))

 ;;;;; Rules for `isplit`;;;;;;;;;
 (rule
diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs
index 86e5daaff756..a17244112ba8 100644
--- a/cranelift/codegen/src/isa/riscv64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs
@@ -206,6 +206,10 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend>
         Imm5::maybe_from_i8(i8::try_from(arg0 as i64).ok()?)
     }
     #[inline]
+    fn imm5_from_i8(&mut self, arg0: i8) -> Option<Imm5> {
+        Imm5::maybe_from_i8(arg0)
+    }
+    #[inline]
     fn uimm5_bitcast_to_imm5(&mut self, arg0: UImm5) -> Imm5 {
         Imm5::from_bits(arg0.bits() as u8)
     }
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
new file mode 100644
index 000000000000..8331763f3383
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
@@ -0,0 +1,206 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+function %bitselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2, v2: i64x2):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vand.vv v8,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
+; vnot.v v10,v1 #avl=2, #vtype=(e64, m1, ta, ma)
+; vand.vv v12,v10,v5 #avl=2, #vtype=(e64, m1, ta, ma)
+; vor.vv v14,v8,v12 #avl=2, #vtype=(e64, m1, ta, ma)
+; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; addi t6, s0, 0x20
+; .byte 0x87, 0x81, 0x0f, 0x02
+; addi t6, s0, 0x30
+; .byte 0x87, 0x82, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x81, 0xcd
+; .byte 0x57, 0x84, 0x11, 0x26
+; .byte 0x57, 0xb5, 0x1f, 0x2e
+; .byte 0x57, 0x86, 0xa2, 0x26
+; .byte 0x57, 0x07, 0x86, 0x2a
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x07, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4, v2: i32x4):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vand.vv v8,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
+; vnot.v v10,v1 #avl=4, #vtype=(e32, m1, ta, ma)
+; vand.vv v12,v10,v5 #avl=4, #vtype=(e32, m1, ta, ma)
+; vor.vv v14,v8,v12 #avl=4, #vtype=(e32, m1, ta, ma)
+; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x30 +; .byte 0x87, 0x82, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x84, 0x11, 0x26 +; .byte 0x57, 0xb5, 0x1f, 0x2e +; .byte 0x57, 0x86, 0xa2, 0x26 +; .byte 0x57, 0x07, 0x86, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x07, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bitselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8, v2: i16x8): + v3 = bitselect v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v8,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vnot.v v10,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vand.vv v12,v10,v5 #avl=8, #vtype=(e16, m1, ta, ma) +; vor.vv v14,v8,v12 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x30 +; .byte 0x87, 0x82, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x84, 0x11, 0x26 +; .byte 0x57, 0xb5, 0x1f, 0x2e +; .byte 0x57, 0x86, 0xa2, 0x26 +; .byte 0x57, 0x07, 0x86, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x07, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16, v2: i8x16): + v3 = bitselect v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v8,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vnot.v v10,v1 #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v12,v10,v5 #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vv v14,v8,v12 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x30 +; .byte 0x87, 0x82, 0x0f, 0x02 +; .byte 0x57, 0x84, 0x11, 0x26 +; .byte 0x57, 0xb5, 0x1f, 0x2e +; .byte 0x57, 0x86, 0xa2, 0x26 +; .byte 0x57, 0x07, 0x86, 0x2a +; .byte 0x27, 0x07, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect.clif index 51e075e7c0ad..f1204bfd68d9 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitselect.clif @@ -4,6 +4,17 @@ target aarch64 target s390x target x86_64 has_sse3 
has_ssse3 has_sse41
 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
+target riscv64 has_v
+
+function %bitselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2, v2: i64x2):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+; run: %bitselect_i64x2(0x00000000000000000000000000000000, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x00000000000000000000000000000000
+; run: %bitselect_i64x2(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
+; run: %bitselect_i64x2(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
+; run: %bitselect_i64x2(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000

 function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4, v2: i32x4):
@@ -15,6 +26,16 @@ block0(v0: i32x4, v1: i32x4, v2: i32x4):
 ; run: %bitselect_i32x4(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
 ; run: %bitselect_i32x4(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000

+function %bitselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8, v2: i16x8):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+; run: %bitselect_i16x8(0x00000000000000000000000000000000, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x00000000000000000000000000000000
+; run: %bitselect_i16x8(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
+; run: %bitselect_i16x8(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
+; run: %bitselect_i16x8(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
+
 function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16, v2: i8x16):
     v3 = bitselect v0, v1, v2

From adcc26c3e53c94be74a76bf8e75e5ad9f551cc85 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sat, 20 May 2023 13:02:47 +0100
Subject: [PATCH 2/8] riscv64: Add SIMD `bnot`

---
 build.rs                                     |   1 -
 cranelift/codegen/src/isa/riscv64/lower.isle |   5 +-
 cranelift/codegen/src/isle_prelude.rs        |   9 +
 cranelift/codegen/src/prelude.isle           |   4 +
 .../filetests/isa/riscv64/simd-bnot.clif     | 159 ++++++++++++++++++
 .../filetests/runtests/simd-bnot.clif        |  38 +++++
 6 files changed, 214 insertions(+), 2 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-bnot.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-bnot.clif

diff --git a/build.rs b/build.rs
index 5f9b18ebf803..a4409b07e4bf 100644
--- a/build.rs
+++ b/build.rs
@@ -212,7 +212,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             "load_splat_out_of_bounds",
             "simd_align",
             "simd_bit_shift",
-            "simd_bitwise",
             "simd_boolean",
             "simd_conversions",
             "simd_f32x4",
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index b6be299c2ad9..56b046baa34e 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++
b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -399,9 +399,12 @@
     (rv_vxor_vv x y (unmasked) ty))

 ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(rule (lower (has_type ty (bnot x)))
+(rule 0 (lower (has_type (ty_scalar ty) (bnot x)))
   (gen_bnot ty x))

+(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (bnot x)))
+  (rv_vnot_v x (unmasked) ty))
+
 ;;;; Rules for `bit_reverse` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 (rule (lower (has_type (fits_in_64 (ty_int ty)) (bitrev x)))
   (lower_bit_reverse x ty))
diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs
index b5e3aaee28db..ff458ba69641 100644
--- a/cranelift/codegen/src/isle_prelude.rs
+++ b/cranelift/codegen/src/isle_prelude.rs
@@ -373,6 +373,15 @@ macro_rules! isle_common_prelude_methods {
             ty.is_int().then(|| ty)
         }

+        #[inline]
+        fn ty_scalar(&mut self, ty: Type) -> Option<Type> {
+            if ty.lane_count() == 1 {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
         #[inline]
         fn ty_scalar_float(&mut self, ty: Type) -> Option<Type> {
             match ty {
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index 6b2baf1a5aba..0d336cb21399 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -383,6 +383,10 @@
 (decl ty_int (Type) Type)
 (extern extractor ty_int ty_int)

+;; An extractor that only matches scalar types: float, int, or ref types.
+(decl ty_scalar (Type) Type)
+(extern extractor ty_scalar ty_scalar)
+
 ;; An extractor that only matches scalar floating-point types--F32 or F64.
 (decl ty_scalar_float (Type) Type)
 (extern extractor ty_scalar_float ty_scalar_float)
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bnot.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bnot.clif
new file mode 100644
index 000000000000..58d3364d2775
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-bnot.clif
@@ -0,0 +1,159 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+
+function %bnot_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = bnot v0
+    return v1
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vnot.v v4,v1 #avl=16, #vtype=(e8, m1, ta, ma)
+; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0xb2, 0x1f, 0x2e
+; .byte 0x27, 0x02, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bnot_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = bnot v0
+    return v1
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vnot.v v4,v1 #avl=8, #vtype=(e16, m1, ta, ma)
+; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x84, 0xcc
+; .byte 0x57, 0xb2, 0x1f, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x02, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function
%bnot_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = bnot v0
+    return v1
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vnot.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma)
+; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x02, 0xcd
+; .byte 0x57, 0xb2, 0x1f, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x02, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bnot_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = bnot v0
+    return v1
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vnot.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma)
+; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x81, 0xcd
+; .byte 0x57, 0xb2, 0x1f, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x02, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-bnot.clif b/cranelift/filetests/filetests/runtests/simd-bnot.clif
new file mode 100644
index 000000000000..866f1eaa9370
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-bnot.clif
@@ -0,0 +1,38 @@
+test interpret
+test run
+target aarch64
+target s390x
+target x86_64 has_sse41=false
+set enable_simd
+target x86_64
+target x86_64 skylake
+target riscv64 has_v
+
+
+function %bnot_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = bnot v0
+    return v1
+}
+; run: %bnot_i8x16(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_i8x16(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_i8x16(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_i8x16(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
+
+function %bnot_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = bnot v0
+    return v1
+}
+
+function %bnot_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = bnot v0
+    return v1
+}
+
+function %bnot_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = bnot v0
+    return v1
+}

From a1cb4ae7441a1bdaade7fa1d866b63516a3c746b Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sat, 20 May 2023 19:30:51 +0100
Subject: [PATCH 3/8] riscv64: Add `bxor` splat rules

---
 .../codegen/src/isa/riscv64/inst/vector.rs   |   3 +-
 .../codegen/src/isa/riscv64/inst_vector.isle |   6 +
 cranelift/codegen/src/isa/riscv64/lower.isle |  30 +-
 .../filetests/isa/riscv64/simd-bxor.clif     | 328 +++++++++++++++++-
 .../filetests/runtests/simd-bnot.clif        |  12 +
 .../filetests/runtests/simd-bxor-splat.clif  | 102 ++++++
 6 files changed, 466 insertions(+), 15 deletions(-)
 create mode 100644 cranelift/filetests/filetests/runtests/simd-bxor-splat.clif

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index 874b6a015323..c721653372d3 100644
---
a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -277,7 +277,7 @@ impl VecAluOpRRR { VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, VecAluOpRRR::VandVV => 0b001001, VecAluOpRRR::VorVV => 0b001010, - VecAluOpRRR::VxorVV => 0b001011, + VecAluOpRRR::VxorVV | VecAluOpRRR::VxorVX => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, VecAluOpRRR::VfrsubVF => 0b100111, VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 0b010111, @@ -301,6 +301,7 @@ impl VecAluOpRRR { VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VxorVX | VecAluOpRRR::VslidedownVX | VecAluOpRRR::VmergeVXM => VecOpCategory::OPIVX, VecAluOpRRR::VfaddVV diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 41b09e469970..aa968cf0f73e 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -109,6 +109,7 @@ (VaddVX) (VsubVX) (VrsubVX) + (VxorVX) (VslidedownVX) (VfaddVF) (VfsubVF) @@ -330,6 +331,11 @@ (rule (rv_vxor_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vxor.vx` instruction. +(decl rv_vxor_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vxor_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VxorVX) vs2 vs1 mask vstate)) + ;; Helper for emitting the `vxor.vi` instruction. ;; Unlike other `vi` instructions the immediate is zero extended. (decl rv_vxor_vi (Reg Imm5 VecOpMasking VState) Reg) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 56b046baa34e..c432b26cfb78 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -376,28 +376,38 @@ (rv_vor_vv x y (unmasked) ty)) ;;;; Rules for `xor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) +(rule 0 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) (rv_xor x y)) ;; Special cases for when one operand is an immediate that fits in 12 bits. 
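+;; (`xori` sign-extends its 12-bit immediate, so these rules cover
+;; constants in the range -2048..=2047; anything wider falls back to
+;; the register-register `xor` rule above.)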
-(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x (imm12_from_value y)))) +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x (imm12_from_value y)))) (rv_xori x y)) -(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor (imm12_from_value x) y))) +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bxor (imm12_from_value x) y))) (rv_xori y x)) -(rule (lower (has_type $I128 (bxor x y))) +(rule 3 (lower (has_type $I128 (bxor x y))) (lower_b128_binary (AluOPRRR.Xor) x y)) -(rule (lower (has_type $F32 (bxor x y))) - (lower_float_binary (AluOPRRR.Xor) x y $F32)) +(rule 4 (lower (has_type (ty_scalar_float ty) (bxor x y))) + (lower_float_binary (AluOPRRR.Xor) x y ty)) -(rule (lower (has_type $F64 (bxor x y))) - (lower_float_binary (AluOPRRR.Xor) x y $F64)) - -(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (bxor x y))) +(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (bxor x y))) (rv_vxor_vv x y (unmasked) ty)) +(rule 6 (lower (has_type (ty_vec_fits_in_register ty) (bxor x (splat y)))) + (rv_vxor_vx x y (unmasked) ty)) + +(rule 7 (lower (has_type (ty_vec_fits_in_register ty) (bxor (splat x) y))) + (rv_vxor_vx y x (unmasked) ty)) + +(rule 8 (lower (has_type (ty_vec_fits_in_register ty) (bxor x (replicated_imm5 y)))) + (rv_vxor_vi x y (unmasked) ty)) + +(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (bxor (replicated_imm5 x) y))) + (rv_vxor_vi y x (unmasked) ty)) + + ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_scalar ty) (bnot x))) (gen_bnot ty x)) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif index 0c8cc8f1ad4c..d4acbade9772 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif @@ -23,7 +23,7 @@ block0(v0: i8x16, v1: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -63,7 +63,7 @@ block0(v0: i16x8, v1: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -105,7 +105,7 @@ block0(v0: i32x4, v1: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -147,7 +147,7 @@ block0(v0: i64x2, v1: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -169,3 +169,323 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %bxor_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vi v4,v1,5 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0xb2, 0x12, 0x2e +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v 
v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vi v4,v1,-16 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x18, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vi v4,v1,15 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb2, 0x17, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = bxor v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vi v4,v1,-5 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb2, 0x1d, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x42, 0x15, 0x2e +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; 
addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x84, 0xcc
+; .byte 0xd7, 0x42, 0x15, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0xa7, 0x82, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bxor_splat_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = splat.i32x4 v1
+    v3 = bxor v0, v2
+    return v3
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vxor.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma)
+; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x02, 0xcd
+; .byte 0xd7, 0x42, 0x15, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0xa7, 0x82, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bxor_splat_i64x2(i64x2, i64) -> i64x2 {
+block0(v0: i64x2, v1: i64):
+    v2 = splat.i64x2 v1
+    v3 = bxor v2, v0
+    return v3
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vxor.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma)
+; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x81, 0xcd
+; .byte 0xd7, 0x42, 0x15, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0xa7, 0x82, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-bnot.clif b/cranelift/filetests/filetests/runtests/simd-bnot.clif
index 866f1eaa9370..b2c0446f6fbe 100644
--- a/cranelift/filetests/filetests/runtests/simd-bnot.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bnot.clif
@@ -24,15 +24,27 @@ block0(v0: i16x8):
     v1 = bnot v0
     return v1
 }
+; run: %bnot_i16x8(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_i16x8(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_i16x8(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_i16x8(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee

 function %bnot_i32x4(i32x4) -> i32x4 {
 block0(v0: i32x4):
     v1 = bnot v0
     return v1
 }
+; run: %bnot_i32x4(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_i32x4(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_i32x4(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_i32x4(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee

 function %bnot_i64x2(i64x2) -> i64x2 {
 block0(v0: i64x2):
     v1 = bnot v0
     return v1
 }
+; run: %bnot_i64x2(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_i64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_i64x2(0x01010011000011110000000011111111)
== 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_i64x2(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
diff --git a/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif b/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif
new file mode 100644
index 000000000000..6e232b9d81ad
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif
@@ -0,0 +1,102 @@
+test interpret
+test run
+target aarch64
+target s390x
+target x86_64 has_sse41=false
+set enable_simd
+target x86_64
+target x86_64 skylake
+target riscv64 has_v
+
+function %bxor_splat_const_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 5
+    v2 = splat.i8x16 v1
+    v3 = bxor v0, v2
+    return v3
+}
+; run: %bxor_splat_const_i8x16(0x00000000000000000000000000000000) == 0x05050505050505050505050505050505
+; run: %bxor_splat_const_i8x16(0x11111111111111111111111111111111) == 0x14141414141414141414141414141414
+; run: %bxor_splat_const_i8x16(0x01010011000011110000000011111111) == 0x04040514050514140505050514141414
+; run: %bxor_splat_const_i8x16(0x00000000000000001111111111111111) == 0x05050505050505051414141414141414
+
+function %bxor_splat_const_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i16 -16
+    v2 = splat.i16x8 v1
+    v3 = bxor v0, v2
+    return v3
+}
+; run: %bxor_splat_const_i16x8(0x00000000000000000000000000000000) == 0xfff0fff0fff0fff0fff0fff0fff0fff0
+; run: %bxor_splat_const_i16x8(0x11111111111111111111111111111111) == 0xeee1eee1eee1eee1eee1eee1eee1eee1
+; run: %bxor_splat_const_i16x8(0x01010011000011110000000011111111) == 0xfef1ffe1fff0eee1fff0fff0eee1eee1
+; run: %bxor_splat_const_i16x8(0x00000000000000001111111111111111) == 0xfff0fff0fff0fff0eee1eee1eee1eee1
+
+function %bxor_splat_const_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 15
+    v2 = splat.i32x4 v1
+    v3 = bxor v0, v2
+    return v3
+}
+; run: %bxor_splat_const_i32x4(0x00000000000000000000000000000000) == 0x0000000f0000000f0000000f0000000f
+; run: %bxor_splat_const_i32x4(0x11111111111111111111111111111111) == 0x1111111e1111111e1111111e1111111e
+; run: %bxor_splat_const_i32x4(0x01010011000011110000000011111111) == 0x0101001e0000111e0000000f1111111e
+; run: %bxor_splat_const_i32x4(0x00000000000000001111111111111111) == 0x0000000f0000000f1111111e1111111e
+
+function %bxor_splat_const_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i64 -5
+    v2 = splat.i64x2 v1
+    v3 = bxor v2, v0
+    return v3
+}
+; run: %bxor_splat_const_i64x2(0x00000000000000000000000000000000) == 0xfffffffffffffffbfffffffffffffffb
+; run: %bxor_splat_const_i64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeaeeeeeeeeeeeeeeea
+; run: %bxor_splat_const_i64x2(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeaffffffffeeeeeeea
+; run: %bxor_splat_const_i64x2(0x00000000000000001111111111111111) == 0xfffffffffffffffbeeeeeeeeeeeeeeea
+
+
+function %bxor_splat_i8x16(i8x16, i8) -> i8x16 {
+block0(v0: i8x16, v1: i8):
+    v2 = splat.i8x16 v1
+    v3 = bxor v0, v2
+    return v3
+}
+; run: %bxor_splat_i8x16(0x00000000000000000000000000000000, 0x01) == 0x01010101010101010101010101010101
+; run: %bxor_splat_i8x16(0x11111111111111111111111111111111, 0xff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bxor_splat_i8x16(0x01010011000011110000000011111111, 0x8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f
+; run: %bxor_splat_i8x16(0x00000000000000001111111111111111, 0xbe) == 0xbebebebebebebebeafafafafafafafaf
+
+function %bxor_splat_i16x8(i16x8, i16) -> i16x8 {
+block0(v0: i16x8, v1: i16):
+    v2 = splat.i16x8 v1
+    v3 =
bxor v0, v2 + return v3 +} +; run: %bxor_splat_i16x8(0x00000000000000000000000000000000, 0x0001) == 0x00010001000100010001000100010001 +; run: %bxor_splat_i16x8(0x11111111111111111111111111111111, 0xffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_i16x8(0x01010011000011110000000011111111, 0x8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_i16x8(0x00000000000000001111111111111111, 0xc0fe) == 0xc0fec0fec0fec0fed1efd1efd1efd1ef + +function %bxor_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = bxor v0, v2 + return v3 +} +; run: %bxor_splat_i32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000001000000010000000100000001 +; run: %bxor_splat_i32x4(0x11111111111111111111111111111111, 0xffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_i32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_i32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff + +function %bxor_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = bxor v2, v0 + return v3 +} +; run: %bxor_splat_i64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000010000000000000001 +; run: %bxor_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff From 5c4994433f4c90e462806b5daa3f3b6c2c5f4e34 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 19:44:33 +0100 Subject: [PATCH 4/8] riscv64: Add SIMD `bor` optimizations --- .../codegen/src/isa/riscv64/inst/vector.rs | 6 +- .../codegen/src/isa/riscv64/inst_vector.isle | 13 +- cranelift/codegen/src/isa/riscv64/lower.isle | 46 +-- .../filetests/isa/riscv64/simd-bor.clif | 320 ++++++++++++++++++ .../filetests/runtests/simd-bor-splat.clif | 102 ++++++ 5 files changed, 466 insertions(+), 21 deletions(-) create mode 100644 cranelift/filetests/filetests/runtests/simd-bor-splat.clif diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index c721653372d3..7b1329e71e1e 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -276,7 +276,7 @@ impl VecAluOpRRR { VecAluOpRRR::VmulhVV => 0b100111, VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, VecAluOpRRR::VandVV => 0b001001, - VecAluOpRRR::VorVV => 0b001010, + VecAluOpRRR::VorVV | VecAluOpRRR::VorVX => 0b001010, VecAluOpRRR::VxorVV | VecAluOpRRR::VxorVX => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, VecAluOpRRR::VfrsubVF => 0b100111, @@ -301,6 +301,7 @@ impl VecAluOpRRR { VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VorVX | VecAluOpRRR::VxorVX | VecAluOpRRR::VslidedownVX | VecAluOpRRR::VmergeVXM => VecOpCategory::OPIVX, @@ -358,6 +359,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, + VecAluOpRRImm5::VorVI => 0b001010, VecAluOpRRImm5::VxorVI => 0b001011, VecAluOpRRImm5::VslidedownVI => 0b001111, VecAluOpRRImm5::VmergeVIM => 0b010111, @@ -368,6 +370,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | 
VecAluOpRRImm5::VorVI | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VslidedownVI | VecAluOpRRImm5::VmergeVIM => VecOpCategory::OPIVI, @@ -379,6 +382,7 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VslidedownVI => true, VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VorVI | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VmergeVIM => false, } diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index aa968cf0f73e..976add60a4e6 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -109,6 +109,7 @@ (VaddVX) (VsubVX) (VrsubVX) + (VorVX) (VxorVX) (VslidedownVX) (VfaddVF) @@ -126,6 +127,7 @@ ;; Regular VI Opcodes (VaddVI) (VrsubVI) + (VorVI) (VxorVI) (VslidedownVI) (VmergeVIM) @@ -326,6 +328,16 @@ (rule (rv_vor_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vor.vx` instruction. +(decl rv_vor_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vor_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VorVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vor.vi` instruction. +(decl rv_vor_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vor_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VorVI) vs2 imm mask vstate)) + ;; Helper for emitting the `vxor.vv` instruction. (decl rv_vxor_vv (Reg Reg VecOpMasking VState) Reg) (rule (rv_vxor_vv vs2 vs1 mask vstate) @@ -337,7 +349,6 @@ (vec_alu_rrr (VecAluOpRRR.VxorVX) vs2 vs1 mask vstate)) ;; Helper for emitting the `vxor.vi` instruction. -;; Unlike other `vi` instructions the immediate is zero extended. (decl rv_vxor_vi (Reg Imm5 VecOpMasking VState) Reg) (rule (rv_vxor_vi vs2 imm mask vstate) (vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate)) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index c432b26cfb78..23e664bd2f61 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -330,51 +330,59 @@ ;;;; Rules for `or` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (ty_int ty) (bor x y))) +(rule 0 (lower (has_type (ty_int ty) (bor x y))) (gen_or ty x y)) ;; Special cases for when one operand is an immediate that fits in 12 bits. -(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (imm12_from_value y)))) +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (imm12_from_value y)))) (rv_ori x y)) -(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bor (imm12_from_value x) y))) +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bor (imm12_from_value x) y))) (rv_ori y x)) -(rule (lower (has_type $F32 (bor x y))) - (lower_float_binary (AluOPRRR.Or) x y $F32)) - -(rule (lower (has_type $F64 (bor x y))) - (lower_float_binary (AluOPRRR.Or) x y $F64)) +(rule 3 (lower (has_type (ty_scalar_float ty) (bor x y))) + (lower_float_binary (AluOPRRR.Or) x y ty)) ;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced ;; by Cranelift's `bor_not` instruction that is legalized into the simpler ;; forms early on. 
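+;; (Zbb's `orn rd, rs1, rs2` computes `rs1 | ~rs2`, so the fused form
+;; lowers to a single instruction instead of a separate not-then-or.)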
-(rule 3 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (bnot y)))) +(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (bnot y)))) (if-let $true (has_zbb)) (rv_orn x y)) -(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (bor (bnot y) x))) +(rule 5 (lower (has_type (fits_in_64 (ty_int ty)) (bor (bnot y) x))) (if-let $true (has_zbb)) (rv_orn x y)) -(rule 5 (lower (has_type $I128 (bor x (bnot y)))) +(rule 6 (lower (has_type $I128 (bor x (bnot y)))) (if-let $true (has_zbb)) - (let - ((low Reg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) + (let ((low Reg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) (value_regs low high))) -(rule 6 (lower (has_type $I128 (bor (bnot y) x))) +(rule 7 (lower (has_type $I128 (bor (bnot y) x))) (if-let $true (has_zbb)) - (let - ((low Reg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) + (let ((low Reg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) (value_regs low high))) -(rule 7 (lower (has_type (ty_vec_fits_in_register ty) (bor x y))) +(rule 8 (lower (has_type (ty_vec_fits_in_register ty) (bor x y))) (rv_vor_vv x y (unmasked) ty)) +(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (bor x (splat y)))) + (rv_vor_vx x y (unmasked) ty)) + +(rule 10 (lower (has_type (ty_vec_fits_in_register ty) (bor (splat x) y))) + (rv_vor_vx y x (unmasked) ty)) + +(rule 11 (lower (has_type (ty_vec_fits_in_register ty) (bor x (replicated_imm5 y)))) + (rv_vor_vi x y (unmasked) ty)) + +(rule 12 (lower (has_type (ty_vec_fits_in_register ty) (bor (replicated_imm5 x) y))) + (rv_vor_vi y x (unmasked) ty)) + + ;;;; Rules for `xor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) (rv_xor x y)) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif index 7f8beb629f50..e556b8a554dc 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif @@ -169,3 +169,323 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %bor_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vi v4,v1,5 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0xb2, 0x12, 0x2a +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vi v4,v1,-16 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 
+; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x18, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vi v4,v1,15 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb2, 0x17, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = bor v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vi v4,v1,-5 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb2, 0x1d, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x42, 0x15, 0x2a +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 
0xcc +; .byte 0xd7, 0x42, 0x15, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = bor v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-bor-splat.clif b/cranelift/filetests/filetests/runtests/simd-bor-splat.clif new file mode 100644 index 000000000000..29b1cfd33140 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-bor-splat.clif @@ -0,0 +1,102 @@ +test interpret +test run +target aarch64 +target s390x +target x86_64 has_sse41=false +set enable_simd +target x86_64 +target x86_64 skylake +target riscv64 has_v + +function %bor_splat_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_const_i8x16(0x00000000000000000000000000000000) == 0x05050505050505050505050505050505 +; run: %bor_splat_const_i8x16(0x11111111111111111111111111111111) == 0x15151515151515151515151515151515 +; run: %bor_splat_const_i8x16(0x01010011000011110000000011111111) == 0x05050515050515150505050515151515 +; run: %bor_splat_const_i8x16(0x00000000000000001111111111111111) == 0x05050505050505051515151515151515 + +function %bor_splat_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_const_i16x8(0x00000000000000000000000000000000) == 0xfff0fff0fff0fff0fff0fff0fff0fff0 +; run: %bor_splat_const_i16x8(0x11111111111111111111111111111111) == 0xfff1fff1fff1fff1fff1fff1fff1fff1 +; run: %bor_splat_const_i16x8(0x01010011000011110000000011111111) == 0xfff1fff1fff0fff1fff0fff0fff1fff1 +; run: %bor_splat_const_i16x8(0x00000000000000001111111111111111) == 0xfff0fff0fff0fff0fff1fff1fff1fff1 + +function %bor_splat_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = bor v0, v2 
+ return v3 +} +; run: %bor_splat_const_i32x4(0x00000000000000000000000000000000) == 0x0000000f0000000f0000000f0000000f +; run: %bor_splat_const_i32x4(0x11111111111111111111111111111111) == 0x1111111f1111111f1111111f1111111f +; run: %bor_splat_const_i32x4(0x01010011000011110000000011111111) == 0x0101001f0000111f0000000f1111111f +; run: %bor_splat_const_i32x4(0x00000000000000001111111111111111) == 0x0000000f0000000f1111111f1111111f + +function %bor_splat_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = bor v2, v0 + return v3 +} +; run: %bor_splat_const_i64x2(0x00000000000000000000000000000000) == 0xfffffffffffffffbfffffffffffffffb +; run: %bor_splat_const_i64x2(0x11111111111111111111111111111111) == 0xfffffffffffffffbfffffffffffffffb +; run: %bor_splat_const_i64x2(0x01010011000011110000000011111111) == 0xfffffffffffffffbfffffffffffffffb +; run: %bor_splat_const_i64x2(0x00000000000000001111111111111111) == 0xfffffffffffffffbfffffffffffffffb + + +function %bor_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_i8x16(0x00000000000000000000000000000000, 0x01) == 0x01010101010101010101010101010101 +; run: %bor_splat_i8x16(0x11111111111111111111111111111111, 0xff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_i8x16(0x01010011000011110000000011111111, 0x8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_i8x16(0x00000000000000001111111111111111, 0xbe) == 0xbebebebebebebebebfbfbfbfbfbfbfbf + +function %bor_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_i16x8(0x00000000000000000000000000000000, 0x0001) == 0x00010001000100010001000100010001 +; run: %bor_splat_i16x8(0x11111111111111111111111111111111, 0xffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_i16x8(0x01010011000011110000000011111111, 0x8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_i16x8(0x00000000000000001111111111111111, 0xc0fe) == 0xc0fec0fec0fec0fed1ffd1ffd1ffd1ff + +function %bor_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_i32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000001000000010000000100000001 +; run: %bor_splat_i32x4(0x11111111111111111111111111111111, 0xffffffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_i32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_i32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff + +function %bor_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = bor v2, v0 + return v3 +} +; run: %bor_splat_i64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000010000000000000001 +; run: %bor_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff From 9ca616c8796a4051a9ea33bc72244899097f82c3 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 19:59:25 +0100 Subject: [PATCH 5/8] riscv64: Add SIMD `band` splat rules --- .../codegen/src/isa/riscv64/inst/vector.rs | 6 
+- .../codegen/src/isa/riscv64/inst_vector.isle | 12 + cranelift/codegen/src/isa/riscv64/lower.isle | 46 +-- .../filetests/isa/riscv64/simd-band.clif | 328 +++++++++++++++++- .../filetests/runtests/simd-band-splat.clif | 102 ++++++ 5 files changed, 469 insertions(+), 25 deletions(-) create mode 100644 cranelift/filetests/filetests/runtests/simd-band-splat.clif diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index 7b1329e71e1e..fae3d66a4698 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -275,7 +275,7 @@ impl VecAluOpRRR { VecAluOpRRR::VmulVV => 0b100101, VecAluOpRRR::VmulhVV => 0b100111, VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, - VecAluOpRRR::VandVV => 0b001001, + VecAluOpRRR::VandVV | VecAluOpRRR::VandVX => 0b001001, VecAluOpRRR::VorVV | VecAluOpRRR::VorVX => 0b001010, VecAluOpRRR::VxorVV | VecAluOpRRR::VxorVX => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, @@ -301,6 +301,7 @@ impl VecAluOpRRR { VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VandVX | VecAluOpRRR::VorVX | VecAluOpRRR::VxorVX | VecAluOpRRR::VslidedownVX @@ -359,6 +360,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, + VecAluOpRRImm5::VandVI => 0b001001, VecAluOpRRImm5::VorVI => 0b001010, VecAluOpRRImm5::VxorVI => 0b001011, VecAluOpRRImm5::VslidedownVI => 0b001111, @@ -370,6 +372,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VandVI | VecAluOpRRImm5::VorVI | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VslidedownVI @@ -382,6 +385,7 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VslidedownVI => true, VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VandVI | VecAluOpRRImm5::VorVI | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VmergeVIM => false, diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 976add60a4e6..0a6d8dc599b6 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -109,6 +109,7 @@ (VaddVX) (VsubVX) (VrsubVX) + (VandVX) (VorVX) (VxorVX) (VslidedownVX) @@ -127,6 +128,7 @@ ;; Regular VI Opcodes (VaddVI) (VrsubVI) + (VandVI) (VorVI) (VxorVI) (VslidedownVI) @@ -323,6 +325,16 @@ (rule (rv_vand_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vand.vx` instruction. +(decl rv_vand_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vand_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VandVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vand.vi` instruction. +(decl rv_vand_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vand_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VandVI) vs2 imm mask vstate)) + ;; Helper for emitting the `vor.vv` instruction. 
(decl rv_vor_vv (Reg Reg VecOpMasking VState) Reg) (rule (rv_vor_vv vs2 vs1 mask vstate) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 23e664bd2f61..d53bee81668c 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -282,52 +282,58 @@ (rv_remu x y))) ;;;; Rules for `and` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (ty_int ty) (band x y))) +(rule 0 (lower (has_type (ty_int ty) (band x y))) (gen_and ty x y)) ;; Special cases for when one operand is an immediate that fits in 12 bits. -(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (band x (imm12_from_value y)))) +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (band x (imm12_from_value y)))) (rv_andi x y)) -(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (band (imm12_from_value x) y))) +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (band (imm12_from_value x) y))) (rv_andi y x)) -(rule (lower (has_type $F32 (band x y))) - (lower_float_binary (AluOPRRR.And) x y $F32)) - -(rule (lower (has_type $F64 (band x y))) - (lower_float_binary (AluOPRRR.And) x y $F64)) +(rule 3 (lower (has_type (ty_scalar_float ty) (band x y))) + (lower_float_binary (AluOPRRR.And) x y ty)) ;; Specialized lowerings for `(band x (bnot y))` which is additionally produced ;; by Cranelift's `band_not` instruction that is legalized into the simpler ;; forms early on. -(rule 3 (lower (has_type (fits_in_64 (ty_int ty)) (band x (bnot y)))) +(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (band x (bnot y)))) (if-let $true (has_zbb)) (rv_andn x y)) -(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (band (bnot y) x))) +(rule 5 (lower (has_type (fits_in_64 (ty_int ty)) (band (bnot y) x))) (if-let $true (has_zbb)) (rv_andn x y)) -(rule 5 (lower (has_type $I128 (band x (bnot y)))) +(rule 6 (lower (has_type $I128 (band x (bnot y)))) (if-let $true (has_zbb)) - (let - ((low Reg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) + (let ((low Reg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) (value_regs low high))) -(rule 6 (lower (has_type $I128 (band (bnot y) x))) +(rule 7 (lower (has_type $I128 (band (bnot y) x))) (if-let $true (has_zbb)) - (let - ((low Reg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) + (let ((low Reg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) (value_regs low high))) - -(rule 7 (lower (has_type (ty_vec_fits_in_register ty) (band x y))) +(rule 8 (lower (has_type (ty_vec_fits_in_register ty) (band x y))) (rv_vand_vv x y (unmasked) ty)) +(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (band x (splat y)))) + (rv_vand_vx x y (unmasked) ty)) + +(rule 10 (lower (has_type (ty_vec_fits_in_register ty) (band (splat x) y))) + (rv_vand_vx y x (unmasked) ty)) + +(rule 11 (lower (has_type (ty_vec_fits_in_register ty) (band x (replicated_imm5 y)))) + (rv_vand_vi x y (unmasked) ty)) + +(rule 12 (lower (has_type (ty_vec_fits_in_register ty) (band (replicated_imm5 x) y))) + (rv_vand_vi y x (unmasked) ty)) + ;;;; Rules for `or` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int ty) (bor x y))) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif 
b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif index c4c6a3530482..a0b99c569b77 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif @@ -23,7 +23,7 @@ block0(v0: i8x16, v1: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -63,7 +63,7 @@ block0(v0: i16x8, v1: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -105,7 +105,7 @@ block0(v0: i32x4, v1: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -147,7 +147,7 @@ block0(v0: i64x2, v1: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -169,3 +169,323 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %band_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vi v4,v1,5 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0xb2, 0x12, 0x26 +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vi v4,v1,-16 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x18, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vi v4,v1,15 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb2, 0x17, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = band v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; 
sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vi v4,v1,-5 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb2, 0x1d, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x42, 0x15, 0x26 +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x42, 0x15, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = band v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: 
+; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-band-splat.clif b/cranelift/filetests/filetests/runtests/simd-band-splat.clif new file mode 100644 index 000000000000..ede40bb89431 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-band-splat.clif @@ -0,0 +1,102 @@ +test interpret +test run +target aarch64 +target s390x +target x86_64 has_sse41=false +set enable_simd +target x86_64 +target x86_64 skylake +target riscv64 has_v + +function %band_splat_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_const_i8x16(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 +; run: %band_splat_const_i8x16(0x11111111111111111111111111111111) == 0x01010101010101010101010101010101 +; run: %band_splat_const_i8x16(0x01010011000011110000000011111111) == 0x01010001000001010000000001010101 +; run: %band_splat_const_i8x16(0x00000000000000001111111111111111) == 0x00000000000000000101010101010101 + +function %band_splat_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_const_i16x8(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 +; run: %band_splat_const_i16x8(0x11111111111111111111111111111111) == 0x11101110111011101110111011101110 +; run: %band_splat_const_i16x8(0x01010011000011110000000011111111) == 0x01000010000011100000000011101110 +; run: %band_splat_const_i16x8(0x00000000000000001111111111111111) == 0x00000000000000001110111011101110 + +function %band_splat_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_const_i32x4(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 +; run: %band_splat_const_i32x4(0x11111111111111111111111111111111) == 0x00000001000000010000000100000001 +; run: %band_splat_const_i32x4(0x01010011000011110000000011111111) == 0x00000001000000010000000000000001 +; run: %band_splat_const_i32x4(0x00000000000000001111111111111111) == 0x00000000000000000000000100000001 + +function %band_splat_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = band v2, v0 + return v3 +} +; run: %band_splat_const_i64x2(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 +; run: %band_splat_const_i64x2(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 +; run: %band_splat_const_i64x2(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 +; run: %band_splat_const_i64x2(0x00000000000000001111111111111111) == 0x00000000000000001111111111111111 + + +function %band_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_i8x16(0x00000000000000000000000000000000, 0x01) == 0x00000000000000000000000000000000 +; run: %band_splat_i8x16(0x11111111111111111111111111111111, 0xff) == 0x11111111111111111111111111111111 +; run: 
%band_splat_i8x16(0x01010011000011110000000011111111, 0x8e) == 0x00000000000000000000000000000000 +; run: %band_splat_i8x16(0x00000000000000001111111111111111, 0xbe) == 0x00000000000000001010101010101010 + +function %band_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_i16x8(0x00000000000000000000000000000000, 0x0001) == 0x00000000000000000000000000000000 +; run: %band_splat_i16x8(0x11111111111111111111111111111111, 0xffff) == 0x11111111111111111111111111111111 +; run: %band_splat_i16x8(0x01010011000011110000000011111111, 0x8e8e) == 0x00000000000000000000000000000000 +; run: %band_splat_i16x8(0x00000000000000001111111111111111, 0xc0fe) == 0x00000000000000000010001000100010 + +function %band_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_i32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000000000000000000000000000000 +; run: %band_splat_i32x4(0x11111111111111111111111111111111, 0xffffffff) == 0x11111111111111111111111111111111 +; run: %band_splat_i32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x00000000000000000000000000000000 +; run: %band_splat_i32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0x00000000000000000011000000110000 + +function %band_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = band v2, v0 + return v3 +} +; run: %band_splat_i64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000000000000000000000 +; run: %band_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0x11111111111111111111111111111111 +; run: %band_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x00000000000000000000000000000000 +; run: %band_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0x00000000000000000011000000110000 From 37c3011c0a318230115c54783e9f0e6d0a79cbcf Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 20:55:38 +0100 Subject: [PATCH 6/8] riscv64: Fix tests --- .../filetests/isa/riscv64/simd-band.clif | 24 +++++++------- .../filetests/isa/riscv64/simd-bxor.clif | 24 +++++++------- .../filetests/runtests/simd-bnot.clif | 32 +++++++++---------- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif index a0b99c569b77..3d3630e9087f 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif @@ -23,7 +23,7 @@ block0(v0: i8x16, v1: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -63,7 +63,7 @@ block0(v0: i16x8, v1: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -105,7 +105,7 @@ block0(v0: i32x4, v1: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -147,7 +147,7 @@ block0(v0: i64x2, v1: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -190,7 +190,7 @@ block0(v0: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -229,7 +229,7 @@ block0(v0: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ 
-270,7 +270,7 @@ block0(v0: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -311,7 +311,7 @@ block0(v0: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -351,7 +351,7 @@ block0(v0: i8x16, v1: i8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -389,7 +389,7 @@ block0(v0: i16x8, v1: i16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -429,7 +429,7 @@ block0(v0: i32x4, v1: i32): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -469,7 +469,7 @@ block0(v0: i64x2, v1: i64): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif index d4acbade9772..d0b7290ebbe1 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif @@ -23,7 +23,7 @@ block0(v0: i8x16, v1: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -63,7 +63,7 @@ block0(v0: i16x8, v1: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -105,7 +105,7 @@ block0(v0: i32x4, v1: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -147,7 +147,7 @@ block0(v0: i64x2, v1: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -190,7 +190,7 @@ block0(v0: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -229,7 +229,7 @@ block0(v0: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -270,7 +270,7 @@ block0(v0: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -311,7 +311,7 @@ block0(v0: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -351,7 +351,7 @@ block0(v0: i8x16, v1: i8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -389,7 +389,7 @@ block0(v0: i16x8, v1: i16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -429,7 +429,7 @@ block0(v0: i32x4, v1: i32): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -469,7 +469,7 @@ block0(v0: i64x2, v1: i64): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 diff --git a/cranelift/filetests/filetests/runtests/simd-bnot.clif b/cranelift/filetests/filetests/runtests/simd-bnot.clif index b2c0446f6fbe..92d13d17770c 100644 --- a/cranelift/filetests/filetests/runtests/simd-bnot.clif +++ b/cranelift/filetests/filetests/runtests/simd-bnot.clif @@ -14,37 +14,37 @@ block0(v0: i8x16): v1 = bnot v0 return v1 } -; run: %bnot_i8x16(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 -; run: %bnot_i8x16(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 -; run: %bnot_i8x16(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 -; run: 
%bnot_i8x16(0x00000000000000001111111111111111) == 0x11111111111111110000000000000000 +; run: %bnot_i8x16(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff +; run: %bnot_i8x16(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bnot_i8x16(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee +; run: %bnot_i8x16(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee function %bnot_i16x8(i16x8) -> i16x8 { block0(v0: i16x8): v1 = bnot v0 return v1 } -; run: %bnot_i16x8(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 -; run: %bnot_i16x8(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 -; run: %bnot_i16x8(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 -; run: %bnot_i16x8(0x00000000000000001111111111111111) == 0x11111111111111110000000000000000 +; run: %bnot_i16x8(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff +; run: %bnot_i16x8(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bnot_i16x8(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee +; run: %bnot_i16x8(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee function %bnot_i32x4(i32x4) -> i32x4 { block0(v0: i32x4): v1 = bnot v0 return v1 } -; run: %bnot_i32x4(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 -; run: %bnot_i32x4(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 -; run: %bnot_i32x4(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 -; run: %bnot_i32x4(0x00000000000000001111111111111111) == 0x11111111111111110000000000000000 +; run: %bnot_i32x4(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff +; run: %bnot_i32x4(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bnot_i32x4(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee +; run: %bnot_i32x4(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee function %bnot_i64x2(i64x2) -> i64x2 { block0(v0: i64x2): v1 = bnot v0 return v1 } -; run: %bnot_i64x2(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 -; run: %bnot_i64x2(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 -; run: %bnot_i64x2(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 -; run: %bnot_i64x2(0x00000000000000001111111111111111) == 0x11111111111111110000000000000000 +; run: %bnot_i64x2(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff +; run: %bnot_i64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bnot_i64x2(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee +; run: %bnot_i64x2(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee From 75d3e72349a5efa0e413e6020c8b454ceba54558 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 21:48:43 +0100 Subject: [PATCH 7/8] riscv64: Restrict `.vx` rules to integer arguments --- cranelift/codegen/src/isa/riscv64/lower.isle | 6 ++ .../filetests/isa/riscv64/simd-band.clif | 90 +++++++++++++++++++ .../filetests/isa/riscv64/simd-bor.clif | 90 +++++++++++++++++++ .../filetests/isa/riscv64/simd-bxor.clif | 90 +++++++++++++++++++ .../filetests/runtests/simd-band-splat.clif | 24 +++++ .../filetests/runtests/simd-band.clif | 19 ++++ 
.../filetests/runtests/simd-bnot.clif | 20 +++++ .../filetests/runtests/simd-bor-splat.clif | 25 ++++++ .../filetests/runtests/simd-bor.clif | 18 ++++ .../filetests/runtests/simd-bxor-splat.clif | 24 +++++ .../filetests/runtests/simd-bxor.clif | 18 ++++ 11 files changed, 424 insertions(+) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index d53bee81668c..732858c511e7 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -323,9 +323,11 @@ (rv_vand_vv x y (unmasked) ty)) (rule 9 (lower (has_type (ty_vec_fits_in_register ty) (band x (splat y)))) + (if (ty_vector_not_float ty)) (rv_vand_vx x y (unmasked) ty)) (rule 10 (lower (has_type (ty_vec_fits_in_register ty) (band (splat x) y))) + (if (ty_vector_not_float ty)) (rv_vand_vx y x (unmasked) ty)) (rule 11 (lower (has_type (ty_vec_fits_in_register ty) (band x (replicated_imm5 y)))) @@ -377,9 +379,11 @@ (rv_vor_vv x y (unmasked) ty)) (rule 9 (lower (has_type (ty_vec_fits_in_register ty) (bor x (splat y)))) + (if (ty_vector_not_float ty)) (rv_vor_vx x y (unmasked) ty)) (rule 10 (lower (has_type (ty_vec_fits_in_register ty) (bor (splat x) y))) + (if (ty_vector_not_float ty)) (rv_vor_vx y x (unmasked) ty)) (rule 11 (lower (has_type (ty_vec_fits_in_register ty) (bor x (replicated_imm5 y)))) @@ -410,9 +414,11 @@ (rv_vxor_vv x y (unmasked) ty)) (rule 6 (lower (has_type (ty_vec_fits_in_register ty) (bxor x (splat y)))) + (if (ty_vector_not_float ty)) (rv_vxor_vx x y (unmasked) ty)) (rule 7 (lower (has_type (ty_vec_fits_in_register ty) (bxor (splat x) y))) + (if (ty_vector_not_float ty)) (rv_vxor_vx y x (unmasked) ty)) (rule 8 (lower (has_type (ty_vec_fits_in_register ty) (bxor x (replicated_imm5 y)))) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif index 3d3630e9087f..20fd9c3cfa83 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif @@ -489,3 +489,93 @@ block0(v0: i64x2, v1: i64): ; addi sp, sp, 0x10 ; ret +function %band_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = band v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.w.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=4, #vtype=(e32, m1, ta, ma) +; vand.vv v7,v1,v7 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.w.x ft7, a0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = band v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.d.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=2, #vtype=(e64, m1, ta, ma) +; vand.vv v7,v1,v7 #avl=2, #vtype=(e64, m1, ta, ma) +; 
vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.d.x ft7, a0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif index e556b8a554dc..2204581bb542 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif @@ -489,3 +489,93 @@ block0(v0: i64x2, v1: i64): ; addi sp, sp, 0x10 ; ret +function %bor_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = bor v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.w.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=4, #vtype=(e32, m1, ta, ma) +; vor.vv v7,v1,v7 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.w.x ft7, a0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = bor v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.d.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=2, #vtype=(e64, m1, ta, ma) +; vor.vv v7,v1,v7 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.d.x ft7, a0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif index d0b7290ebbe1..0f3eb3f0a1a8 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif @@ -489,3 +489,93 @@ block0(v0: i64x2, v1: i64): ; addi sp, sp, 0x10 ; ret +function %bxor_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = bxor v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; 
fmv.w.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=4, #vtype=(e32, m1, ta, ma) +; vxor.vv v7,v1,v7 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.w.x ft7, a0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = bxor v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.d.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=2, #vtype=(e64, m1, ta, ma) +; vxor.vv v7,v1,v7 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.d.x ft7, a0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-band-splat.clif b/cranelift/filetests/filetests/runtests/simd-band-splat.clif index ede40bb89431..cc912d21b791 100644 --- a/cranelift/filetests/filetests/runtests/simd-band-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-band-splat.clif @@ -100,3 +100,27 @@ block0(v0: i64x2, v1: i64): ; run: %band_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0x11111111111111111111111111111111 ; run: %band_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x00000000000000000000000000000000 ; run: %band_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0x00000000000000000011000000110000 + +function %band_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = band v0, v3 + return v4 +} +; run: %band_splat_f32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000000000000000000000000000000 +; run: %band_splat_f32x4(0x11111111111111111111111111111111, 0xffffffff) == 0x11111111111111111111111111111111 +; run: %band_splat_f32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x00000000000000000000000000000000 +; run: %band_splat_f32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0x00000000000000000011000000110000 + +function %band_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = band v0, v3 + return v4 +} +; run: %band_splat_f64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000000000000000000000 +; run: %band_splat_f64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0x11111111111111111111111111111111 +; run: %band_splat_f64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x00000000000000000000000000000000 +; run: 
%band_splat_f64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0x00000000000000000011000000110000
diff --git a/cranelift/filetests/filetests/runtests/simd-band.clif b/cranelift/filetests/filetests/runtests/simd-band.clif
index 2bfe927f3e22..55defb7f945f 100644
--- a/cranelift/filetests/filetests/runtests/simd-band.clif
+++ b/cranelift/filetests/filetests/runtests/simd-band.clif
@@ -45,3 +45,22 @@ block0(v0:i64x2, v1:i64x2):
 ; run: %band_i64x2([0xFEDCBA9876543210 0x0123456789ABCDEF], [0x0123456789ABCDEF 0xFEDCBA9876543210]) == [0 0]
 ; run: %band_i64x2([0xFEEEFFFFFEEEFFFF 0xF1FFFEFEF1FFFEFE], [0xDFDBFFFFDFDBFFFF 0xCEFFEFEFCEFFEFEF]) == [0xDECAFFFFDECAFFFF 0xC0FFEEEEC0FFEEEE]
+
+function %band_f32x4(f32x4, f32x4) -> f32x4 {
+block0(v0:f32x4, v1:f32x4):
+    v2 = band v0, v1
+    return v2
+}
+; run: %band_f32x4(0xFEDCBA98_76543210_01234567_89ABCDEF, 0x01234567_89ABCDEF_FEDCBA98_76543210) == 0x00000000_00000000_00000000_00000000
+; run: %band_f32x4(0xFEEEFFFF_FEEEFFFF_F1FFFEFE_F1FFFEFE, 0xDFDBFFFF_DFDBFFFF_CEFFEFEF_CEFFEFEF) == 0xDECAFFFF_DECAFFFF_C0FFEEEE_C0FFEEEE
+
+
+
+function %band_f64x2(f64x2, f64x2) -> f64x2 {
+block0(v0:f64x2, v1:f64x2):
+    v2 = band v0, v1
+    return v2
+}
+
+; run: %band_f64x2(0xFEDCBA98_76543210_01234567_89ABCDEF, 0x01234567_89ABCDEF_FEDCBA98_76543210) == 0x00000000_00000000_00000000_00000000
+; run: %band_f64x2(0xFEEEFFFF_FEEEFFFF_F1FFFEFE_F1FFFEFE, 0xDFDBFFFF_DFDBFFFF_CEFFEFEF_CEFFEFEF) == 0xDECAFFFF_DECAFFFF_C0FFEEEE_C0FFEEEE
diff --git a/cranelift/filetests/filetests/runtests/simd-bnot.clif b/cranelift/filetests/filetests/runtests/simd-bnot.clif
index 92d13d17770c..2682d2e54674 100644
--- a/cranelift/filetests/filetests/runtests/simd-bnot.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bnot.clif
@@ -48,3 +48,23 @@ block0(v0: i64x2):
 ; run: %bnot_i64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
 ; run: %bnot_i64x2(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
 ; run: %bnot_i64x2(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
+
+function %bnot_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = bnot v0
+    return v1
+}
+; run: %bnot_f32x4(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_f32x4(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_f32x4(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_f32x4(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
+
+function %bnot_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = bnot v0
+    return v1
+}
+; run: %bnot_f64x2(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_f64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_f64x2(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_f64x2(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
diff --git a/cranelift/filetests/filetests/runtests/simd-bor-splat.clif b/cranelift/filetests/filetests/runtests/simd-bor-splat.clif
index 29b1cfd33140..7fe6b77d9f9b 100644
--- a/cranelift/filetests/filetests/runtests/simd-bor-splat.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bor-splat.clif
@@ -100,3 +100,25 @@ block0(v0: i64x2, v1: i64):
 ; run: %bor_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xffffffffffffffffffffffffffffffff
 ; run: %bor_splat_i64x2(0x01010011000011110000000011111111, 
0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f ; run: %bor_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff + + +function %bor_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = bor v0, v3 + return v4 +} +; run: %bor_splat_f32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000001000000010000000100000001 +; run: %bor_splat_f32x4(0x11111111111111111111111111111111, 0xffffffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_f32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_f32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff + +function %bor_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = bor v0, v3 + return v4 +} +; run: %bor_splat_f64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000010000000000000001 +; run: %bor_splat_f64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_f64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_f64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff diff --git a/cranelift/filetests/filetests/runtests/simd-bor.clif b/cranelift/filetests/filetests/runtests/simd-bor.clif index 15fe37de9204..eac53ea65152 100644 --- a/cranelift/filetests/filetests/runtests/simd-bor.clif +++ b/cranelift/filetests/filetests/runtests/simd-bor.clif @@ -43,3 +43,21 @@ block0(v0:i64x2, v1:i64x2): } ; run: %bor_i64x2([0xFEDCBA9876543210 0x0123456789ABCDEF], [0x0123456789ABCDEF 0xFEDCBA9876543210]) == [0xFFFFFFFFFFFFFFFF 0xFFFFFFFFFFFFFFFF] ; run: %bor_i64x2([0x8A8AAAAA8A8AAAAA 0x8A8AAAAA8A8AAAAA], [0x5440555554405555 0x5440555554405555]) == [0xDECAFFFFDECAFFFF 0xDECAFFFFDECAFFFF] + + +function %bor_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0:f32x4, v1:f32x4): + v2 = bor v0, v1 + return v2 +} +; run: %bor_f32x4(0xFEDCBA98_76543210_01234567_89ABCDEF, 0x01234567_89ABCDEF_FEDCBA98_76543210) == 0xFFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF +; run: %bor_f32x4(0x8A8AAAAA_8A8AAAAA_8A8AAAAA_8A8AAAAA, 0x54405555_54405555_54405555_54405555) == 0xDECAFFFF_DECAFFFF_DECAFFFF_DECAFFFF + + +function %bor_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0:f64x2, v1:f64x2): + v2 = bor v0, v1 + return v2 +} +; run: %bor_f64x2(0xFEDCBA9876543210_0123456789ABCDEF, 0x0123456789ABCDEF_FEDCBA9876543210) == 0xFFFFFFFFFFFFFFFF_FFFFFFFFFFFFFFFF +; run: %bor_f64x2(0x8A8AAAAA8A8AAAAA_8A8AAAAA8A8AAAAA, 0x5440555554405555_5440555554405555) == 0xDECAFFFFDECAFFFF_DECAFFFFDECAFFFF diff --git a/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif b/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif index 6e232b9d81ad..6ce7be76db54 100644 --- a/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif @@ -100,3 +100,27 @@ block0(v0: i64x2, v1: i64): ; run: %bxor_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ; run: %bxor_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f ; run: %bxor_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff + +function %bxor_splat_f32x4(f32x4, i32) -> 
f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = bxor v0, v3 + return v4 +} +; run: %bxor_splat_f32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000001000000010000000100000001 +; run: %bxor_splat_f32x4(0x11111111111111111111111111111111, 0xffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_f32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_f32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff + +function %bxor_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = bxor v0, v3 + return v4 +} +; run: %bxor_splat_f64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000010000000000000001 +; run: %bxor_splat_f64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_f64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_f64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff diff --git a/cranelift/filetests/filetests/runtests/simd-bxor.clif b/cranelift/filetests/filetests/runtests/simd-bxor.clif index a2cce79362b6..20e4b545202c 100644 --- a/cranelift/filetests/filetests/runtests/simd-bxor.clif +++ b/cranelift/filetests/filetests/runtests/simd-bxor.clif @@ -43,3 +43,21 @@ block0(v0:i64x2, v1:i64x2): } ; run: %bxor_i64x2([0xFEDCBA9876543210 0x0123456789ABCDEF], [0x0123456789ABCDEF 0xFEDCBA9876543210]) == [0xFFFFFFFFFFFFFFFF 0xFFFFFFFFFFFFFFFF] ; run: %bxor_i64x2([0x9440A07D9440A07D 0x9440A07D9440A07D], [0x4A8A5F824A8A5F82 0x4A8A5F824A8A5F82]) == [0xDECAFFFFDECAFFFF 0xDECAFFFFDECAFFFF] + + +function %bxor_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0:f32x4, v1:f32x4): + v2 = bxor v0, v1 + return v2 +} +; run: %bxor_f32x4(0xFEDCBA98_76543210_01234567_89ABCDEF, 0x01234567_89ABCDEF_FEDCBA98_76543210) == 0xFFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF +; run: %bxor_f32x4(0x9440A07D_9440A07D_9440A07D_9440A07D, 0x4A8A5F82_4A8A5F82_4A8A5F82_4A8A5F82) == 0xDECAFFFF_DECAFFFF_DECAFFFF_DECAFFFF + + +function %bxor_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0:f64x2, v1:f64x2): + v2 = bxor v0, v1 + return v2 +} +; run: %bxor_f64x2(0xFEDCBA9876543210_0123456789ABCDEF, 0x0123456789ABCDEF_FEDCBA9876543210) == 0xFFFFFFFFFFFFFFFF_FFFFFFFFFFFFFFFF +; run: %bxor_f64x2(0x9440A07D9440A07D_9440A07D9440A07D, 0x4A8A5F824A8A5F82_4A8A5F824A8A5F82) == 0xDECAFFFFDECAFFFF_DECAFFFFDECAFFFF From 1443e2968c2517aa713e9abd0c4253adc8f34ec2 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 21:52:44 +0100 Subject: [PATCH 8/8] riscv64: Add `splat` note --- cranelift/codegen/src/isa/riscv64/lower.isle | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 732858c511e7..fa5e35803190 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1176,3 +1176,5 @@ ;; TODO: We can splat out more patterns by using for example a vmv.v.i i8x16 for ;; a i64x2 const with a compatible bit pattern. The AArch64 Backend does something ;; similar in its splat rules. +;; TODO: Look through bitcasts when splatting out registers. We can use +;; `vmv.v.x` in a `(splat.f32x4 (bitcast.f32 val))`. And vice versa for integers.
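
As a concrete sketch of the bitcast TODO above: a rule along the following lines would let a float-vector splat whose scalar is a `bitcast` of an integer value use `vmv.v.x` directly, avoiding the `fmv.w.x` + `vfmv.v.f` sequence visible in the float splat tests earlier in this series. This is illustrative only and not part of the patches: `rv_vmv_vx` is a hypothetical helper (imagined as wrapping `vmv.v.x` the same way `rv_vand_vx` wraps `vand.vx`), and the exact `bitcast` pattern arity and the rule priority are assumptions.

;; Hypothetical sketch, not part of this patch series: splat a
;; bitcast-from-integer scalar with `vmv.v.x` instead of moving it through
;; an f-register first. `rv_vmv_vx` does not exist in this patch set, and
;; the priority here is arbitrary.
(rule 5 (lower (has_type (ty_vec_fits_in_register ty)
                         (splat (bitcast _ x @ (value_type (ty_int _))))))
      (rv_vmv_vx x ty))

The reverse direction, an integer-vector splat of a scalar bitcast from a float register, could use `vfmv.v.f` in the same way; that is what the "vice versa" in the TODO refers to.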