From dc01a9f7308ce50ab98ca9b08bc849bdb8c0ff92 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sat, 20 May 2023 12:42:14 +0100
Subject: [PATCH 1/8] riscv64: Implement SIMD `bitselect`

---
 cranelift/codegen/src/isa/riscv64/inst.isle   | 17 +-
 cranelift/codegen/src/isa/riscv64/inst/mod.rs |  9 +-
 .../codegen/src/isa/riscv64/inst/vector.rs    | 14 +-
 .../codegen/src/isa/riscv64/inst_vector.isle  | 14 ++
 cranelift/codegen/src/isa/riscv64/lower.isle  | 20 +-
 .../codegen/src/isa/riscv64/lower/isle.rs     |  4 +
 .../filetests/isa/riscv64/simd-bitselect.clif | 206 ++++++++++++++++++
 .../filetests/runtests/simd-bitselect.clif    | 21 ++
 8 files changed, 284 insertions(+), 21 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif

diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle
index 6b639ad311b9..7b12a36a049e 100644
--- a/cranelift/codegen/src/isa/riscv64/inst.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst.isle
@@ -1365,6 +1365,10 @@
 (decl imm5_from_u64 (Imm5) u64)
 (extern extractor imm5_from_u64 imm5_from_u64)

+;; Construct an Imm5 from an i8
+(decl pure partial imm5_from_i8 (i8) Imm5)
+(extern constructor imm5_from_i8 imm5_from_i8)
+
 ;; Extractor that matches a `Value` equivalent to a replicated Imm5 on all lanes.
 ;; TODO: Try matching vconst here as well
 (decl replicated_imm5 (Imm5) Value)
@@ -2215,19 +2219,6 @@
 (decl alloc_vec_writable (Type) VecWritableReg)
 (extern constructor alloc_vec_writable alloc_vec_writable)

-(decl gen_bitselect (Type Reg Reg Reg) Reg)
-(rule
-  (gen_bitselect ty c x y)
-  (let
-    ((tmp_x Reg (rv_and c x))
-     ;;;inverse condition
-     (c_inverse Reg (rv_not c))
-     ;;;get all y part.
-     (tmp_y Reg (rv_and c_inverse y))
-     ;;;get reuslt.
-     (result Reg (rv_or tmp_x tmp_y)))
-    result))
-
 (decl gen_int_select (Type IntSelectOP ValueRegs ValueRegs) ValueRegs)
 (rule
   (gen_int_select ty op x y)
diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
index 659dcb0fa0cf..21b259abd562 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -57,7 +57,7 @@ pub use crate::isa::riscv64::lower::isle::generated_code::{
     AluOPRRI, AluOPRRR, AtomicOP, FClassResult, FFlagsException, FloatRoundOP, FloatSelectOP,
     FpuOPRR, FpuOPRRR, FpuOPRRRR, IntSelectOP, LoadOP, MInst as Inst, StoreOP, FRM,
 };
-use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRR};
+use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRImm5, VecAluOpRRR};

 type BoxCallInfo = Box<CallInfo>;
 type BoxCallIndInfo = Box<CallIndInfo>;
@@ -1663,7 +1663,12 @@ impl Inst {
                     format!("{}", imm)
                 };

-                format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}")
+                match (op, imm) {
+                    (VecAluOpRRImm5::VxorVI, imm) if imm == Imm5::maybe_from_i8(-1).unwrap() => {
+                        format!("vnot.v {vd_s},{vs2_s}{mask} {vstate}")
+                    }
+                    _ => format!("{op} {vd_s},{vs2_s},{imm_s}{mask} {vstate}"),
+                }
             }
             &Inst::VecAluRR {
                 op,
diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index 48d5192efaaf..874b6a015323 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -30,6 +30,9 @@ impl VecAvl {
     }
 }

+// TODO: Can we tell ISLE to derive this?
+impl Copy for VecAvl {}
+
 // TODO: Can we tell ISLE to derive this?
impl PartialEq for VecAvl { fn eq(&self, other: &Self) -> bool { @@ -154,7 +157,7 @@ impl fmt::Display for VecMaskMode { /// Vector Type (VType) /// /// vtype provides the default type used to interpret the contents of the vector register file. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub struct VType { pub sew: VecElementWidth, pub lmul: VecLmul, @@ -189,7 +192,7 @@ impl fmt::Display for VType { /// VState represents the state of the vector unit that each instruction expects before execution. /// Unlike VType or any of the other types here, VState is not a part of the RISC-V ISA. It is /// used by our instruction emission code to ensure that the vector unit is in the correct state. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Copy, Debug, PartialEq)] pub struct VState { pub avl: VecAvl, pub vtype: VType, @@ -354,6 +357,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, + VecAluOpRRImm5::VxorVI => 0b001011, VecAluOpRRImm5::VslidedownVI => 0b001111, VecAluOpRRImm5::VmergeVIM => 0b010111, } @@ -363,6 +367,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VslidedownVI | VecAluOpRRImm5::VmergeVIM => VecOpCategory::OPIVI, } @@ -371,7 +376,10 @@ impl VecAluOpRRImm5 { pub fn imm_is_unsigned(&self) -> bool { match self { VecAluOpRRImm5::VslidedownVI => true, - VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI | VecAluOpRRImm5::VmergeVIM => false, + VecAluOpRRImm5::VaddVI + | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VxorVI + | VecAluOpRRImm5::VmergeVIM => false, } } } diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 64294d2276fa..41b09e469970 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -125,6 +125,7 @@ ;; Regular VI Opcodes (VaddVI) (VrsubVI) + (VxorVI) (VslidedownVI) (VmergeVIM) )) @@ -329,6 +330,19 @@ (rule (rv_vxor_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vxor.vi` instruction. +;; Unlike other `vi` instructions the immediate is zero extended. +(decl rv_vxor_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vxor_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate)) + +;; Helper for emitting the `vnot.v` instruction. +;; This is just a mnemonic for `vxor.vi vd, vs, -1` +(decl rv_vnot_v (Reg VecOpMasking VState) Reg) +(rule (rv_vnot_v vs2 mask vstate) + (if-let neg1 (imm5_from_i8 -1)) + (rv_vxor_vi vs2 neg1 mask vstate)) + ;; Helper for emitting the `vfadd.vv` instruction. (decl rv_vfadd_vv (Reg Reg VecOpMasking VState) Reg) (rule (rv_vfadd_vv vs2 vs1 mask vstate) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index ef72b3568dd1..b6be299c2ad9 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -817,9 +817,23 @@ ;;;;; Rules for `bitselect`;;;;;;;;; -(rule - (lower (has_type ty (bitselect c x y))) - (gen_bitselect ty c x y)) +;; Do a (c & x) | (~c & y) operation. +(rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (bitselect c x y))) + (let ((tmp_x Reg (rv_and c x)) + (c_inverse Reg (rv_not c)) + (tmp_y Reg (rv_and c_inverse y))) + (rv_or tmp_x tmp_y))) + +;; For vectors, we also do the same operation. 
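+;; As a quick sanity check (a sketch with 4-bit values): if c=0b1100,
+;; x=0b1010, and y=0b0101, then (c & x) = 0b1000 and (~c & y) = 0b0001,
+;; so the result is 0b1001: the two high bits come from x and the two
+;; low bits come from y.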
+;; We can technically use any type in the bitwise operations, but prefer
+;; using the type of the inputs so that we avoid emitting unnecessary
+;; `vsetvl` instructions. It's likely that the vector unit is already
+;; configured for that type.
+(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (bitselect c x y)))
+  (let ((tmp_x Reg (rv_vand_vv c x (unmasked) ty))
+        (c_inverse Reg (rv_vnot_v c (unmasked) ty))
+        (tmp_y Reg (rv_vand_vv c_inverse y (unmasked) ty)))
+    (rv_vor_vv tmp_x tmp_y (unmasked) ty)))

 ;;;;; Rules for `isplit`;;;;;;;;;
 (rule
diff --git a/cranelift/codegen/src/isa/riscv64/lower/isle.rs b/cranelift/codegen/src/isa/riscv64/lower/isle.rs
index 86e5daaff756..a17244112ba8 100644
--- a/cranelift/codegen/src/isa/riscv64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/riscv64/lower/isle.rs
@@ -206,6 +206,10 @@ impl generated_code::Context for RV64IsleContext<'_, '_, MInst, Riscv64Backend>
         Imm5::maybe_from_i8(i8::try_from(arg0 as i64).ok()?)
     }
     #[inline]
+    fn imm5_from_i8(&mut self, arg0: i8) -> Option<Imm5> {
+        Imm5::maybe_from_i8(arg0)
+    }
+    #[inline]
     fn uimm5_bitcast_to_imm5(&mut self, arg0: UImm5) -> Imm5 {
         Imm5::from_bits(arg0.bits() as u8)
     }
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
new file mode 100644
index 000000000000..8331763f3383
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-bitselect.clif
@@ -0,0 +1,206 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+function %bitselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2, v2: i64x2):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vand.vv v8,v1,v3 #avl=2, #vtype=(e64, m1, ta, ma)
+; vnot.v v10,v1 #avl=2, #vtype=(e64, m1, ta, ma)
+; vand.vv v12,v10,v5 #avl=2, #vtype=(e64, m1, ta, ma)
+; vor.vv v14,v8,v12 #avl=2, #vtype=(e64, m1, ta, ma)
+; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; addi t6, s0, 0x20
+; .byte 0x87, 0x81, 0x0f, 0x02
+; addi t6, s0, 0x30
+; .byte 0x87, 0x82, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x81, 0xcd
+; .byte 0x57, 0x84, 0x11, 0x26
+; .byte 0x57, 0xb5, 0x1f, 0x2e
+; .byte 0x57, 0x86, 0xa2, 0x26
+; .byte 0x57, 0x07, 0x86, 0x2a
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x07, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
+block0(v0: i32x4, v1: i32x4, v2: i32x4):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vand.vv v8,v1,v3 #avl=4, #vtype=(e32, m1, ta, ma)
+; vnot.v v10,v1 #avl=4, #vtype=(e32, m1, ta, ma)
+; vand.vv v12,v10,v5 #avl=4, #vtype=(e32, m1, ta, ma)
+; vor.vv v14,v8,v12 #avl=4, #vtype=(e32, m1, ta, ma)
+; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x30 +; .byte 0x87, 0x82, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0x84, 0x11, 0x26 +; .byte 0x57, 0xb5, 0x1f, 0x2e +; .byte 0x57, 0x86, 0xa2, 0x26 +; .byte 0x57, 0x07, 0x86, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x07, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bitselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 { +block0(v0: i16x8, v1: i16x8, v2: i16x8): + v3 = bitselect v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v8,v1,v3 #avl=8, #vtype=(e16, m1, ta, ma) +; vnot.v v10,v1 #avl=8, #vtype=(e16, m1, ta, ma) +; vand.vv v12,v10,v5 #avl=8, #vtype=(e16, m1, ta, ma) +; vor.vv v14,v8,v12 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x30 +; .byte 0x87, 0x82, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x84, 0x11, 0x26 +; .byte 0x57, 0xb5, 0x1f, 0x2e +; .byte 0x57, 0x86, 0xa2, 0x26 +; .byte 0x57, 0x07, 0x86, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x07, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16, v2: i8x16): + v3 = bitselect v0, v1, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vle8.v v5,48(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v8,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma) +; vnot.v v10,v1 #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vv v12,v10,v5 #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vv v14,v8,v12 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v14,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; addi t6, s0, 0x20 +; .byte 0x87, 0x81, 0x0f, 0x02 +; addi t6, s0, 0x30 +; .byte 0x87, 0x82, 0x0f, 0x02 +; .byte 0x57, 0x84, 0x11, 0x26 +; .byte 0x57, 0xb5, 0x1f, 0x2e +; .byte 0x57, 0x86, 0xa2, 0x26 +; .byte 0x57, 0x07, 0x86, 0x2a +; .byte 0x27, 0x07, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-bitselect.clif b/cranelift/filetests/filetests/runtests/simd-bitselect.clif index 51e075e7c0ad..f1204bfd68d9 100644 --- a/cranelift/filetests/filetests/runtests/simd-bitselect.clif +++ b/cranelift/filetests/filetests/runtests/simd-bitselect.clif @@ -4,6 +4,17 @@ target aarch64 target s390x target x86_64 has_sse3 
has_ssse3 has_sse41
 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
+target riscv64 has_v
+
+function %bitselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2, v2: i64x2):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+; run: %bitselect_i64x2(0x00000000000000000000000000000000, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x00000000000000000000000000000000
+; run: %bitselect_i64x2(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
+; run: %bitselect_i64x2(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
+; run: %bitselect_i64x2(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000

 function %bitselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
 block0(v0: i32x4, v1: i32x4, v2: i32x4):
@@ -15,6 +26,16 @@ block0(v0: i32x4, v1: i32x4, v2: i32x4):
 ; run: %bitselect_i32x4(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
 ; run: %bitselect_i32x4(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000

+function %bitselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
+block0(v0: i16x8, v1: i16x8, v2: i16x8):
+    v3 = bitselect v0, v1, v2
+    return v3
+}
+; run: %bitselect_i16x8(0x00000000000000000000000000000000, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x00000000000000000000000000000000
+; run: %bitselect_i16x8(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
+; run: %bitselect_i16x8(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
+; run: %bitselect_i16x8(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
+
 function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16, v2: i8x16):
     v3 = bitselect v0, v1, v2

From adcc26c3e53c94be74a76bf8e75e5ad9f551cc85 Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sat, 20 May 2023 13:02:47 +0100
Subject: [PATCH 2/8] riscv64: Add SIMD `bnot`

---
 build.rs                                     |   1 -
 cranelift/codegen/src/isa/riscv64/lower.isle |   5 +-
 cranelift/codegen/src/isle_prelude.rs        |   9 +
 cranelift/codegen/src/prelude.isle           |   4 +
 .../filetests/isa/riscv64/simd-bnot.clif     | 159 ++++++++++++++++++
 .../filetests/runtests/simd-bnot.clif        |  38 +++++
 6 files changed, 214 insertions(+), 2 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-bnot.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-bnot.clif

diff --git a/build.rs b/build.rs
index 5f9b18ebf803..a4409b07e4bf 100644
--- a/build.rs
+++ b/build.rs
@@ -212,7 +212,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             "load_splat_out_of_bounds",
             "simd_align",
             "simd_bit_shift",
-            "simd_bitwise",
             "simd_boolean",
             "simd_conversions",
             "simd_f32x4",
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index b6be299c2ad9..56b046baa34e 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++
b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -399,9 +399,12 @@
     (rv_vxor_vv x y (unmasked) ty))

 ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-(rule (lower (has_type ty (bnot x)))
+(rule 0 (lower (has_type (ty_scalar ty) (bnot x)))
   (gen_bnot ty x))

+(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (bnot x)))
+  (rv_vnot_v x (unmasked) ty))
+
 ;;;; Rules for `bit_reverse` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 (rule (lower (has_type (fits_in_64 (ty_int ty)) (bitrev x)))
   (lower_bit_reverse x ty))
diff --git a/cranelift/codegen/src/isle_prelude.rs b/cranelift/codegen/src/isle_prelude.rs
index b5e3aaee28db..ff458ba69641 100644
--- a/cranelift/codegen/src/isle_prelude.rs
+++ b/cranelift/codegen/src/isle_prelude.rs
@@ -373,6 +373,15 @@ macro_rules! isle_common_prelude_methods {
             ty.is_int().then(|| ty)
         }

+        #[inline]
+        fn ty_scalar(&mut self, ty: Type) -> Option<Type> {
+            if ty.lane_count() == 1 {
+                Some(ty)
+            } else {
+                None
+            }
+        }
+
         #[inline]
         fn ty_scalar_float(&mut self, ty: Type) -> Option<Type> {
             match ty {
diff --git a/cranelift/codegen/src/prelude.isle b/cranelift/codegen/src/prelude.isle
index 6b2baf1a5aba..0d336cb21399 100644
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -383,6 +383,10 @@
 (decl ty_int (Type) Type)
 (extern extractor ty_int ty_int)

+;; An extractor that only matches scalar types: float, int, or ref types.
+(decl ty_scalar (Type) Type)
+(extern extractor ty_scalar ty_scalar)
+
 ;; An extractor that only matches scalar floating-point types--F32 or F64.
 (decl ty_scalar_float (Type) Type)
 (extern extractor ty_scalar_float ty_scalar_float)
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bnot.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bnot.clif
new file mode 100644
index 000000000000..58d3364d2775
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-bnot.clif
@@ -0,0 +1,159 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+
+function %bnot_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = bnot v0
+    return v1
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vnot.v v4,v1 #avl=16, #vtype=(e8, m1, ta, ma)
+; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0xb2, 0x1f, 0x2e
+; .byte 0x27, 0x02, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bnot_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = bnot v0
+    return v1
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vnot.v v4,v1 #avl=8, #vtype=(e16, m1, ta, ma)
+; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x84, 0xcc
+; .byte 0x57, 0xb2, 0x1f, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x02, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function
%bnot_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = bnot v0
+    return v1
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vnot.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma)
+; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x02, 0xcd
+; .byte 0x57, 0xb2, 0x1f, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x02, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bnot_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = bnot v0
+    return v1
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vnot.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma)
+; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x81, 0xcd
+; .byte 0x57, 0xb2, 0x1f, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0x27, 0x02, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-bnot.clif b/cranelift/filetests/filetests/runtests/simd-bnot.clif
new file mode 100644
index 000000000000..866f1eaa9370
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-bnot.clif
@@ -0,0 +1,38 @@
+test interpret
+test run
+target aarch64
+target s390x
+target x86_64 has_sse41=false
+set enable_simd
+target x86_64
+target x86_64 skylake
+target riscv64 has_v
+
+
+function %bnot_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = bnot v0
+    return v1
+}
+; run: %bnot_i8x16(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_i8x16(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_i8x16(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_i8x16(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
+
+function %bnot_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = bnot v0
+    return v1
+}
+
+function %bnot_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = bnot v0
+    return v1
+}
+
+function %bnot_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = bnot v0
+    return v1
+}

From a1cb4ae7441a1bdaade7fa1d866b63516a3c746b Mon Sep 17 00:00:00 2001
From: Afonso Bordado
Date: Sat, 20 May 2023 19:30:51 +0100
Subject: [PATCH 3/8] riscv64: Add `bxor` splat rules

---
 .../codegen/src/isa/riscv64/inst/vector.rs   |   3 +-
 .../codegen/src/isa/riscv64/inst_vector.isle |   6 +
 cranelift/codegen/src/isa/riscv64/lower.isle |  30 +-
 .../filetests/isa/riscv64/simd-bxor.clif     | 328 +++++++++++++++++-
 .../filetests/runtests/simd-bnot.clif        |  12 +
 .../filetests/runtests/simd-bxor-splat.clif  | 102 ++++++
 6 files changed, 466 insertions(+), 15 deletions(-)
 create mode 100644 cranelift/filetests/filetests/runtests/simd-bxor-splat.clif

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index 874b6a015323..c721653372d3 100644
---
a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -277,7 +277,7 @@ impl VecAluOpRRR { VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, VecAluOpRRR::VandVV => 0b001001, VecAluOpRRR::VorVV => 0b001010, - VecAluOpRRR::VxorVV => 0b001011, + VecAluOpRRR::VxorVV | VecAluOpRRR::VxorVX => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, VecAluOpRRR::VfrsubVF => 0b100111, VecAluOpRRR::VmergeVVM | VecAluOpRRR::VmergeVXM | VecAluOpRRR::VfmergeVFM => 0b010111, @@ -301,6 +301,7 @@ impl VecAluOpRRR { VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VxorVX | VecAluOpRRR::VslidedownVX | VecAluOpRRR::VmergeVXM => VecOpCategory::OPIVX, VecAluOpRRR::VfaddVV diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 41b09e469970..aa968cf0f73e 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -109,6 +109,7 @@ (VaddVX) (VsubVX) (VrsubVX) + (VxorVX) (VslidedownVX) (VfaddVF) (VfsubVF) @@ -330,6 +331,11 @@ (rule (rv_vxor_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vxor.vx` instruction. +(decl rv_vxor_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vxor_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VxorVX) vs2 vs1 mask vstate)) + ;; Helper for emitting the `vxor.vi` instruction. ;; Unlike other `vi` instructions the immediate is zero extended. (decl rv_vxor_vi (Reg Imm5 VecOpMasking VState) Reg) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 56b046baa34e..c432b26cfb78 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -376,28 +376,38 @@ (rv_vor_vv x y (unmasked) ty)) ;;;; Rules for `xor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) +(rule 0 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) (rv_xor x y)) ;; Special cases for when one operand is an immediate that fits in 12 bits. 
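+;; (`xori` sign-extends its 12-bit immediate, so these rules cover
+;; constants in the range -2048..=2047; anything wider falls back to
+;; the register-register `xor` rule above.)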
-(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x (imm12_from_value y)))) +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x (imm12_from_value y)))) (rv_xori x y)) -(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bxor (imm12_from_value x) y))) +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bxor (imm12_from_value x) y))) (rv_xori y x)) -(rule (lower (has_type $I128 (bxor x y))) +(rule 3 (lower (has_type $I128 (bxor x y))) (lower_b128_binary (AluOPRRR.Xor) x y)) -(rule (lower (has_type $F32 (bxor x y))) - (lower_float_binary (AluOPRRR.Xor) x y $F32)) +(rule 4 (lower (has_type (ty_scalar_float ty) (bxor x y))) + (lower_float_binary (AluOPRRR.Xor) x y ty)) -(rule (lower (has_type $F64 (bxor x y))) - (lower_float_binary (AluOPRRR.Xor) x y $F64)) - -(rule 3 (lower (has_type (ty_vec_fits_in_register ty) (bxor x y))) +(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (bxor x y))) (rv_vxor_vv x y (unmasked) ty)) +(rule 6 (lower (has_type (ty_vec_fits_in_register ty) (bxor x (splat y)))) + (rv_vxor_vx x y (unmasked) ty)) + +(rule 7 (lower (has_type (ty_vec_fits_in_register ty) (bxor (splat x) y))) + (rv_vxor_vx y x (unmasked) ty)) + +(rule 8 (lower (has_type (ty_vec_fits_in_register ty) (bxor x (replicated_imm5 y)))) + (rv_vxor_vi x y (unmasked) ty)) + +(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (bxor (replicated_imm5 x) y))) + (rv_vxor_vi y x (unmasked) ty)) + + ;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_scalar ty) (bnot x))) (gen_bnot ty x)) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif index 0c8cc8f1ad4c..d4acbade9772 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif @@ -23,7 +23,7 @@ block0(v0: i8x16, v1: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -63,7 +63,7 @@ block0(v0: i16x8, v1: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -105,7 +105,7 @@ block0(v0: i32x4, v1: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -147,7 +147,7 @@ block0(v0: i64x2, v1: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -169,3 +169,323 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %bxor_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vi v4,v1,5 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0xb2, 0x12, 0x2e +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v 
v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vi v4,v1,-16 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x18, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vi v4,v1,15 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb2, 0x17, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = bxor v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vi v4,v1,-5 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb2, 0x1d, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x42, 0x15, 0x2e +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = bxor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vxor.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; 
addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x84, 0xcc
+; .byte 0xd7, 0x42, 0x15, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0xa7, 0x82, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bxor_splat_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = splat.i32x4 v1
+    v3 = bxor v0, v2
+    return v3
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vxor.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma)
+; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x02, 0xcd
+; .byte 0xd7, 0x42, 0x15, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0xa7, 0x82, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
+function %bxor_splat_i64x2(i64x2, i64) -> i64x2 {
+block0(v0: i64x2, v1: i64):
+    v2 = splat.i64x2 v1
+    v3 = bxor v2, v0
+    return v3
+}
+
+; VCode:
+; add sp,-16
+; sd ra,8(sp)
+; sd fp,0(sp)
+; mv fp,sp
+; block0:
+; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+; vxor.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma)
+; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+; ld ra,8(sp)
+; ld fp,0(sp)
+; add sp,+16
+; ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+; addi sp, sp, -0x10
+; sd ra, 8(sp)
+; sd s0, 0(sp)
+; ori s0, sp, 0
+; block1: ; offset 0x10
+; .byte 0x57, 0x70, 0x08, 0xcc
+; addi t6, s0, 0x10
+; .byte 0x87, 0x80, 0x0f, 0x02
+; .byte 0x57, 0x70, 0x81, 0xcd
+; .byte 0xd7, 0x42, 0x15, 0x2e
+; .byte 0x57, 0x70, 0x08, 0xcc
+; .byte 0xa7, 0x82, 0x05, 0x02
+; ld ra, 8(sp)
+; ld s0, 0(sp)
+; addi sp, sp, 0x10
+; ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-bnot.clif b/cranelift/filetests/filetests/runtests/simd-bnot.clif
index 866f1eaa9370..b2c0446f6fbe 100644
--- a/cranelift/filetests/filetests/runtests/simd-bnot.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bnot.clif
@@ -24,15 +24,27 @@ block0(v0: i16x8):
     v1 = bnot v0
     return v1
 }
+; run: %bnot_i16x8(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_i16x8(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_i16x8(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_i16x8(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee

 function %bnot_i32x4(i32x4) -> i32x4 {
 block0(v0: i32x4):
     v1 = bnot v0
     return v1
 }
+; run: %bnot_i32x4(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_i32x4(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_i32x4(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_i32x4(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee

 function %bnot_i64x2(i64x2) -> i64x2 {
 block0(v0: i64x2):
     v1 = bnot v0
     return v1
 }
+; run: %bnot_i64x2(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_i64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_i64x2(0x01010011000011110000000011111111)
== 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_i64x2(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
diff --git a/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif b/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif
new file mode 100644
index 000000000000..6e232b9d81ad
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif
@@ -0,0 +1,102 @@
+test interpret
+test run
+target aarch64
+target s390x
+target x86_64 has_sse41=false
+set enable_simd
+target x86_64
+target x86_64 skylake
+target riscv64 has_v
+
+function %bxor_splat_const_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 5
+    v2 = splat.i8x16 v1
+    v3 = bxor v0, v2
+    return v3
+}
+; run: %bxor_splat_const_i8x16(0x00000000000000000000000000000000) == 0x05050505050505050505050505050505
+; run: %bxor_splat_const_i8x16(0x11111111111111111111111111111111) == 0x14141414141414141414141414141414
+; run: %bxor_splat_const_i8x16(0x01010011000011110000000011111111) == 0x04040514050514140505050514141414
+; run: %bxor_splat_const_i8x16(0x00000000000000001111111111111111) == 0x05050505050505051414141414141414
+
+function %bxor_splat_const_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i16 -16
+    v2 = splat.i16x8 v1
+    v3 = bxor v0, v2
+    return v3
+}
+; run: %bxor_splat_const_i16x8(0x00000000000000000000000000000000) == 0xfff0fff0fff0fff0fff0fff0fff0fff0
+; run: %bxor_splat_const_i16x8(0x11111111111111111111111111111111) == 0xeee1eee1eee1eee1eee1eee1eee1eee1
+; run: %bxor_splat_const_i16x8(0x01010011000011110000000011111111) == 0xfef1ffe1fff0eee1fff0fff0eee1eee1
+; run: %bxor_splat_const_i16x8(0x00000000000000001111111111111111) == 0xfff0fff0fff0fff0eee1eee1eee1eee1
+
+function %bxor_splat_const_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 15
+    v2 = splat.i32x4 v1
+    v3 = bxor v0, v2
+    return v3
+}
+; run: %bxor_splat_const_i32x4(0x00000000000000000000000000000000) == 0x0000000f0000000f0000000f0000000f
+; run: %bxor_splat_const_i32x4(0x11111111111111111111111111111111) == 0x1111111e1111111e1111111e1111111e
+; run: %bxor_splat_const_i32x4(0x01010011000011110000000011111111) == 0x0101001e0000111e0000000f1111111e
+; run: %bxor_splat_const_i32x4(0x00000000000000001111111111111111) == 0x0000000f0000000f1111111e1111111e
+
+function %bxor_splat_const_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i64 -5
+    v2 = splat.i64x2 v1
+    v3 = bxor v2, v0
+    return v3
+}
+; run: %bxor_splat_const_i64x2(0x00000000000000000000000000000000) == 0xfffffffffffffffbfffffffffffffffb
+; run: %bxor_splat_const_i64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeaeeeeeeeeeeeeeeea
+; run: %bxor_splat_const_i64x2(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeaffffffffeeeeeeea
+; run: %bxor_splat_const_i64x2(0x00000000000000001111111111111111) == 0xfffffffffffffffbeeeeeeeeeeeeeeea
+
+
+function %bxor_splat_i8x16(i8x16, i8) -> i8x16 {
+block0(v0: i8x16, v1: i8):
+    v2 = splat.i8x16 v1
+    v3 = bxor v0, v2
+    return v3
+}
+; run: %bxor_splat_i8x16(0x00000000000000000000000000000000, 0x01) == 0x01010101010101010101010101010101
+; run: %bxor_splat_i8x16(0x11111111111111111111111111111111, 0xff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bxor_splat_i8x16(0x01010011000011110000000011111111, 0x8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f
+; run: %bxor_splat_i8x16(0x00000000000000001111111111111111, 0xbe) == 0xbebebebebebebebeafafafafafafafaf
+
+function %bxor_splat_i16x8(i16x8, i16) -> i16x8 {
+block0(v0: i16x8, v1: i16):
+    v2 = splat.i16x8 v1
+    v3 =
bxor v0, v2 + return v3 +} +; run: %bxor_splat_i16x8(0x00000000000000000000000000000000, 0x0001) == 0x00010001000100010001000100010001 +; run: %bxor_splat_i16x8(0x11111111111111111111111111111111, 0xffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_i16x8(0x01010011000011110000000011111111, 0x8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_i16x8(0x00000000000000001111111111111111, 0xc0fe) == 0xc0fec0fec0fec0fed1efd1efd1efd1ef + +function %bxor_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = bxor v0, v2 + return v3 +} +; run: %bxor_splat_i32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000001000000010000000100000001 +; run: %bxor_splat_i32x4(0x11111111111111111111111111111111, 0xffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_i32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_i32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff + +function %bxor_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = bxor v2, v0 + return v3 +} +; run: %bxor_splat_i64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000010000000000000001 +; run: %bxor_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff From 5c4994433f4c90e462806b5daa3f3b6c2c5f4e34 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 19:44:33 +0100 Subject: [PATCH 4/8] riscv64: Add SIMD `bor` optimizations --- .../codegen/src/isa/riscv64/inst/vector.rs | 6 +- .../codegen/src/isa/riscv64/inst_vector.isle | 13 +- cranelift/codegen/src/isa/riscv64/lower.isle | 46 +-- .../filetests/isa/riscv64/simd-bor.clif | 320 ++++++++++++++++++ .../filetests/runtests/simd-bor-splat.clif | 102 ++++++ 5 files changed, 466 insertions(+), 21 deletions(-) create mode 100644 cranelift/filetests/filetests/runtests/simd-bor-splat.clif diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index c721653372d3..7b1329e71e1e 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -276,7 +276,7 @@ impl VecAluOpRRR { VecAluOpRRR::VmulhVV => 0b100111, VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, VecAluOpRRR::VandVV => 0b001001, - VecAluOpRRR::VorVV => 0b001010, + VecAluOpRRR::VorVV | VecAluOpRRR::VorVX => 0b001010, VecAluOpRRR::VxorVV | VecAluOpRRR::VxorVX => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, VecAluOpRRR::VfrsubVF => 0b100111, @@ -301,6 +301,7 @@ impl VecAluOpRRR { VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VorVX | VecAluOpRRR::VxorVX | VecAluOpRRR::VslidedownVX | VecAluOpRRR::VmergeVXM => VecOpCategory::OPIVX, @@ -358,6 +359,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, + VecAluOpRRImm5::VorVI => 0b001010, VecAluOpRRImm5::VxorVI => 0b001011, VecAluOpRRImm5::VslidedownVI => 0b001111, VecAluOpRRImm5::VmergeVIM => 0b010111, @@ -368,6 +370,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | 
VecAluOpRRImm5::VorVI | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VslidedownVI | VecAluOpRRImm5::VmergeVIM => VecOpCategory::OPIVI, @@ -379,6 +382,7 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VslidedownVI => true, VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VorVI | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VmergeVIM => false, } diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index aa968cf0f73e..976add60a4e6 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -109,6 +109,7 @@ (VaddVX) (VsubVX) (VrsubVX) + (VorVX) (VxorVX) (VslidedownVX) (VfaddVF) @@ -126,6 +127,7 @@ ;; Regular VI Opcodes (VaddVI) (VrsubVI) + (VorVI) (VxorVI) (VslidedownVI) (VmergeVIM) @@ -326,6 +328,16 @@ (rule (rv_vor_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vor.vx` instruction. +(decl rv_vor_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vor_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VorVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vor.vi` instruction. +(decl rv_vor_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vor_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VorVI) vs2 imm mask vstate)) + ;; Helper for emitting the `vxor.vv` instruction. (decl rv_vxor_vv (Reg Reg VecOpMasking VState) Reg) (rule (rv_vxor_vv vs2 vs1 mask vstate) @@ -337,7 +349,6 @@ (vec_alu_rrr (VecAluOpRRR.VxorVX) vs2 vs1 mask vstate)) ;; Helper for emitting the `vxor.vi` instruction. -;; Unlike other `vi` instructions the immediate is zero extended. (decl rv_vxor_vi (Reg Imm5 VecOpMasking VState) Reg) (rule (rv_vxor_vi vs2 imm mask vstate) (vec_alu_rr_imm5 (VecAluOpRRImm5.VxorVI) vs2 imm mask vstate)) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index c432b26cfb78..23e664bd2f61 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -330,51 +330,59 @@ ;;;; Rules for `or` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (ty_int ty) (bor x y))) +(rule 0 (lower (has_type (ty_int ty) (bor x y))) (gen_or ty x y)) ;; Special cases for when one operand is an immediate that fits in 12 bits. -(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (imm12_from_value y)))) +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (imm12_from_value y)))) (rv_ori x y)) -(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (bor (imm12_from_value x) y))) +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (bor (imm12_from_value x) y))) (rv_ori y x)) -(rule (lower (has_type $F32 (bor x y))) - (lower_float_binary (AluOPRRR.Or) x y $F32)) - -(rule (lower (has_type $F64 (bor x y))) - (lower_float_binary (AluOPRRR.Or) x y $F64)) +(rule 3 (lower (has_type (ty_scalar_float ty) (bor x y))) + (lower_float_binary (AluOPRRR.Or) x y ty)) ;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced ;; by Cranelift's `bor_not` instruction that is legalized into the simpler ;; forms early on. 
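+;; (Zbb's `orn rd, rs1, rs2` computes `rs1 | ~rs2`, so the fused form
+;; lowers to a single instruction instead of a separate not-then-or.)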
-(rule 3 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (bnot y)))) +(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (bor x (bnot y)))) (if-let $true (has_zbb)) (rv_orn x y)) -(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (bor (bnot y) x))) +(rule 5 (lower (has_type (fits_in_64 (ty_int ty)) (bor (bnot y) x))) (if-let $true (has_zbb)) (rv_orn x y)) -(rule 5 (lower (has_type $I128 (bor x (bnot y)))) +(rule 6 (lower (has_type $I128 (bor x (bnot y)))) (if-let $true (has_zbb)) - (let - ((low Reg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) + (let ((low Reg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) (value_regs low high))) -(rule 6 (lower (has_type $I128 (bor (bnot y) x))) +(rule 7 (lower (has_type $I128 (bor (bnot y) x))) (if-let $true (has_zbb)) - (let - ((low Reg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) + (let ((low Reg (rv_orn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (rv_orn (value_regs_get x 1) (value_regs_get y 1)))) (value_regs low high))) -(rule 7 (lower (has_type (ty_vec_fits_in_register ty) (bor x y))) +(rule 8 (lower (has_type (ty_vec_fits_in_register ty) (bor x y))) (rv_vor_vv x y (unmasked) ty)) +(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (bor x (splat y)))) + (rv_vor_vx x y (unmasked) ty)) + +(rule 10 (lower (has_type (ty_vec_fits_in_register ty) (bor (splat x) y))) + (rv_vor_vx y x (unmasked) ty)) + +(rule 11 (lower (has_type (ty_vec_fits_in_register ty) (bor x (replicated_imm5 y)))) + (rv_vor_vi x y (unmasked) ty)) + +(rule 12 (lower (has_type (ty_vec_fits_in_register ty) (bor (replicated_imm5 x) y))) + (rv_vor_vi y x (unmasked) ty)) + + ;;;; Rules for `xor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (fits_in_64 (ty_int ty)) (bxor x y))) (rv_xor x y)) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif index 7f8beb629f50..e556b8a554dc 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif @@ -169,3 +169,323 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %bor_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vi v4,v1,5 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0xb2, 0x12, 0x2a +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vi v4,v1,-16 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 
+; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x18, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vi v4,v1,15 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb2, 0x17, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = bor v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vi v4,v1,-5 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb2, 0x1d, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x42, 0x15, 0x2a +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 
0xcc +; .byte 0xd7, 0x42, 0x15, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = bor v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = bor v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vor.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-bor-splat.clif b/cranelift/filetests/filetests/runtests/simd-bor-splat.clif new file mode 100644 index 000000000000..29b1cfd33140 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-bor-splat.clif @@ -0,0 +1,102 @@ +test interpret +test run +target aarch64 +target s390x +target x86_64 has_sse41=false +set enable_simd +target x86_64 +target x86_64 skylake +target riscv64 has_v + +function %bor_splat_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_const_i8x16(0x00000000000000000000000000000000) == 0x05050505050505050505050505050505 +; run: %bor_splat_const_i8x16(0x11111111111111111111111111111111) == 0x15151515151515151515151515151515 +; run: %bor_splat_const_i8x16(0x01010011000011110000000011111111) == 0x05050515050515150505050515151515 +; run: %bor_splat_const_i8x16(0x00000000000000001111111111111111) == 0x05050505050505051515151515151515 + +function %bor_splat_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_const_i16x8(0x00000000000000000000000000000000) == 0xfff0fff0fff0fff0fff0fff0fff0fff0 +; run: %bor_splat_const_i16x8(0x11111111111111111111111111111111) == 0xfff1fff1fff1fff1fff1fff1fff1fff1 +; run: %bor_splat_const_i16x8(0x01010011000011110000000011111111) == 0xfff1fff1fff0fff1fff0fff0fff1fff1 +; run: %bor_splat_const_i16x8(0x00000000000000001111111111111111) == 0xfff0fff0fff0fff0fff1fff1fff1fff1 + +function %bor_splat_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = bor v0, v2 
+ return v3 +} +; run: %bor_splat_const_i32x4(0x00000000000000000000000000000000) == 0x0000000f0000000f0000000f0000000f +; run: %bor_splat_const_i32x4(0x11111111111111111111111111111111) == 0x1111111f1111111f1111111f1111111f +; run: %bor_splat_const_i32x4(0x01010011000011110000000011111111) == 0x0101001f0000111f0000000f1111111f +; run: %bor_splat_const_i32x4(0x00000000000000001111111111111111) == 0x0000000f0000000f1111111f1111111f + +function %bor_splat_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = bor v2, v0 + return v3 +} +; run: %bor_splat_const_i64x2(0x00000000000000000000000000000000) == 0xfffffffffffffffbfffffffffffffffb +; run: %bor_splat_const_i64x2(0x11111111111111111111111111111111) == 0xfffffffffffffffbfffffffffffffffb +; run: %bor_splat_const_i64x2(0x01010011000011110000000011111111) == 0xfffffffffffffffbfffffffffffffffb +; run: %bor_splat_const_i64x2(0x00000000000000001111111111111111) == 0xfffffffffffffffbfffffffffffffffb + + +function %bor_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_i8x16(0x00000000000000000000000000000000, 0x01) == 0x01010101010101010101010101010101 +; run: %bor_splat_i8x16(0x11111111111111111111111111111111, 0xff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_i8x16(0x01010011000011110000000011111111, 0x8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_i8x16(0x00000000000000001111111111111111, 0xbe) == 0xbebebebebebebebebfbfbfbfbfbfbfbf + +function %bor_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_i16x8(0x00000000000000000000000000000000, 0x0001) == 0x00010001000100010001000100010001 +; run: %bor_splat_i16x8(0x11111111111111111111111111111111, 0xffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_i16x8(0x01010011000011110000000011111111, 0x8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_i16x8(0x00000000000000001111111111111111, 0xc0fe) == 0xc0fec0fec0fec0fed1ffd1ffd1ffd1ff + +function %bor_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = bor v0, v2 + return v3 +} +; run: %bor_splat_i32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000001000000010000000100000001 +; run: %bor_splat_i32x4(0x11111111111111111111111111111111, 0xffffffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_i32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_i32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff + +function %bor_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = bor v2, v0 + return v3 +} +; run: %bor_splat_i64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000010000000000000001 +; run: %bor_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff From 9ca616c8796a4051a9ea33bc72244899097f82c3 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 19:59:25 +0100 Subject: [PATCH 5/8] riscv64: Add SIMD `band` splat rules --- .../codegen/src/isa/riscv64/inst/vector.rs | 6 
+- .../codegen/src/isa/riscv64/inst_vector.isle | 12 + cranelift/codegen/src/isa/riscv64/lower.isle | 46 +-- .../filetests/isa/riscv64/simd-band.clif | 328 +++++++++++++++++- .../filetests/runtests/simd-band-splat.clif | 102 ++++++ 5 files changed, 469 insertions(+), 25 deletions(-) create mode 100644 cranelift/filetests/filetests/runtests/simd-band-splat.clif diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs index 7b1329e71e1e..fae3d66a4698 100644 --- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs +++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs @@ -275,7 +275,7 @@ impl VecAluOpRRR { VecAluOpRRR::VmulVV => 0b100101, VecAluOpRRR::VmulhVV => 0b100111, VecAluOpRRR::VmulhuVV | VecAluOpRRR::VfmulVV | VecAluOpRRR::VfmulVF => 0b100100, - VecAluOpRRR::VandVV => 0b001001, + VecAluOpRRR::VandVV | VecAluOpRRR::VandVX => 0b001001, VecAluOpRRR::VorVV | VecAluOpRRR::VorVX => 0b001010, VecAluOpRRR::VxorVV | VecAluOpRRR::VxorVX => 0b001011, VecAluOpRRR::VslidedownVX => 0b001111, @@ -301,6 +301,7 @@ impl VecAluOpRRR { VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX + | VecAluOpRRR::VandVX | VecAluOpRRR::VorVX | VecAluOpRRR::VxorVX | VecAluOpRRR::VslidedownVX @@ -359,6 +360,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI => 0b000000, VecAluOpRRImm5::VrsubVI => 0b000011, + VecAluOpRRImm5::VandVI => 0b001001, VecAluOpRRImm5::VorVI => 0b001010, VecAluOpRRImm5::VxorVI => 0b001011, VecAluOpRRImm5::VslidedownVI => 0b001111, @@ -370,6 +372,7 @@ impl VecAluOpRRImm5 { match self { VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VandVI | VecAluOpRRImm5::VorVI | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VslidedownVI @@ -382,6 +385,7 @@ impl VecAluOpRRImm5 { VecAluOpRRImm5::VslidedownVI => true, VecAluOpRRImm5::VaddVI | VecAluOpRRImm5::VrsubVI + | VecAluOpRRImm5::VandVI | VecAluOpRRImm5::VorVI | VecAluOpRRImm5::VxorVI | VecAluOpRRImm5::VmergeVIM => false, diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle index 976add60a4e6..0a6d8dc599b6 100644 --- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle +++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle @@ -109,6 +109,7 @@ (VaddVX) (VsubVX) (VrsubVX) + (VandVX) (VorVX) (VxorVX) (VslidedownVX) @@ -127,6 +128,7 @@ ;; Regular VI Opcodes (VaddVI) (VrsubVI) + (VandVI) (VorVI) (VxorVI) (VslidedownVI) @@ -323,6 +325,16 @@ (rule (rv_vand_vv vs2 vs1 mask vstate) (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 mask vstate)) +;; Helper for emitting the `vand.vx` instruction. +(decl rv_vand_vx (Reg Reg VecOpMasking VState) Reg) +(rule (rv_vand_vx vs2 vs1 mask vstate) + (vec_alu_rrr (VecAluOpRRR.VandVX) vs2 vs1 mask vstate)) + +;; Helper for emitting the `vand.vi` instruction. +(decl rv_vand_vi (Reg Imm5 VecOpMasking VState) Reg) +(rule (rv_vand_vi vs2 imm mask vstate) + (vec_alu_rr_imm5 (VecAluOpRRImm5.VandVI) vs2 imm mask vstate)) + ;; Helper for emitting the `vor.vv` instruction. 
(decl rv_vor_vv (Reg Reg VecOpMasking VState) Reg) (rule (rv_vor_vv vs2 vs1 mask vstate) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 23e664bd2f61..d53bee81668c 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -282,52 +282,58 @@ (rv_remu x y))) ;;;; Rules for `and` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule -1 (lower (has_type (ty_int ty) (band x y))) +(rule 0 (lower (has_type (ty_int ty) (band x y))) (gen_and ty x y)) ;; Special cases for when one operand is an immediate that fits in 12 bits. -(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (band x (imm12_from_value y)))) +(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (band x (imm12_from_value y)))) (rv_andi x y)) -(rule 1 (lower (has_type (fits_in_64 (ty_int ty)) (band (imm12_from_value x) y))) +(rule 2 (lower (has_type (fits_in_64 (ty_int ty)) (band (imm12_from_value x) y))) (rv_andi y x)) -(rule (lower (has_type $F32 (band x y))) - (lower_float_binary (AluOPRRR.And) x y $F32)) - -(rule (lower (has_type $F64 (band x y))) - (lower_float_binary (AluOPRRR.And) x y $F64)) +(rule 3 (lower (has_type (ty_scalar_float ty) (band x y))) + (lower_float_binary (AluOPRRR.And) x y ty)) ;; Specialized lowerings for `(band x (bnot y))` which is additionally produced ;; by Cranelift's `band_not` instruction that is legalized into the simpler ;; forms early on. -(rule 3 (lower (has_type (fits_in_64 (ty_int ty)) (band x (bnot y)))) +(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (band x (bnot y)))) (if-let $true (has_zbb)) (rv_andn x y)) -(rule 4 (lower (has_type (fits_in_64 (ty_int ty)) (band (bnot y) x))) +(rule 5 (lower (has_type (fits_in_64 (ty_int ty)) (band (bnot y) x))) (if-let $true (has_zbb)) (rv_andn x y)) -(rule 5 (lower (has_type $I128 (band x (bnot y)))) +(rule 6 (lower (has_type $I128 (band x (bnot y)))) (if-let $true (has_zbb)) - (let - ((low Reg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) + (let ((low Reg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) (value_regs low high))) -(rule 6 (lower (has_type $I128 (band (bnot y) x))) +(rule 7 (lower (has_type $I128 (band (bnot y) x))) (if-let $true (has_zbb)) - (let - ((low Reg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) - (high Reg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) + (let ((low Reg (rv_andn (value_regs_get x 0) (value_regs_get y 0))) + (high Reg (rv_andn (value_regs_get x 1) (value_regs_get y 1)))) (value_regs low high))) - -(rule 7 (lower (has_type (ty_vec_fits_in_register ty) (band x y))) +(rule 8 (lower (has_type (ty_vec_fits_in_register ty) (band x y))) (rv_vand_vv x y (unmasked) ty)) +(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (band x (splat y)))) + (rv_vand_vx x y (unmasked) ty)) + +(rule 10 (lower (has_type (ty_vec_fits_in_register ty) (band (splat x) y))) + (rv_vand_vx y x (unmasked) ty)) + +(rule 11 (lower (has_type (ty_vec_fits_in_register ty) (band x (replicated_imm5 y)))) + (rv_vand_vi x y (unmasked) ty)) + +(rule 12 (lower (has_type (ty_vec_fits_in_register ty) (band (replicated_imm5 x) y))) + (rv_vand_vi y x (unmasked) ty)) + ;;;; Rules for `or` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule 0 (lower (has_type (ty_int ty) (bor x y))) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif 
b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif index c4c6a3530482..a0b99c569b77 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif @@ -23,7 +23,7 @@ block0(v0: i8x16, v1: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -63,7 +63,7 @@ block0(v0: i16x8, v1: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -105,7 +105,7 @@ block0(v0: i32x4, v1: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -147,7 +147,7 @@ block0(v0: i64x2, v1: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -169,3 +169,323 @@ block0(v0: i64x2, v1: i64x2): ; addi sp, sp, 0x10 ; ret +function %band_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vi v4,v1,5 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0xb2, 0x12, 0x26 +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vi v4,v1,-16 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0x57, 0x32, 0x18, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vi v4,v1,15 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0x57, 0xb2, 0x17, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = band v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; 
sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vi v4,v1,-5 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0x57, 0xb2, 0x1d, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0x27, 0x02, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0xd7, 0x42, 0x15, 0x26 +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x84, 0xcc +; .byte 0xd7, 0x42, 0x15, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = band v0, v2 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = band v2, v0 + return v3 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; vand.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: 
+; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0x42, 0x15, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x82, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-band-splat.clif b/cranelift/filetests/filetests/runtests/simd-band-splat.clif new file mode 100644 index 000000000000..ede40bb89431 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-band-splat.clif @@ -0,0 +1,102 @@ +test interpret +test run +target aarch64 +target s390x +target x86_64 has_sse41=false +set enable_simd +target x86_64 +target x86_64 skylake +target riscv64 has_v + +function %band_splat_const_i8x16(i8x16) -> i8x16 { +block0(v0: i8x16): + v1 = iconst.i8 5 + v2 = splat.i8x16 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_const_i8x16(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 +; run: %band_splat_const_i8x16(0x11111111111111111111111111111111) == 0x01010101010101010101010101010101 +; run: %band_splat_const_i8x16(0x01010011000011110000000011111111) == 0x01010001000001010000000001010101 +; run: %band_splat_const_i8x16(0x00000000000000001111111111111111) == 0x00000000000000000101010101010101 + +function %band_splat_const_i16x8(i16x8) -> i16x8 { +block0(v0: i16x8): + v1 = iconst.i16 -16 + v2 = splat.i16x8 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_const_i16x8(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 +; run: %band_splat_const_i16x8(0x11111111111111111111111111111111) == 0x11101110111011101110111011101110 +; run: %band_splat_const_i16x8(0x01010011000011110000000011111111) == 0x01000010000011100000000011101110 +; run: %band_splat_const_i16x8(0x00000000000000001111111111111111) == 0x00000000000000001110111011101110 + +function %band_splat_const_i32x4(i32x4) -> i32x4 { +block0(v0: i32x4): + v1 = iconst.i32 15 + v2 = splat.i32x4 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_const_i32x4(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 +; run: %band_splat_const_i32x4(0x11111111111111111111111111111111) == 0x00000001000000010000000100000001 +; run: %band_splat_const_i32x4(0x01010011000011110000000011111111) == 0x00000001000000010000000000000001 +; run: %band_splat_const_i32x4(0x00000000000000001111111111111111) == 0x00000000000000000000000100000001 + +function %band_splat_const_i64x2(i64x2) -> i64x2 { +block0(v0: i64x2): + v1 = iconst.i64 -5 + v2 = splat.i64x2 v1 + v3 = band v2, v0 + return v3 +} +; run: %band_splat_const_i64x2(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 +; run: %band_splat_const_i64x2(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 +; run: %band_splat_const_i64x2(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 +; run: %band_splat_const_i64x2(0x00000000000000001111111111111111) == 0x00000000000000001111111111111111 + + +function %band_splat_i8x16(i8x16, i8) -> i8x16 { +block0(v0: i8x16, v1: i8): + v2 = splat.i8x16 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_i8x16(0x00000000000000000000000000000000, 0x01) == 0x00000000000000000000000000000000 +; run: %band_splat_i8x16(0x11111111111111111111111111111111, 0xff) == 0x11111111111111111111111111111111 +; run: 
%band_splat_i8x16(0x01010011000011110000000011111111, 0x8e) == 0x00000000000000000000000000000000 +; run: %band_splat_i8x16(0x00000000000000001111111111111111, 0xbe) == 0x00000000000000001010101010101010 + +function %band_splat_i16x8(i16x8, i16) -> i16x8 { +block0(v0: i16x8, v1: i16): + v2 = splat.i16x8 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_i16x8(0x00000000000000000000000000000000, 0x0001) == 0x00000000000000000000000000000000 +; run: %band_splat_i16x8(0x11111111111111111111111111111111, 0xffff) == 0x11111111111111111111111111111111 +; run: %band_splat_i16x8(0x01010011000011110000000011111111, 0x8e8e) == 0x00000000000000000000000000000000 +; run: %band_splat_i16x8(0x00000000000000001111111111111111, 0xc0fe) == 0x00000000000000000010001000100010 + +function %band_splat_i32x4(i32x4, i32) -> i32x4 { +block0(v0: i32x4, v1: i32): + v2 = splat.i32x4 v1 + v3 = band v0, v2 + return v3 +} +; run: %band_splat_i32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000000000000000000000000000000 +; run: %band_splat_i32x4(0x11111111111111111111111111111111, 0xffffffff) == 0x11111111111111111111111111111111 +; run: %band_splat_i32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x00000000000000000000000000000000 +; run: %band_splat_i32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0x00000000000000000011000000110000 + +function %band_splat_i64x2(i64x2, i64) -> i64x2 { +block0(v0: i64x2, v1: i64): + v2 = splat.i64x2 v1 + v3 = band v2, v0 + return v3 +} +; run: %band_splat_i64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000000000000000000000 +; run: %band_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0x11111111111111111111111111111111 +; run: %band_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x00000000000000000000000000000000 +; run: %band_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0x00000000000000000011000000110000 From 37c3011c0a318230115c54783e9f0e6d0a79cbcf Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 20:55:38 +0100 Subject: [PATCH 6/8] riscv64: Fix tests --- .../filetests/isa/riscv64/simd-band.clif | 24 +++++++------- .../filetests/isa/riscv64/simd-bxor.clif | 24 +++++++------- .../filetests/runtests/simd-bnot.clif | 32 +++++++++---------- 3 files changed, 40 insertions(+), 40 deletions(-) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif index a0b99c569b77..3d3630e9087f 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif @@ -23,7 +23,7 @@ block0(v0: i8x16, v1: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -63,7 +63,7 @@ block0(v0: i16x8, v1: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -105,7 +105,7 @@ block0(v0: i32x4, v1: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -147,7 +147,7 @@ block0(v0: i64x2, v1: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -190,7 +190,7 @@ block0(v0: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -229,7 +229,7 @@ block0(v0: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ 
-270,7 +270,7 @@ block0(v0: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -311,7 +311,7 @@ block0(v0: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -351,7 +351,7 @@ block0(v0: i8x16, v1: i8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -389,7 +389,7 @@ block0(v0: i16x8, v1: i16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -429,7 +429,7 @@ block0(v0: i32x4, v1: i32): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -469,7 +469,7 @@ block0(v0: i64x2, v1: i64): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif index d4acbade9772..d0b7290ebbe1 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif @@ -23,7 +23,7 @@ block0(v0: i8x16, v1: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -63,7 +63,7 @@ block0(v0: i16x8, v1: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -105,7 +105,7 @@ block0(v0: i32x4, v1: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -147,7 +147,7 @@ block0(v0: i64x2, v1: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -190,7 +190,7 @@ block0(v0: i8x16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -229,7 +229,7 @@ block0(v0: i16x8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -270,7 +270,7 @@ block0(v0: i32x4): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -311,7 +311,7 @@ block0(v0: i64x2): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -351,7 +351,7 @@ block0(v0: i8x16, v1: i8): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -389,7 +389,7 @@ block0(v0: i16x8, v1: i16): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -429,7 +429,7 @@ block0(v0: i32x4, v1: i32): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 @@ -469,7 +469,7 @@ block0(v0: i64x2, v1: i64): ; ld fp,0(sp) ; add sp,+16 ; ret -; +; ; Disassembled: ; block0: ; offset 0x0 ; addi sp, sp, -0x10 diff --git a/cranelift/filetests/filetests/runtests/simd-bnot.clif b/cranelift/filetests/filetests/runtests/simd-bnot.clif index b2c0446f6fbe..92d13d17770c 100644 --- a/cranelift/filetests/filetests/runtests/simd-bnot.clif +++ b/cranelift/filetests/filetests/runtests/simd-bnot.clif @@ -14,37 +14,37 @@ block0(v0: i8x16): v1 = bnot v0 return v1 } -; run: %bnot_i8x16(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 -; run: %bnot_i8x16(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 -; run: %bnot_i8x16(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 -; run: 
%bnot_i8x16(0x00000000000000001111111111111111) == 0x11111111111111110000000000000000 +; run: %bnot_i8x16(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff +; run: %bnot_i8x16(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bnot_i8x16(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee +; run: %bnot_i8x16(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee function %bnot_i16x8(i16x8) -> i16x8 { block0(v0: i16x8): v1 = bnot v0 return v1 } -; run: %bnot_i16x8(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 -; run: %bnot_i16x8(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 -; run: %bnot_i16x8(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 -; run: %bnot_i16x8(0x00000000000000001111111111111111) == 0x11111111111111110000000000000000 +; run: %bnot_i16x8(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff +; run: %bnot_i16x8(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bnot_i16x8(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee +; run: %bnot_i16x8(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee function %bnot_i32x4(i32x4) -> i32x4 { block0(v0: i32x4): v1 = bnot v0 return v1 } -; run: %bnot_i32x4(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 -; run: %bnot_i32x4(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 -; run: %bnot_i32x4(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 -; run: %bnot_i32x4(0x00000000000000001111111111111111) == 0x11111111111111110000000000000000 +; run: %bnot_i32x4(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff +; run: %bnot_i32x4(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bnot_i32x4(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee +; run: %bnot_i32x4(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee function %bnot_i64x2(i64x2) -> i64x2 { block0(v0: i64x2): v1 = bnot v0 return v1 } -; run: %bnot_i64x2(0x00000000000000000000000000000000) == 0x00000000000000000000000000000000 -; run: %bnot_i64x2(0x11111111111111111111111111111111) == 0x11111111111111111111111111111111 -; run: %bnot_i64x2(0x01010011000011110000000011111111) == 0x01010011000011110000000011111111 -; run: %bnot_i64x2(0x00000000000000001111111111111111) == 0x11111111111111110000000000000000 +; run: %bnot_i64x2(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff +; run: %bnot_i64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bnot_i64x2(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee +; run: %bnot_i64x2(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee From 75d3e72349a5efa0e413e6020c8b454ceba54558 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 21:48:43 +0100 Subject: [PATCH 7/8] riscv64: Restrict `.vx` rules to integer arguments --- cranelift/codegen/src/isa/riscv64/lower.isle | 6 ++ .../filetests/isa/riscv64/simd-band.clif | 90 +++++++++++++++++++ .../filetests/isa/riscv64/simd-bor.clif | 90 +++++++++++++++++++ .../filetests/isa/riscv64/simd-bxor.clif | 90 +++++++++++++++++++ .../filetests/runtests/simd-band-splat.clif | 24 +++++ .../filetests/runtests/simd-band.clif | 19 ++++ 
.../filetests/runtests/simd-bnot.clif | 20 +++++ .../filetests/runtests/simd-bor-splat.clif | 25 ++++++ .../filetests/runtests/simd-bor.clif | 18 ++++ .../filetests/runtests/simd-bxor-splat.clif | 24 +++++ .../filetests/runtests/simd-bxor.clif | 18 ++++ 11 files changed, 424 insertions(+) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index d53bee81668c..732858c511e7 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -323,9 +323,11 @@ (rv_vand_vv x y (unmasked) ty)) (rule 9 (lower (has_type (ty_vec_fits_in_register ty) (band x (splat y)))) + (if (ty_vector_not_float ty)) (rv_vand_vx x y (unmasked) ty)) (rule 10 (lower (has_type (ty_vec_fits_in_register ty) (band (splat x) y))) + (if (ty_vector_not_float ty)) (rv_vand_vx y x (unmasked) ty)) (rule 11 (lower (has_type (ty_vec_fits_in_register ty) (band x (replicated_imm5 y)))) @@ -377,9 +379,11 @@ (rv_vor_vv x y (unmasked) ty)) (rule 9 (lower (has_type (ty_vec_fits_in_register ty) (bor x (splat y)))) + (if (ty_vector_not_float ty)) (rv_vor_vx x y (unmasked) ty)) (rule 10 (lower (has_type (ty_vec_fits_in_register ty) (bor (splat x) y))) + (if (ty_vector_not_float ty)) (rv_vor_vx y x (unmasked) ty)) (rule 11 (lower (has_type (ty_vec_fits_in_register ty) (bor x (replicated_imm5 y)))) @@ -410,9 +414,11 @@ (rv_vxor_vv x y (unmasked) ty)) (rule 6 (lower (has_type (ty_vec_fits_in_register ty) (bxor x (splat y)))) + (if (ty_vector_not_float ty)) (rv_vxor_vx x y (unmasked) ty)) (rule 7 (lower (has_type (ty_vec_fits_in_register ty) (bxor (splat x) y))) + (if (ty_vector_not_float ty)) (rv_vxor_vx y x (unmasked) ty)) (rule 8 (lower (has_type (ty_vec_fits_in_register ty) (bxor x (replicated_imm5 y)))) diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif index 3d3630e9087f..20fd9c3cfa83 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-band.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-band.clif @@ -489,3 +489,93 @@ block0(v0: i64x2, v1: i64): ; addi sp, sp, 0x10 ; ret +function %band_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = band v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.w.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=4, #vtype=(e32, m1, ta, ma) +; vand.vv v7,v1,v7 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.w.x ft7, a0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %band_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = band v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.d.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=2, #vtype=(e64, m1, ta, ma) +; vand.vv v7,v1,v7 #avl=2, #vtype=(e64, m1, ta, ma) +; 
vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.d.x ft7, a0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x26 +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif index e556b8a554dc..2204581bb542 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bor.clif @@ -489,3 +489,93 @@ block0(v0: i64x2, v1: i64): ; addi sp, sp, 0x10 ; ret +function %bor_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = bor v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.w.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=4, #vtype=(e32, m1, ta, ma) +; vor.vv v7,v1,v7 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.w.x ft7, a0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bor_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = bor v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.d.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=2, #vtype=(e64, m1, ta, ma) +; vor.vv v7,v1,v7 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.d.x ft7, a0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x2a +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif index d0b7290ebbe1..0f3eb3f0a1a8 100644 --- a/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif +++ b/cranelift/filetests/filetests/isa/riscv64/simd-bxor.clif @@ -489,3 +489,93 @@ block0(v0: i64x2, v1: i64): ; addi sp, sp, 0x10 ; ret +function %bxor_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = bxor v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; 
fmv.w.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=4, #vtype=(e32, m1, ta, ma) +; vxor.vv v7,v1,v7 #avl=4, #vtype=(e32, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.w.x ft7, a0 +; .byte 0x57, 0x70, 0x02, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + +function %bxor_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = bxor v0, v3 + return v4 +} + +; VCode: +; add sp,-16 +; sd ra,8(sp) +; sd fp,0(sp) +; mv fp,sp +; block0: +; vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma) +; fmv.d.x ft7,a0 +; vfmv.v.f v7,ft7 #avl=2, #vtype=(e64, m1, ta, ma) +; vxor.vv v7,v1,v7 #avl=2, #vtype=(e64, m1, ta, ma) +; vse8.v v7,0(a1) #avl=16, #vtype=(e8, m1, ta, ma) +; ld ra,8(sp) +; ld fp,0(sp) +; add sp,+16 +; ret +; +; Disassembled: +; block0: ; offset 0x0 +; addi sp, sp, -0x10 +; sd ra, 8(sp) +; sd s0, 0(sp) +; ori s0, sp, 0 +; block1: ; offset 0x10 +; .byte 0x57, 0x70, 0x08, 0xcc +; addi t6, s0, 0x10 +; .byte 0x87, 0x80, 0x0f, 0x02 +; fmv.d.x ft7, a0 +; .byte 0x57, 0x70, 0x81, 0xcd +; .byte 0xd7, 0xd3, 0x03, 0x5e +; .byte 0xd7, 0x83, 0x13, 0x2e +; .byte 0x57, 0x70, 0x08, 0xcc +; .byte 0xa7, 0x83, 0x05, 0x02 +; ld ra, 8(sp) +; ld s0, 0(sp) +; addi sp, sp, 0x10 +; ret + diff --git a/cranelift/filetests/filetests/runtests/simd-band-splat.clif b/cranelift/filetests/filetests/runtests/simd-band-splat.clif index ede40bb89431..cc912d21b791 100644 --- a/cranelift/filetests/filetests/runtests/simd-band-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-band-splat.clif @@ -100,3 +100,27 @@ block0(v0: i64x2, v1: i64): ; run: %band_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0x11111111111111111111111111111111 ; run: %band_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x00000000000000000000000000000000 ; run: %band_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0x00000000000000000011000000110000 + +function %band_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = band v0, v3 + return v4 +} +; run: %band_splat_f32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000000000000000000000000000000 +; run: %band_splat_f32x4(0x11111111111111111111111111111111, 0xffffffff) == 0x11111111111111111111111111111111 +; run: %band_splat_f32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x00000000000000000000000000000000 +; run: %band_splat_f32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0x00000000000000000011000000110000 + +function %band_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = band v0, v3 + return v4 +} +; run: %band_splat_f64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000000000000000000000 +; run: %band_splat_f64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0x11111111111111111111111111111111 +; run: %band_splat_f64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x00000000000000000000000000000000 +; run: 
%band_splat_f64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0x00000000000000000011000000110000
diff --git a/cranelift/filetests/filetests/runtests/simd-band.clif b/cranelift/filetests/filetests/runtests/simd-band.clif
index 2bfe927f3e22..55defb7f945f 100644
--- a/cranelift/filetests/filetests/runtests/simd-band.clif
+++ b/cranelift/filetests/filetests/runtests/simd-band.clif
@@ -45,3 +45,22 @@ block0(v0:i64x2, v1:i64x2):
 ; run: %band_i64x2([0xFEDCBA9876543210 0x0123456789ABCDEF], [0x0123456789ABCDEF 0xFEDCBA9876543210]) == [0 0]
 ; run: %band_i64x2([0xFEEEFFFFFEEEFFFF 0xF1FFFEFEF1FFFEFE], [0xDFDBFFFFDFDBFFFF 0xCEFFEFEFCEFFEFEF]) == [0xDECAFFFFDECAFFFF 0xC0FFEEEEC0FFEEEE]
+
+function %band_f32x4(f32x4, f32x4) -> f32x4 {
+block0(v0:f32x4, v1:f32x4):
+    v2 = band v0, v1
+    return v2
+}
+; run: %band_f32x4(0xFEDCBA98_76543210_01234567_89ABCDEF, 0x01234567_89ABCDEF_FEDCBA98_76543210) == 0x00000000_00000000_00000000_00000000
+; run: %band_f32x4(0xFEEEFFFF_FEEEFFFF_F1FFFEFE_F1FFFEFE, 0xDFDBFFFF_DFDBFFFF_CEFFEFEF_CEFFEFEF) == 0xDECAFFFF_DECAFFFF_C0FFEEEE_C0FFEEEE
+
+
+
+function %band_f64x2(f64x2, f64x2) -> f64x2 {
+block0(v0:f64x2, v1:f64x2):
+    v2 = band v0, v1
+    return v2
+}
+
+; run: %band_f64x2(0xFEDCBA98_76543210_01234567_89ABCDEF, 0x01234567_89ABCDEF_FEDCBA98_76543210) == 0x00000000_00000000_00000000_00000000
+; run: %band_f64x2(0xFEEEFFFF_FEEEFFFF_F1FFFEFE_F1FFFEFE, 0xDFDBFFFF_DFDBFFFF_CEFFEFEF_CEFFEFEF) == 0xDECAFFFF_DECAFFFF_C0FFEEEE_C0FFEEEE
diff --git a/cranelift/filetests/filetests/runtests/simd-bnot.clif b/cranelift/filetests/filetests/runtests/simd-bnot.clif
index 92d13d17770c..2682d2e54674 100644
--- a/cranelift/filetests/filetests/runtests/simd-bnot.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bnot.clif
@@ -48,3 +48,23 @@ block0(v0: i64x2):
 ; run: %bnot_i64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
 ; run: %bnot_i64x2(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
 ; run: %bnot_i64x2(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
+
+function %bnot_f32x4(f32x4) -> f32x4 {
+block0(v0: f32x4):
+    v1 = bnot v0
+    return v1
+}
+; run: %bnot_f32x4(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_f32x4(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_f32x4(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_f32x4(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
+
+function %bnot_f64x2(f64x2) -> f64x2 {
+block0(v0: f64x2):
+    v1 = bnot v0
+    return v1
+}
+; run: %bnot_f64x2(0x00000000000000000000000000000000) == 0xffffffffffffffffffffffffffffffff
+; run: %bnot_f64x2(0x11111111111111111111111111111111) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
+; run: %bnot_f64x2(0x01010011000011110000000011111111) == 0xfefeffeeffffeeeeffffffffeeeeeeee
+; run: %bnot_f64x2(0x00000000000000001111111111111111) == 0xffffffffffffffffeeeeeeeeeeeeeeee
diff --git a/cranelift/filetests/filetests/runtests/simd-bor-splat.clif b/cranelift/filetests/filetests/runtests/simd-bor-splat.clif
index 29b1cfd33140..7fe6b77d9f9b 100644
--- a/cranelift/filetests/filetests/runtests/simd-bor-splat.clif
+++ b/cranelift/filetests/filetests/runtests/simd-bor-splat.clif
@@ -100,3 +100,25 @@ block0(v0: i64x2, v1: i64):
 ; run: %bor_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xffffffffffffffffffffffffffffffff
 ; run: %bor_splat_i64x2(0x01010011000011110000000011111111, 
0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f ; run: %bor_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff + + +function %bor_splat_f32x4(f32x4, i32) -> f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = bor v0, v3 + return v4 +} +; run: %bor_splat_f32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000001000000010000000100000001 +; run: %bor_splat_f32x4(0x11111111111111111111111111111111, 0xffffffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_f32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_f32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff + +function %bor_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = bor v0, v3 + return v4 +} +; run: %bor_splat_f64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000010000000000000001 +; run: %bor_splat_f64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xffffffffffffffffffffffffffffffff +; run: %bor_splat_f64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bor_splat_f64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1ffffffd1ffffff diff --git a/cranelift/filetests/filetests/runtests/simd-bor.clif b/cranelift/filetests/filetests/runtests/simd-bor.clif index 15fe37de9204..eac53ea65152 100644 --- a/cranelift/filetests/filetests/runtests/simd-bor.clif +++ b/cranelift/filetests/filetests/runtests/simd-bor.clif @@ -43,3 +43,21 @@ block0(v0:i64x2, v1:i64x2): } ; run: %bor_i64x2([0xFEDCBA9876543210 0x0123456789ABCDEF], [0x0123456789ABCDEF 0xFEDCBA9876543210]) == [0xFFFFFFFFFFFFFFFF 0xFFFFFFFFFFFFFFFF] ; run: %bor_i64x2([0x8A8AAAAA8A8AAAAA 0x8A8AAAAA8A8AAAAA], [0x5440555554405555 0x5440555554405555]) == [0xDECAFFFFDECAFFFF 0xDECAFFFFDECAFFFF] + + +function %bor_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0:f32x4, v1:f32x4): + v2 = bor v0, v1 + return v2 +} +; run: %bor_f32x4(0xFEDCBA98_76543210_01234567_89ABCDEF, 0x01234567_89ABCDEF_FEDCBA98_76543210) == 0xFFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF +; run: %bor_f32x4(0x8A8AAAAA_8A8AAAAA_8A8AAAAA_8A8AAAAA, 0x54405555_54405555_54405555_54405555) == 0xDECAFFFF_DECAFFFF_DECAFFFF_DECAFFFF + + +function %bor_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0:f64x2, v1:f64x2): + v2 = bor v0, v1 + return v2 +} +; run: %bor_f64x2(0xFEDCBA9876543210_0123456789ABCDEF, 0x0123456789ABCDEF_FEDCBA9876543210) == 0xFFFFFFFFFFFFFFFF_FFFFFFFFFFFFFFFF +; run: %bor_f64x2(0x8A8AAAAA8A8AAAAA_8A8AAAAA8A8AAAAA, 0x5440555554405555_5440555554405555) == 0xDECAFFFFDECAFFFF_DECAFFFFDECAFFFF diff --git a/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif b/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif index 6e232b9d81ad..6ce7be76db54 100644 --- a/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif +++ b/cranelift/filetests/filetests/runtests/simd-bxor-splat.clif @@ -100,3 +100,27 @@ block0(v0: i64x2, v1: i64): ; run: %bxor_splat_i64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ; run: %bxor_splat_i64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f ; run: %bxor_splat_i64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff + +function %bxor_splat_f32x4(f32x4, i32) -> 
f32x4 { +block0(v0: f32x4, v1: i32): + v2 = bitcast.f32 v1 + v3 = splat.f32x4 v2 + v4 = bxor v0, v3 + return v4 +} +; run: %bxor_splat_f32x4(0x00000000000000000000000000000000, 0x00000001) == 0x00000001000000010000000100000001 +; run: %bxor_splat_f32x4(0x11111111111111111111111111111111, 0xffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_f32x4(0x01010011000011110000000011111111, 0x8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_f32x4(0x00000000000000001111111111111111, 0xc0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff + +function %bxor_splat_f64x2(f64x2, i64) -> f64x2 { +block0(v0: f64x2, v1: i64): + v2 = bitcast.f64 v1 + v3 = splat.f64x2 v2 + v4 = bxor v0, v3 + return v4 +} +; run: %bxor_splat_f64x2(0x00000000000000000000000000000000, 0x0000000000000001) == 0x00000000000000010000000000000001 +; run: %bxor_splat_f64x2(0x11111111111111111111111111111111, 0xffffffffffffffff) == 0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee +; run: %bxor_splat_f64x2(0x01010011000011110000000011111111, 0x8e8e8e8e8e8e8e8e) == 0x8f8f8e9f8e8e9f9f8e8e8e8e9f9f9f9f +; run: %bxor_splat_f64x2(0x00000000000000001111111111111111, 0xc0ffeeeec0ffeeee) == 0xc0ffeeeec0ffeeeed1eeffffd1eeffff diff --git a/cranelift/filetests/filetests/runtests/simd-bxor.clif b/cranelift/filetests/filetests/runtests/simd-bxor.clif index a2cce79362b6..20e4b545202c 100644 --- a/cranelift/filetests/filetests/runtests/simd-bxor.clif +++ b/cranelift/filetests/filetests/runtests/simd-bxor.clif @@ -43,3 +43,21 @@ block0(v0:i64x2, v1:i64x2): } ; run: %bxor_i64x2([0xFEDCBA9876543210 0x0123456789ABCDEF], [0x0123456789ABCDEF 0xFEDCBA9876543210]) == [0xFFFFFFFFFFFFFFFF 0xFFFFFFFFFFFFFFFF] ; run: %bxor_i64x2([0x9440A07D9440A07D 0x9440A07D9440A07D], [0x4A8A5F824A8A5F82 0x4A8A5F824A8A5F82]) == [0xDECAFFFFDECAFFFF 0xDECAFFFFDECAFFFF] + + +function %bxor_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0:f32x4, v1:f32x4): + v2 = bxor v0, v1 + return v2 +} +; run: %bxor_f32x4(0xFEDCBA98_76543210_01234567_89ABCDEF, 0x01234567_89ABCDEF_FEDCBA98_76543210) == 0xFFFFFFFF_FFFFFFFF_FFFFFFFF_FFFFFFFF +; run: %bxor_f32x4(0x9440A07D_9440A07D_9440A07D_9440A07D, 0x4A8A5F82_4A8A5F82_4A8A5F82_4A8A5F82) == 0xDECAFFFF_DECAFFFF_DECAFFFF_DECAFFFF + + +function %bxor_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0:f64x2, v1:f64x2): + v2 = bxor v0, v1 + return v2 +} +; run: %bxor_f64x2(0xFEDCBA9876543210_0123456789ABCDEF, 0x0123456789ABCDEF_FEDCBA9876543210) == 0xFFFFFFFFFFFFFFFF_FFFFFFFFFFFFFFFF +; run: %bxor_f64x2(0x9440A07D9440A07D_9440A07D9440A07D, 0x4A8A5F824A8A5F82_4A8A5F824A8A5F82) == 0xDECAFFFFDECAFFFF_DECAFFFFDECAFFFF From 1443e2968c2517aa713e9abd0c4253adc8f34ec2 Mon Sep 17 00:00:00 2001 From: Afonso Bordado Date: Sat, 20 May 2023 21:52:44 +0100 Subject: [PATCH 8/8] riscv64: Add `splat` note --- cranelift/codegen/src/isa/riscv64/lower.isle | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle index 732858c511e7..fa5e35803190 100644 --- a/cranelift/codegen/src/isa/riscv64/lower.isle +++ b/cranelift/codegen/src/isa/riscv64/lower.isle @@ -1176,3 +1176,5 @@ ;; TODO: We can splat out more patterns by using for example a vmv.v.i i8x16 for ;; a i64x2 const with a compatible bit pattern. The AArch64 Backend does something ;; similar in its splat rules. +;; TODO: Look through bitcasts when splatting out registers. We can use +;; `vmv.v.x` in a `(splat.f32x4 (bitcast.f32 val))`. And vice versa for integers.
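
As a concrete sketch of the bitcast TODO above: a rule along the following lines would let a float-vector splat whose scalar is a `bitcast` of an integer value use `vmv.v.x` directly, avoiding the `fmv.w.x` + `vfmv.v.f` sequence visible in the float splat tests earlier in this series. This is illustrative only and not part of the patches: `rv_vmv_vx` is a hypothetical helper (imagined as wrapping `vmv.v.x` the same way `rv_vand_vx` wraps `vand.vx`), and the exact `bitcast` pattern arity and the rule priority are assumptions.

;; Hypothetical sketch, not part of this patch series: splat a
;; bitcast-from-integer scalar with `vmv.v.x` instead of moving it through
;; an f-register first. `rv_vmv_vx` does not exist in this patch set, and
;; the priority here is arbitrary.
(rule 5 (lower (has_type (ty_vec_fits_in_register ty)
                         (splat (bitcast _ x @ (value_type (ty_int _))))))
      (rv_vmv_vx x ty))

The reverse direction, an integer-vector splat of a scalar bitcast from a float register, could use `vfmv.v.f` in the same way; that is what the "vice versa" in the TODO refers to.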