From e51be74ae5ed12bb7284000067303bfbd03a10f3 Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sat, 6 May 2023 09:02:55 +0100
Subject: [PATCH 1/8] riscv64: Rename VecAluOpRRR Variants

---
 .../codegen/src/isa/riscv64/inst/vector.rs    | 34 ++++++++++---------
 .../codegen/src/isa/riscv64/inst_vector.isle  | 32 ++++++++---------
 2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index c6029fb2fdeb..e1c9b41a102f 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -246,26 +246,28 @@ impl VecAluOpRRR {
     }
     pub fn funct3(&self) -> u32 {
         match self {
-            VecAluOpRRR::Vadd
-            | VecAluOpRRR::Vsub
-            | VecAluOpRRR::Vand
-            | VecAluOpRRR::Vor
-            | VecAluOpRRR::Vxor => VecOpCategory::OPIVV,
-            VecAluOpRRR::Vmul | VecAluOpRRR::Vmulh | VecAluOpRRR::Vmulhu => VecOpCategory::OPMVV,
+            VecAluOpRRR::VaddVV
+            | VecAluOpRRR::VsubVV
+            | VecAluOpRRR::VandVV
+            | VecAluOpRRR::VorVV
+            | VecAluOpRRR::VxorVV => VecOpCategory::OPIVV,
+            VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => {
+                VecOpCategory::OPMVV
+            }
         }
         .encode()
     }
     pub fn funct6(&self) -> u32 {
         // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc
         match self {
-            VecAluOpRRR::Vadd => 0b000000,
-            VecAluOpRRR::Vsub => 0b000010,
-            VecAluOpRRR::Vmul => 0b100101,
-            VecAluOpRRR::Vmulh => 0b100111,
-            VecAluOpRRR::Vmulhu => 0b100100,
-            VecAluOpRRR::Vand => 0b001001,
-            VecAluOpRRR::Vor => 0b001010,
-            VecAluOpRRR::Vxor => 0b001011,
+            VecAluOpRRR::VaddVV => 0b000000,
+            VecAluOpRRR::VsubVV => 0b000010,
+            VecAluOpRRR::VmulVV => 0b100101,
+            VecAluOpRRR::VmulhVV => 0b100111,
+            VecAluOpRRR::VmulhuVV => 0b100100,
+            VecAluOpRRR::VandVV => 0b001001,
+            VecAluOpRRR::VorVV => 0b001010,
+            VecAluOpRRR::VxorVV => 0b001011,
         }
     }
 }
@@ -274,8 +276,8 @@ impl fmt::Display for VecAluOpRRR {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         let mut s = format!("{self:?}");
         s.make_ascii_lowercase();
-        s.push_str(".vv");
-        f.write_str(&s)
+        let (opcode, category) = s.split_at(s.len() - 2); // e.g. "vaddvv" -> ("vadd", "vv")
+        write!(f, "{opcode}.{category}")
     }
 }
 
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
index 01a60f80abfa..a91eaecec844 100644
--- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -83,14 +83,14 @@
 
 ;; Register to Register ALU Ops
 (type VecAluOpRRR (enum
-  (Vadd)
-  (Vsub)
-  (Vmul)
-  (Vmulh)
-  (Vmulhu)
-  (Vand)
-  (Vor)
-  (Vxor)
+  (VaddVV)
+  (VsubVV)
+  (VmulVV)
+  (VmulhVV)
+  (VmulhuVV)
+  (VandVV)
+  (VorVV)
+  (VxorVV)
 ))
 
 ;; Register-Imm ALU Ops
@@ -181,7 +181,7 @@
 ;; Helper for emitting the `vadd.vv` instruction.
 (decl rv_vadd_vv (Reg Reg VState) Reg)
 (rule (rv_vadd_vv vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.Vadd) vs2 vs1 vstate))
+  (vec_alu_rrr (VecAluOpRRR.VaddVV) vs2 vs1 vstate))
 
 ;; Helper for emitting the `vadd.vi` instruction.
 (decl rv_vadd_vi (Reg Imm5 VState) Reg)
@@ -191,34 +191,34 @@
 ;; Helper for emitting the `vsub.vv` instruction.
 (decl rv_vsub_vv (Reg Reg VState) Reg)
 (rule (rv_vsub_vv vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.Vsub) vs2 vs1 vstate))
+  (vec_alu_rrr (VecAluOpRRR.VsubVV) vs2 vs1 vstate))
 
 ;; Helper for emitting the `vmul.vv` instruction.
 (decl rv_vmul_vv (Reg Reg VState) Reg)
 (rule (rv_vmul_vv vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.Vmul) vs2 vs1 vstate))
+  (vec_alu_rrr (VecAluOpRRR.VmulVV) vs2 vs1 vstate))
 
 ;; Helper for emitting the `vmulh.vv` instruction.
 (decl rv_vmulh_vv (Reg Reg VState) Reg)
 (rule (rv_vmulh_vv vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.Vmulh) vs2 vs1 vstate))
+  (vec_alu_rrr (VecAluOpRRR.VmulhVV) vs2 vs1 vstate))
 
 ;; Helper for emitting the `vmulhu.vv` instruction.
 (decl rv_vmulhu_vv (Reg Reg VState) Reg)
 (rule (rv_vmulhu_vv vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.Vmulhu) vs2 vs1 vstate))
+  (vec_alu_rrr (VecAluOpRRR.VmulhuVV) vs2 vs1 vstate))
 
 ;; Helper for emitting the `vand.vv` instruction.
 (decl rv_vand_vv (Reg Reg VState) Reg)
 (rule (rv_vand_vv vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.Vand) vs2 vs1 vstate))
+  (vec_alu_rrr (VecAluOpRRR.VandVV) vs2 vs1 vstate))
 
 ;; Helper for emitting the `vor.vv` instruction.
 (decl rv_vor_vv (Reg Reg VState) Reg)
 (rule (rv_vor_vv vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.Vor) vs2 vs1 vstate))
+  (vec_alu_rrr (VecAluOpRRR.VorVV) vs2 vs1 vstate))
 
 ;; Helper for emitting the `vxor.vv` instruction.
 (decl rv_vxor_vv (Reg Reg VState) Reg)
 (rule (rv_vxor_vv vs2 vs1 vstate)
-  (vec_alu_rrr (VecAluOpRRR.Vxor) vs2 vs1 vstate))
+  (vec_alu_rrr (VecAluOpRRR.VxorVV) vs2 vs1 vstate))

From 9939732aaa08eec7d6b2d088cf3878529a0a59e4 Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sat, 6 May 2023 09:05:56 +0100
Subject: [PATCH 2/8] riscv64: Rename VecAluOpRRImm5 Variants

---
 cranelift/codegen/src/isa/riscv64/inst/vector.rs   | 6 +++---
 cranelift/codegen/src/isa/riscv64/inst_vector.isle | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index e1c9b41a102f..34960f614ecf 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -292,7 +292,7 @@ impl VecAluOpRRImm5 {
     pub fn funct6(&self) -> u32 {
         // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc
         match self {
-            VecAluOpRRImm5::Vadd => 0b000000,
+            VecAluOpRRImm5::VaddVI => 0b000000,
         }
     }
 }
@@ -301,8 +301,8 @@ impl fmt::Display for VecAluOpRRImm5 {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         let mut s = format!("{self:?}");
         s.make_ascii_lowercase();
-        s.push_str(".vi");
-        f.write_str(&s)
+        let (opcode, category) = s.split_at(s.len() - 2); // e.g. "vaddvi" -> ("vadd", "vi")
+        write!(f, "{opcode}.{category}")
     }
 }
 
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
index a91eaecec844..8b26dacc70dc 100644
--- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -95,7 +95,7 @@
 
 ;; Register-Imm ALU Ops
 (type VecAluOpRRImm5 (enum
-  (Vadd)
+  (VaddVI)
 ))
 
 
@@ -186,7 +186,7 @@
 ;; Helper for emitting the `vadd.vi` instruction.
 (decl rv_vadd_vi (Reg Imm5 VState) Reg)
 (rule (rv_vadd_vi vs2 imm vstate)
-  (vec_alu_rr_imm5 (VecAluOpRRImm5.Vadd) vs2 imm vstate))
+  (vec_alu_rr_imm5 (VecAluOpRRImm5.VaddVI) vs2 imm vstate))
 
 ;; Helper for emitting the `vsub.vv` instruction.
 (decl rv_vsub_vv (Reg Reg VState) Reg)

From 9e90014c07af76ad9bfd0025d2be3049e21fb557 Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sat, 6 May 2023 09:51:42 +0100
Subject: [PATCH 3/8] riscv64: Add `vsub.vx`

---
 cranelift/codegen/src/isa/riscv64/inst/mod.rs | 22 ++++-
 .../codegen/src/isa/riscv64/inst/vector.rs    | 28 ++++---
 .../codegen/src/isa/riscv64/inst_vector.isle  |  9 +++
 cranelift/codegen/src/isa/riscv64/lower.isle  |  3 +
 .../filetests/isa/riscv64/simd-isub.clif      | 40 +++++++++
 .../filetests/runtests/simd-isub-splat.clif   | 81 +++++++++++++++++++
 6 files changed, 168 insertions(+), 15 deletions(-)
 create mode 100644 cranelift/filetests/filetests/runtests/simd-isub-splat.clif

diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
index 0e662d8054bc..8124b3adb938 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -4,7 +4,7 @@
 #![allow(dead_code)]
 #![allow(non_camel_case_types)]
 
-use super::lower::isle::generated_code::{VecAMode, VecElementWidth};
+use super::lower::isle::generated_code::{VecAMode, VecElementWidth, VecOpCategory};
 use crate::binemit::{Addend, CodeOffset, Reloc};
 pub use crate::ir::condcodes::IntCC;
 use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, I8X16, R32, R64};
@@ -17,7 +17,7 @@ use crate::{settings, CodegenError, CodegenResult};
 pub use crate::ir::condcodes::FloatCC;
 
 use alloc::vec::Vec;
-use regalloc2::{PRegSet, VReg};
+use regalloc2::{PRegSet, RegClass, VReg};
 use smallvec::{smallvec, SmallVec};
 use std::boxed::Box;
 use std::string::{String, ToString};
@@ -624,7 +624,23 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
             // gen_prologue is called at emit stage.
             // no need let reg alloc know.
         }
-        &Inst::VecAluRRR { vd, vs1, vs2, .. } => {
+        &Inst::VecAluRRR {
+            op, vd, vs1, vs2, ..
+        } => {
+            debug_assert_eq!(vs2.class(), RegClass::Vector);
+            match op.category() {
+                VecOpCategory::OPIVV | VecOpCategory::OPFVV | VecOpCategory::OPMVV => {
+                    debug_assert_eq!(vs1.class(), RegClass::Vector);
+                }
+                VecOpCategory::OPIVX | VecOpCategory::OPMVX => {
+                    debug_assert_eq!(vs1.class(), RegClass::Int);
+                }
+                VecOpCategory::OPFVF => {
+                    debug_assert_eq!(vs1.class(), RegClass::Float);
+                }
+                _ => unreachable!(),
+            }
+
             collector.reg_use(vs1);
             collector.reg_use(vs2);
             collector.reg_def(vd);
diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index 34960f614ecf..72e21aa9b206 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -245,23 +245,13 @@ impl VecAluOpRRR {
         0x57
     }
     pub fn funct3(&self) -> u32 {
-        match self {
-            VecAluOpRRR::VaddVV
-            | VecAluOpRRR::VsubVV
-            | VecAluOpRRR::VandVV
-            | VecAluOpRRR::VorVV
-            | VecAluOpRRR::VxorVV => VecOpCategory::OPIVV,
-            VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => {
-                VecOpCategory::OPMVV
-            }
-        }
-        .encode()
+        self.category().encode()
     }
     pub fn funct6(&self) -> u32 {
         // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc
         match self {
             VecAluOpRRR::VaddVV => 0b000000,
-            VecAluOpRRR::VsubVV => 0b000010,
+            VecAluOpRRR::VsubVV | VecAluOpRRR::VsubVX => 0b000010,
             VecAluOpRRR::VmulVV => 0b100101,
             VecAluOpRRR::VmulhVV => 0b100111,
             VecAluOpRRR::VmulhuVV => 0b100100,
@@ -270,6 +260,20 @@ impl VecAluOpRRR {
             VecAluOpRRR::VxorVV => 0b001011,
         }
     }
+
+    pub fn category(&self) -> VecOpCategory {
+        match self {
+            VecAluOpRRR::VaddVV
+            | VecAluOpRRR::VsubVV
+            | VecAluOpRRR::VandVV
+            | VecAluOpRRR::VorVV
+            | VecAluOpRRR::VxorVV => VecOpCategory::OPIVV,
+            VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => {
+                VecOpCategory::OPMVV
+            }
+            VecAluOpRRR::VsubVX => VecOpCategory::OPIVX,
+        }
+    }
 }
 
 impl fmt::Display for VecAluOpRRR {
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
index 8b26dacc70dc..baac1872a363 100644
--- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -83,6 +83,7 @@
 
 ;; Register to Register ALU Ops
 (type VecAluOpRRR (enum
+  ;; Vector-Vector Opcodes
   (VaddVV)
   (VsubVV)
   (VmulVV)
@@ -91,6 +92,9 @@
   (VandVV)
   (VorVV)
   (VxorVV)
+
+  ;; Vector-Scalar Opcodes
+  (VsubVX)
 ))
 
 ;; Register-Imm ALU Ops
@@ -193,6 +197,11 @@
 (rule (rv_vsub_vv vs2 vs1 vstate)
   (vec_alu_rrr (VecAluOpRRR.VsubVV) vs2 vs1 vstate))
 
+;; Helper for emitting the `vsub.vx` instruction.
+(decl rv_vsub_vx (Reg Reg VState) Reg)
+(rule (rv_vsub_vx vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VsubVX) vs2 vs1 vstate))
+
 ;; Helper for emitting the `vmul.vv` instruction.
 (decl rv_vmul_vv (Reg Reg VState) Reg)
 (rule (rv_vmul_vv vs2 vs1 vstate)
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 207ea631d6b4..0f14752acfe0 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -140,6 +140,9 @@
 (rule 3 (lower (has_type (ty_vec_fits_in_register ty) (isub x y)))
   (rv_vsub_vv x y ty))
 
+(rule 4 (lower (has_type (ty_vec_fits_in_register ty) (isub x (splat y))))
+  (rv_vsub_vx x y ty))
+
 ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `i64` and smaller.
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif b/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif
index 550cddb7dd09..76720c12e325 100644
--- a/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif
@@ -169,3 +169,43 @@ block0(v0: i64x2, v1: i64x2):
 ;   addi sp, sp, 0x10
 ;   ret
 
+function %isub_splat_i64x2(i64x2, i64) -> i64x2 {
+block0(v0: i64x2, v1: i64):
+    v2 = splat.i64x2 v1
+    v3 = isub v0, v2
+    return v3
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vsub.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0xd7, 0x42, 0x15, 0x0a
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x82, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-isub-splat.clif b/cranelift/filetests/filetests/runtests/simd-isub-splat.clif
new file mode 100644
index 000000000000..df0433af1400
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-isub-splat.clif
@@ -0,0 +1,81 @@
+test interpret
+test run
+target aarch64
+target s390x
+target x86_64 has_sse41=false
+set enable_simd
+target x86_64
+target x86_64 skylake
+target riscv64 has_v
+
+
+function %isub_splat_i8x16(i8x16, i8) -> i8x16 {
+block0(v0: i8x16, v1: i8):
+    v2 = splat.i8x16 v1
+    v3 = isub v0, v2
+    return v3
+}
+; run: %isub_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], 22) == [-21 -20 -19 -18 -17 -16 -15 -14 -13 -12 -11 -10 -9 -8 -7 -6]
+
+function %isub_splat_i16x8(i16x8, i16) -> i16x8 {
+block0(v0: i16x8, v1: i16):
+    v2 = splat.i16x8 v1
+    v3 = isub v0, v2
+    return v3
+}
+; run: %isub_splat_i16x8([1 2 3 4 5 6 7 8], 22) == [-21 -20 -19 -18 -17 -16 -15 -14]
+
+function %isub_splat_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = splat.i32x4 v1
+    v3 = isub v0, v2
+    return v3
+}
+; run: %isub_splat_i32x4([1 2 3 4], 22) == [-21 -20 -19 -18]
+
+function %isub_splat_i64x2(i64x2, i64) -> i64x2 {
+block0(v0: i64x2, v1: i64):
+    v2 = splat.i64x2 v1
+    v3 = isub v0, v2
+    return v3
+}
+; run: %isub_splat_i64x2([1 2], 22) == [-21 -20]
+
+
+
+function %isub_splat_const_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 5
+    v2 = splat.i8x16 v1
+    v3 = isub v0, v2
+    return v3
+}
+; run: %isub_splat_const_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [-4 -3 -2 -1 0 1 2 3 4 5 6 7 8 9 10 11]
+
+function %isub_splat_const_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i16 -16
+    v2 = splat.i16x8 v1
+    v3 = isub v0, v2
+    return v3
+}
+; run: %isub_splat_const_i16x8([1 2 3 4 5 6 7 8]) == [17 18 19 20 21 22 23 24]
+
+function %isub_splat_const_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 15
+    v2 = splat.i32x4 v1
+    v3 = isub v0, v2
+    return v3
+}
+; run: %isub_splat_const_i32x4([1 2 3 4]) == [-14 -13 -12 -11]
+
+function %isub_splat_const_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i64 -5
+    v2 = splat.i64x2 v1
+    v3 = isub v0, v2
+    return v3
+}
+; run: %isub_splat_const_i64x2([1 2]) == [6 7]
+

From 1c9d067c7036b6aef5500dc62d028448871cf1f1 Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sat, 6 May 2023 09:59:41 +0100
Subject: [PATCH 4/8] riscv64: Add `vrsub.vx`

---
 .../codegen/src/isa/riscv64/inst/vector.rs    |  3 +-
 .../codegen/src/isa/riscv64/inst_vector.isle  |  6 +++
 cranelift/codegen/src/isa/riscv64/lower.isle  |  4 ++
 .../filetests/isa/riscv64/simd-isub.clif      | 40 +++++++++++++++++++
 .../filetests/runtests/simd-isub-splat.clif   | 34 ++++++++++++++++
 5 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index 72e21aa9b206..9656dc1d00f1 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -252,6 +252,7 @@ impl VecAluOpRRR {
         match self {
             VecAluOpRRR::VaddVV => 0b000000,
             VecAluOpRRR::VsubVV | VecAluOpRRR::VsubVX => 0b000010,
+            VecAluOpRRR::VrsubVX => 0b000011,
             VecAluOpRRR::VmulVV => 0b100101,
             VecAluOpRRR::VmulhVV => 0b100111,
             VecAluOpRRR::VmulhuVV => 0b100100,
@@ -271,7 +272,7 @@ impl VecAluOpRRR {
             VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => {
                 VecOpCategory::OPMVV
             }
-            VecAluOpRRR::VsubVX => VecOpCategory::OPIVX,
+            VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX => VecOpCategory::OPIVX,
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
index baac1872a363..75ac8312654b 100644
--- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -95,6 +95,7 @@
 
   ;; Vector-Scalar Opcodes
   (VsubVX)
+  (VrsubVX)
 ))
 
 ;; Register-Imm ALU Ops
@@ -202,6 +203,11 @@
 (rule (rv_vsub_vx vs2 vs1 vstate)
   (vec_alu_rrr (VecAluOpRRR.VsubVX) vs2 vs1 vstate))
 
+;; Helper for emitting the `vrsub.vx` instruction.
+(decl rv_vrsub_vx (Reg Reg VState) Reg)
+(rule (rv_vrsub_vx vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 vstate))
+
 ;; Helper for emitting the `vmul.vv` instruction.
 (decl rv_vmul_vv (Reg Reg VState) Reg)
 (rule (rv_vmul_vv vs2 vs1 vstate)
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 0f14752acfe0..ee4c415a7b43 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -143,6 +143,10 @@
 (rule 4 (lower (has_type (ty_vec_fits_in_register ty) (isub x (splat y))))
   (rv_vsub_vx x y ty))
 
+(rule 5 (lower (has_type (ty_vec_fits_in_register ty) (isub (splat x) y)))
+  (rv_vrsub_vx y x ty))
+
+
 ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `i64` and smaller.
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif b/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif
index 76720c12e325..9c7f08e2fa7e 100644
--- a/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-isub.clif
@@ -209,3 +209,43 @@ block0(v0: i64x2, v1: i64):
 ;   addi sp, sp, 0x10
 ;   ret
 
+function %isub_splat_reverse_i64x2(i64x2, i64) -> i64x2 {
+block0(v0: i64x2, v1: i64):
+    v2 = splat.i64x2 v1
+    v3 = isub v2, v0
+    return v3
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vrsub.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0xd7, 0x42, 0x15, 0x0e
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x82, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-isub-splat.clif b/cranelift/filetests/filetests/runtests/simd-isub-splat.clif
index df0433af1400..a24ee3a777cc 100644
--- a/cranelift/filetests/filetests/runtests/simd-isub-splat.clif
+++ b/cranelift/filetests/filetests/runtests/simd-isub-splat.clif
@@ -9,6 +9,40 @@ target x86_64 skylake
 target riscv64 has_v
 
 
+function %isub_splat_reverse_i8x16(i8x16, i8) -> i8x16 {
+block0(v0: i8x16, v1: i8):
+    v2 = splat.i8x16 v1
+    v3 = isub v2, v0
+    return v3
+}
+; run: %isub_splat_reverse_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], 22) == [21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6]
+
+function %isub_splat_reverse_i16x8(i16x8, i16) -> i16x8 {
+block0(v0: i16x8, v1: i16):
+    v2 = splat.i16x8 v1
+    v3 = isub v2, v0
+    return v3
+}
+; run: %isub_splat_reverse_i16x8([1 2 3 4 5 6 7 8], 22) == [21 20 19 18 17 16 15 14]
+
+function %isub_splat_reverse_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = splat.i32x4 v1
+    v3 = isub v2, v0
+    return v3
+}
+; run: %isub_splat_reverse_i32x4([1 2 3 4], 22) == [21 20 19 18]
+
+function %isub_splat_reverse_i64x2(i64x2, i64) -> i64x2 {
+block0(v0: i64x2, v1: i64):
+    v2 = splat.i64x2 v1
+    v3 = isub v2, v0
+    return v3
+}
+; run: %isub_splat_reverse_i64x2([1 2], 22) == [21 20]
+
+
+
 function %isub_splat_i8x16(i8x16, i8) -> i8x16 {
 block0(v0: i8x16, v1: i8):
     v2 = splat.i8x16 v1

From 5d2b42d41753507b3e2528e76b0aaf853bad804a Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sat, 6 May 2023 10:05:26 +0100
Subject: [PATCH 5/8] riscv64: Add `vrsub.vi`

---
 .../codegen/src/isa/riscv64/inst/vector.rs    |  1 +
 .../codegen/src/isa/riscv64/inst_vector.isle  |  6 +++
 cranelift/codegen/src/isa/riscv64/lower.isle  |  3 ++
 .../filetests/runtests/simd-isub-splat.clif   | 38 +++++++++++++++++++
 4 files changed, 48 insertions(+)

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index 9656dc1d00f1..f75f7e3ad3f5 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -298,6 +298,7 @@ impl VecAluOpRRImm5 {
         // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc
         match self {
             VecAluOpRRImm5::VaddVI => 0b000000,
+            VecAluOpRRImm5::VrsubVI => 0b000011,
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
index 75ac8312654b..0f65edc91853 100644
--- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -101,6 +101,7 @@
 ;; Register-Imm ALU Ops
 (type VecAluOpRRImm5 (enum
   (VaddVI)
+  (VrsubVI)
 ))
 
 
@@ -208,6 +209,11 @@
 (rule (rv_vrsub_vx vs2 vs1 vstate)
   (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 vstate))
 
+;; Helper for emitting the `vrsub.vi` instruction.
+(decl rv_vrsub_vi (Reg Imm5 VState) Reg)
+(rule (rv_vrsub_vi vs2 imm vstate)
+  (vec_alu_rr_imm5 (VecAluOpRRImm5.VrsubVI) vs2 imm vstate))
+
 ;; Helper for emitting the `vmul.vv` instruction.
 (decl rv_vmul_vv (Reg Reg VState) Reg)
 (rule (rv_vmul_vv vs2 vs1 vstate)
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index ee4c415a7b43..60c84969a87a 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -146,6 +146,9 @@
 (rule 5 (lower (has_type (ty_vec_fits_in_register ty) (isub (splat x) y)))
   (rv_vrsub_vx y x ty))
 
+(rule 6 (lower (has_type (ty_vec_fits_in_register ty) (isub (replicated_imm5 x) y)))
+  (rv_vrsub_vi y x ty))
+
 
 ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
diff --git a/cranelift/filetests/filetests/runtests/simd-isub-splat.clif b/cranelift/filetests/filetests/runtests/simd-isub-splat.clif
index a24ee3a777cc..975c2fadb006 100644
--- a/cranelift/filetests/filetests/runtests/simd-isub-splat.clif
+++ b/cranelift/filetests/filetests/runtests/simd-isub-splat.clif
@@ -113,3 +113,41 @@ block0(v0: i64x2):
 }
 ; run: %isub_splat_const_i64x2([1 2]) == [6 7]
 
+
+
+function %isub_splat_const_reverse_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 5
+    v2 = splat.i8x16 v1
+    v3 = isub v2, v0
+    return v3
+}
+; run: %isub_splat_const_reverse_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [4 3 2 1 0 -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11]
+
+function %isub_splat_const_reverse_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i16 -16
+    v2 = splat.i16x8 v1
+    v3 = isub v2, v0
+    return v3
+}
+; run: %isub_splat_const_reverse_i16x8([1 2 3 4 5 6 7 8]) == [-17 -18 -19 -20 -21 -22 -23 -24]
+
+function %isub_splat_const_reverse_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 15
+    v2 = splat.i32x4 v1
+    v3 = isub v2, v0
+    return v3
+}
+; run: %isub_splat_const_reverse_i32x4([1 2 3 4]) == [14 13 12 11]
+
+function %isub_splat_const_reverse_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i64 -5
+    v2 = splat.i64x2 v1
+    v3 = isub v2, v0
+    return v3
+}
+; run: %isub_splat_const_reverse_i64x2([1 2]) == [-6 -7]
+

From 84621b1e0b544bfecc359c136185662a045c4335 Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sat, 6 May 2023 10:24:11 +0100
Subject: [PATCH 6/8] riscv64: Add `vneg.v`

---
 cranelift/codegen/src/isa/riscv64/inst/mod.rs |   9 +-
 .../codegen/src/isa/riscv64/inst_vector.isle  |   5 +
 cranelift/codegen/src/isa/riscv64/lower.isle  |   7 +-
 .../filetests/isa/riscv64/simd-ineg.clif      | 159 ++++++++++++++++++
 .../filetests/runtests/simd-ineg.clif         |  23 +++
 5 files changed, 199 insertions(+), 4 deletions(-)
 create mode 100644 cranelift/filetests/filetests/isa/riscv64/simd-ineg.clif

diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
index 8124b3adb938..2f65d01e4d83 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -53,11 +53,11 @@ pub(crate) type VecWritableReg = Vec<Writable<Reg>>;
 //=============================================================================
 // Instructions (top level): definition
 
-use crate::isa::riscv64::lower::isle::generated_code::MInst;
 pub use crate::isa::riscv64::lower::isle::generated_code::{
     AluOPRRI, AluOPRRR, AtomicOP, FClassResult, FFlagsException, FloatRoundOP, FloatSelectOP,
     FpuOPRR, FpuOPRRR, FpuOPRRRR, IntSelectOP, LoadOP, MInst as Inst, StoreOP, FRM,
 };
+use crate::isa::riscv64::lower::isle::generated_code::{MInst, VecAluOpRRR};
 
 type BoxCallInfo = Box<CallInfo>;
 type BoxCallIndInfo = Box<CallIndInfo>;
@@ -1575,7 +1575,12 @@ impl Inst {
 
                 // Note: vs2 and vs1 here are opposite to the standard scalar ordering.
                 // This is noted in Section 10.1 of the RISC-V Vector spec.
-                format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate)
+                match (op, vs1) {
+                    (VecAluOpRRR::VrsubVX, vs1) if vs1 == zero_reg() => {
+                        format!("vneg.v {},{} {}", vd_s, vs2_s, vstate)
+                    }
+                    _ => format!("{} {},{},{} {}", op, vd_s, vs2_s, vs1_s, vstate),
+                }
             }
             &Inst::VecAluRRImm5 {
                 op,
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
index 0f65edc91853..a84a6c8af0a6 100644
--- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -209,6 +209,11 @@
 (rule (rv_vrsub_vx vs2 vs1 vstate)
   (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 vs1 vstate))
 
+;; Helper for emitting the `vneg.v` pseudo-instruction.
+(decl rv_vneg_v (Reg VState) Reg)
+(rule (rv_vneg_v vs2 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VrsubVX) vs2 (zero_reg) vstate))
+
 ;; Helper for emitting the `vrsub.vi` instruction.
 (decl rv_vrsub_vi (Reg Imm5 VState) Reg)
 (rule (rv_vrsub_vi vs2 imm vstate)
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 60c84969a87a..59d36e58ffb3 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -152,10 +152,13 @@
 
 ;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; `i64` and smaller.
-(rule (lower (has_type ty (ineg val)))
+(rule (lower (has_type (ty_int ty) (ineg val)))
   (neg ty val))
 
+(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (ineg x)))
+  (rv_vneg_v x ty))
+
+
 ;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule 0 (lower (has_type (ty_int_ref_scalar_64 ty) (imul x y)))
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-ineg.clif b/cranelift/filetests/filetests/isa/riscv64/simd-ineg.clif
new file mode 100644
index 000000000000..36aba8eb32a8
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-ineg.clif
@@ -0,0 +1,159 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+
+function %ineg_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = ineg v0
+    return v1
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vneg.v v4,v1 #avl=16, #vtype=(e8, m1, ta, ma)
+;   vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x42, 0x10, 0x0e
+;   .byte 0x27, 0x02, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %ineg_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = ineg v0
+    return v1
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vneg.v v4,v1 #avl=8, #vtype=(e16, m1, ta, ma)
+;   vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x84, 0xcc
+;   .byte 0x57, 0x42, 0x10, 0x0e
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x27, 0x02, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %ineg_i32x4(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = ineg v0
+    return v1
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vneg.v v4,v1 #avl=4, #vtype=(e32, m1, ta, ma)
+;   vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x02, 0xcd
+;   .byte 0x57, 0x42, 0x10, 0x0e
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x27, 0x02, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %ineg_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = ineg v0
+    return v1
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vneg.v v4,v1 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0x57, 0x42, 0x10, 0x0e
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0x27, 0x02, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-ineg.clif b/cranelift/filetests/filetests/runtests/simd-ineg.clif
index 4cc78bdf795b..ff26ea5c6521 100644
--- a/cranelift/filetests/filetests/runtests/simd-ineg.clif
+++ b/cranelift/filetests/filetests/runtests/simd-ineg.clif
@@ -4,6 +4,21 @@ target s390x
 set enable_simd
 target x86_64
 target x86_64 skylake
+target riscv64 has_v
+
+function %ineg_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = ineg v0
+    return v1
+}
+; run: %ineg_i8x16([-1 10 2 4 5 6 7 8 9 10 -11 -12 -13 -14 -15 -16]) == [1 -10 -2 -4 -5 -6 -7 -8 -9 -10 11 12 13 14 15 16]
+
+function %ineg_i16x8(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = ineg v0
+    return v1
+}
+; run: %ineg_i16x8([1 2 -3 -4 5 6 -7 -8]) == [-1 -2 3 4 -5 -6 7 8]
 
 function %ineg_i32x4(i32x4) -> i32x4 {
 block0(v0: i32x4):
@@ -11,3 +26,11 @@ block0(v0: i32x4):
     return v1
 }
 ; run: %ineg_i32x4([1 1 1 1]) == [-1 -1 -1 -1]
+; run: %ineg_i32x4([1 -9 1 -10]) == [-1 9 -1 10]
+
+function %ineg_i64x2(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = ineg v0
+    return v1
+}
+; run: %ineg_i64x2([99 -10]) == [-99 10]

From 6a5e40e536093d0931533af9b2bab6f0e8a427cb Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Sat, 6 May 2023 17:42:29 +0100
Subject: [PATCH 7/8] riscv64: Add `vadd.vx`

---
 .../codegen/src/isa/riscv64/inst/vector.rs    |   6 +-
 .../codegen/src/isa/riscv64/inst_vector.isle  |   6 +
 cranelift/codegen/src/isa/riscv64/lower.isle  |  10 +-
 .../filetests/isa/riscv64/simd-iadd.clif      | 158 ++++++++++++++++++
 .../filetests/runtests/simd-iadd-splat.clif   |  49 +++++-
 5 files changed, 217 insertions(+), 12 deletions(-)

diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index f75f7e3ad3f5..e94d7c435f9c 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -250,7 +250,7 @@ impl VecAluOpRRR {
     pub fn funct6(&self) -> u32 {
         // See: https://github.com/riscv/riscv-v-spec/blob/master/inst-table.adoc
         match self {
-            VecAluOpRRR::VaddVV => 0b000000,
+            VecAluOpRRR::VaddVV | VecAluOpRRR::VaddVX => 0b000000,
             VecAluOpRRR::VsubVV | VecAluOpRRR::VsubVX => 0b000010,
             VecAluOpRRR::VrsubVX => 0b000011,
             VecAluOpRRR::VmulVV => 0b100101,
@@ -272,7 +272,9 @@ impl VecAluOpRRR {
             VecAluOpRRR::VmulVV | VecAluOpRRR::VmulhVV | VecAluOpRRR::VmulhuVV => {
                 VecOpCategory::OPMVV
             }
-            VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX => VecOpCategory::OPIVX,
+            VecAluOpRRR::VaddVX | VecAluOpRRR::VsubVX | VecAluOpRRR::VrsubVX => {
+                VecOpCategory::OPIVX
+            }
         }
     }
 }
diff --git a/cranelift/codegen/src/isa/riscv64/inst_vector.isle b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
index a84a6c8af0a6..ee02f7b7c503 100644
--- a/cranelift/codegen/src/isa/riscv64/inst_vector.isle
+++ b/cranelift/codegen/src/isa/riscv64/inst_vector.isle
@@ -94,6 +94,7 @@
   (VxorVV)
 
   ;; Vector-Scalar Opcodes
+  (VaddVX)
   (VsubVX)
   (VrsubVX)
 ))
@@ -189,6 +190,11 @@
 (rule (rv_vadd_vv vs2 vs1 vstate)
   (vec_alu_rrr (VecAluOpRRR.VaddVV) vs2 vs1 vstate))
 
+;; Helper for emitting the `vadd.vx` instruction.
+(decl rv_vadd_vx (Reg Reg VState) Reg)
+(rule (rv_vadd_vx vs2 vs1 vstate)
+  (vec_alu_rrr (VecAluOpRRR.VaddVX) vs2 vs1 vstate))
+
 ;; Helper for emitting the `vadd.vi` instruction.
 (decl rv_vadd_vi (Reg Imm5 VState) Reg)
 (rule (rv_vadd_vi vs2 imm vstate)
diff --git a/cranelift/codegen/src/isa/riscv64/lower.isle b/cranelift/codegen/src/isa/riscv64/lower.isle
index 59d36e58ffb3..cf30e6e89b0f 100644
--- a/cranelift/codegen/src/isa/riscv64/lower.isle
+++ b/cranelift/codegen/src/isa/riscv64/lower.isle
@@ -110,10 +110,16 @@
 (rule 8 (lower (has_type (ty_vec_fits_in_register ty) (iadd x y)))
   (rv_vadd_vv x y ty))
 
-(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (replicated_imm5 y))))
+(rule 9 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (splat y))))
+  (rv_vadd_vx x y ty))
+
+(rule 10 (lower (has_type (ty_vec_fits_in_register ty) (iadd (splat x) y)))
+  (rv_vadd_vx y x ty))
+
+(rule 11 (lower (has_type (ty_vec_fits_in_register ty) (iadd x (replicated_imm5 y))))
   (rv_vadd_vi x y ty))
 
-(rule 10 (lower (has_type (ty_vec_fits_in_register ty) (iadd (replicated_imm5 x) y)))
+(rule 12 (lower (has_type (ty_vec_fits_in_register ty) (iadd (replicated_imm5 x) y)))
   (rv_vadd_vi y x ty))
 
 ;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;
diff --git a/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif b/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif
index f37d39bc19bf..dc5790303ced 100644
--- a/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif
+++ b/cranelift/filetests/filetests/isa/riscv64/simd-iadd.clif
@@ -331,3 +331,161 @@ block0(v0: i64x2):
 ;   addi sp, sp, 0x10
 ;   ret
 
+function %iadd_splat_i8x16(i8x16,  i8) -> i8x16 {
+block0(v0: i8x16, v1: i8):
+    v2 = splat.i8x16 v1
+    v3 = iadd v0, v2
+    return v3
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vadd.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma)
+;   vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0xd7, 0x42, 0x15, 0x02
+;   .byte 0xa7, 0x82, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %iadd_splat_i16x8(i16x8, i16) -> i16x8 {
+block0(v0: i16x8, v1: i16):
+    v2 = splat.i16x8 v1
+    v3 = iadd v0, v2
+    return v3
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vadd.vx v5,v1,a0 #avl=8, #vtype=(e16, m1, ta, ma)
+;   vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x84, 0xcc
+;   .byte 0xd7, 0x42, 0x15, 0x02
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x82, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %iadd_splat_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = splat.i32x4 v1
+    v3 = iadd v0, v2
+    return v3
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vadd.vx v5,v1,a0 #avl=4, #vtype=(e32, m1, ta, ma)
+;   vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x02, 0xcd
+;   .byte 0xd7, 0x42, 0x15, 0x02
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x82, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %iadd_splat_i64x2(i64x2, i64) -> i64x2 {
+block0(v0: i64x2, v1: i64):
+    v2 = splat.i64x2 v1
+    v3 = iadd v2, v0
+    return v3
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vadd.vx v5,v1,a0 #avl=2, #vtype=(e64, m1, ta, ma)
+;   vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+; 
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x70, 0x81, 0xcd
+;   .byte 0xd7, 0x42, 0x15, 0x02
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   .byte 0xa7, 0x82, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
diff --git a/cranelift/filetests/filetests/runtests/simd-iadd-splat.clif b/cranelift/filetests/filetests/runtests/simd-iadd-splat.clif
index 2fa55bc142aa..bebad9eb6383 100644
--- a/cranelift/filetests/filetests/runtests/simd-iadd-splat.clif
+++ b/cranelift/filetests/filetests/runtests/simd-iadd-splat.clif
@@ -8,38 +8,71 @@ target x86_64
 target x86_64 skylake
 target riscv64 has_v
 
-function %iadd_splat_i8x16(i8x16) -> i8x16 {
+function %iadd_splat_const_i8x16(i8x16) -> i8x16 {
 block0(v0: i8x16):
     v1 = iconst.i8 5
     v2 = splat.i8x16 v1
     v3 = iadd v0, v2
     return v3
 }
-; run: %iadd_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21]
+; run: %iadd_splat_const_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21]
 
-function %iadd_splat_i16x8(i16x8) -> i16x8 {
+function %iadd_splat_const_i16x8(i16x8) -> i16x8 {
 block0(v0: i16x8):
     v1 = iconst.i16 -16
     v2 = splat.i16x8 v1
     v3 = iadd v0, v2
     return v3
 }
-; run: %iadd_splat_i16x8([1 2 3 4 5 6 7 8]) == [-15 -14 -13 -12 -11 -10 -9 -8]
+; run: %iadd_splat_const_i16x8([1 2 3 4 5 6 7 8]) == [-15 -14 -13 -12 -11 -10 -9 -8]
 
-function %iadd_splat_i32x4(i32x4) -> i32x4 {
+function %iadd_splat_const_i32x4(i32x4) -> i32x4 {
 block0(v0: i32x4):
     v1 = iconst.i32 15
     v2 = splat.i32x4 v1
     v3 = iadd v0, v2
     return v3
 }
-; run: %iadd_splat_i32x4([1 2 3 4]) == [16 17 18 19]
+; run: %iadd_splat_const_i32x4([1 2 3 4]) == [16 17 18 19]
 
-function %iadd_splat_i64x2(i64x2) -> i64x2 {
+function %iadd_splat_const_i64x2(i64x2) -> i64x2 {
 block0(v0: i64x2):
     v1 = iconst.i64 -5
     v2 = splat.i64x2 v1
     v3 = iadd v2, v0
     return v3
 }
-; run: %iadd_splat_i64x2([1 2]) == [-4 -3]
+; run: %iadd_splat_const_i64x2([1 2]) == [-4 -3]
+
+
+function %iadd_splat_i8x16(i8x16,  i8) -> i8x16 {
+block0(v0: i8x16, v1: i8):
+    v2 = splat.i8x16 v1
+    v3 = iadd v0, v2
+    return v3
+}
+; run: %iadd_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], -15) == [-14 -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1]
+
+function %iadd_splat_i16x8(i16x8, i16) -> i16x8 {
+block0(v0: i16x8, v1: i16):
+    v2 = splat.i16x8 v1
+    v3 = iadd v0, v2
+    return v3
+}
+; run: %iadd_splat_i16x8([1 2 3 4 5 6 7 8], -10) == [-9 -8 -7 -6 -5 -4 -3 -2]
+
+function %iadd_splat_i32x4(i32x4, i32) -> i32x4 {
+block0(v0: i32x4, v1: i32):
+    v2 = splat.i32x4 v1
+    v3 = iadd v0, v2
+    return v3
+}
+; run: %iadd_splat_i32x4([1 2 3 4], 22) == [23 24 25 26]
+
+function %iadd_splat_i64x2(i64x2, i64) -> i64x2 {
+block0(v0: i64x2, v1: i64):
+    v2 = splat.i64x2 v1
+    v3 = iadd v2, v0
+    return v3
+}
+; run: %iadd_splat_i64x2([1 2], 10) == [11 12]

From d905232a5274388beb3d664fe75d478a674cdd83 Mon Sep 17 00:00:00 2001
From: Afonso Bordado <afonsobordado@az8.co>
Date: Wed, 10 May 2023 12:08:06 +0100
Subject: [PATCH 8/8] riscv64: Refactor Inst RegClass asserts

---
 cranelift/codegen/src/isa/riscv64/inst/mod.rs | 19 ++++++-------------
 .../codegen/src/isa/riscv64/inst/vector.rs    | 11 +++++++++++
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/cranelift/codegen/src/isa/riscv64/inst/mod.rs b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
index 2f65d01e4d83..d654e77765a6 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/mod.rs
@@ -4,7 +4,7 @@
 #![allow(dead_code)]
 #![allow(non_camel_case_types)]
 
-use super::lower::isle::generated_code::{VecAMode, VecElementWidth, VecOpCategory};
+use super::lower::isle::generated_code::{VecAMode, VecElementWidth};
 use crate::binemit::{Addend, CodeOffset, Reloc};
 pub use crate::ir::condcodes::IntCC;
 use crate::ir::types::{self, F32, F64, I128, I16, I32, I64, I8, I8X16, R32, R64};
@@ -627,25 +627,18 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
         &Inst::VecAluRRR {
             op, vd, vs1, vs2, ..
         } => {
+            debug_assert_eq!(vd.to_reg().class(), RegClass::Vector);
             debug_assert_eq!(vs2.class(), RegClass::Vector);
-            match op.category() {
-                VecOpCategory::OPIVV | VecOpCategory::OPFVV | VecOpCategory::OPMVV => {
-                    debug_assert_eq!(vs1.class(), RegClass::Vector);
-                }
-                VecOpCategory::OPIVX | VecOpCategory::OPMVX => {
-                    debug_assert_eq!(vs1.class(), RegClass::Int);
-                }
-                VecOpCategory::OPFVF => {
-                    debug_assert_eq!(vs1.class(), RegClass::Float);
-                }
-                _ => unreachable!(),
-            }
+            debug_assert_eq!(vs1.class(), op.vs1_regclass());
 
             collector.reg_use(vs1);
             collector.reg_use(vs2);
             collector.reg_def(vd);
         }
         &Inst::VecAluRRImm5 { vd, vs2, .. } => {
+            debug_assert_eq!(vd.to_reg().class(), RegClass::Vector);
+            debug_assert_eq!(vs2.class(), RegClass::Vector);
+
             collector.reg_use(vs2);
             collector.reg_def(vd);
         }
diff --git a/cranelift/codegen/src/isa/riscv64/inst/vector.rs b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
index e94d7c435f9c..603da8690ea4 100644
--- a/cranelift/codegen/src/isa/riscv64/inst/vector.rs
+++ b/cranelift/codegen/src/isa/riscv64/inst/vector.rs
@@ -4,6 +4,7 @@ use crate::isa::riscv64::lower::isle::generated_code::{
     VecAMode, VecAluOpRRImm5, VecAluOpRRR, VecAvl, VecElementWidth, VecLmul, VecMaskMode,
     VecOpCategory, VecOpMasking, VecTailMode,
 };
+use crate::machinst::RegClass;
 use crate::Reg;
 use core::fmt;
 
@@ -277,6 +278,16 @@ impl VecAluOpRRR {
             }
         }
     }
+
+    /// Register class `vs1` must have for this op's category; `vs2` is always a vector register.
+    pub fn vs1_regclass(&self) -> RegClass {
+        match self.category() {
+            VecOpCategory::OPIVV | VecOpCategory::OPFVV | VecOpCategory::OPMVV => RegClass::Vector,
+            VecOpCategory::OPIVX | VecOpCategory::OPMVX => RegClass::Int,
+            VecOpCategory::OPFVF => RegClass::Float,
+            _ => unreachable!(),
+        }
+    }
 }
 
 impl fmt::Display for VecAluOpRRR {