riscv64: Add remaining Zfa Instructions (#8582)

* riscv64: Add `fround` instruction
* riscv64: Remove unused load_fp functions
* riscv64: Add support for `fli` instruction
* riscv64: Add negated `fli` rules
bytecodealliance · May 8, 2024 · d89e2b3 · d89e2b3
1 parent d51b5ae
commit d89e2b3
Show file tree

Hide file tree

Showing 14 changed files with 975 additions and 53 deletions.
diff --git a/cranelift/codegen/src/isa/riscv64/inst.isle b/cranelift/codegen/src/isa/riscv64/inst.isle
@@ -19,6 +19,11 @@
       (rd WritableReg)
       (imm Imm20))
 
+    (Fli
+      (ty Type)
+      (imm FliConstant)
+      (rd WritableReg))
+
     ;; An ALU operation with one register sources and a register destination.
     (FpuRR
       (alu_op FpuOPRR)
@@ -411,6 +416,8 @@
   (QNaN)
 ))
 
+(type FliConstant (primitive FliConstant))
+
 (type FpuOPRR (enum
   ;; RV32F Standard Extension
   (FsqrtS)
@@ -447,8 +454,10 @@
   (FcvtWuD)
   (FcvtDW)
   (FcvtDWU)
-  ;; bitmapip
 
+  ;; Zfa Extension
+  (FroundS)
+  (FroundD)
 ))
 
 (type LoadOP (enum
@@ -1549,6 +1558,20 @@
 (rule (rv_fmaxm $F32 rs1 rs2) (fpu_rrr (FpuOPRRR.FmaxmS) $F32 (FRM.RUP) rs1 rs2))
 (rule (rv_fmaxm $F64 rs1 rs2) (fpu_rrr (FpuOPRRR.FmaxmD) $F64 (FRM.RUP) rs1 rs2))
 
+;; Helper for emitting the `fround` instruction (Zfa extension).
+;;
+;; Dispatches on the float type: `$F32` selects `FpuOPRR.FroundS` and `$F64`
+;; selects `FpuOPRR.FroundD`. The `frm` and `rs` arguments are forwarded to
+;; `fpu_rr` unchanged. No rule is provided for any other type.
+(decl rv_fround (Type FRM FReg) FReg)
+(rule (rv_fround $F32 frm rs) (fpu_rr (FpuOPRR.FroundS) $F32 frm rs))
+(rule (rv_fround $F64 frm rs) (fpu_rr (FpuOPRR.FroundD) $F64 frm rs))
+
+;; Emits an `MInst.Fli` for the constant selected by `imm` and returns the
+;; float register it writes. The destination is a temporary obtained from
+;; `temp_writable_freg`.
+(decl rv_fli (Type FliConstant) FReg)
+(rule (rv_fli ty imm)
+      (let ((rd WritableFReg (temp_writable_freg))
+            (_ Unit (emit (MInst.Fli ty imm rd))))
+        rd))
+
 ;; `Zba` Extension Instructions
 
 ;; Helper for emitting the `adduw` ("Add Unsigned Word") instruction.
@@ -1778,6 +1801,14 @@
 
 
 
+;; Helper for generating a FliConstant from a u64 constant of the given type.
+;;
+;; This is a partial constructor implemented externally (outside ISLE), so it
+;; may fail to match. NOTE(review): presumably it fails when the constant has
+;; no `fli` encoding -- confirm against the external implementation.
+(decl pure partial fli_constant_from_u64 (Type u64) FliConstant)
+(extern constructor fli_constant_from_u64 fli_constant_from_u64)
+
+;; Helper for generating a FliConstant from the negation of a u64 constant.
+;;
+;; Also a partial, externally implemented constructor. NOTE(review):
+;; presumably the returned FliConstant denotes the negated value, so callers
+;; must negate the loaded register to recover the original constant -- confirm
+;; against the external implementation.
+(decl pure partial fli_constant_from_negated_u64 (Type u64) FliConstant)
+(extern constructor fli_constant_from_negated_u64 fli_constant_from_negated_u64)
+
 ;; Helper for generating a i64 from a pair of Imm20 and Imm12 constants
 (decl i64_generate_imm (Imm20 Imm12) i64)
 (extern extractor i64_generate_imm i64_generate_imm)
@@ -1795,14 +1826,30 @@
 ;; TODO: Load floats using `fld` instead of `ld`
 (decl imm (Type u64) Reg)
 
-;; Refs get loaded as integers.
-(rule 5 (imm $R32 c) (imm $I32 c))
-(rule 5 (imm $R64 c) (imm $I64 c))
+;; With Zfa enabled, a float constant that has an `fli` encoding is loaded
+;; straight into a float register with the `fli` instruction.
+(rule 7 (imm (ty_scalar_float ty) c)
+  (if-let $true (has_zfa))
+  (if-let fli_const (fli_constant_from_u64 ty c))
+  (rv_fli ty fli_const))
 
-;; Floats get loaded as integers and then moved into an F register.
+;; When only the negation of the constant has an `fli` encoding, it is still
+;; worthwhile to load that negated value with `fli` and then flip its sign in
+;; a float register.
+;;
+;; For f64's this saves one instruction, and for f32's it avoids
+;; having to allocate an integer register, reducing integer register pressure.
+(rule 6 (imm (ty_scalar_float ty) c)
+  (if-let $true (has_zfa))
+  (if-let fli_const (fli_constant_from_negated_u64 ty c))
+  (let ((negated FReg (rv_fli ty fli_const)))
+    (rv_fneg ty negated)))
+
+;; Otherwise floats get loaded as integers and then moved into an F register.
 (rule 5 (imm $F32 c) (gen_bitcast (imm $I32 c) $I32 $F32))
 (rule 5 (imm $F64 c) (gen_bitcast (imm $I64 c) $I64 $F64))
 
+;; Refs get loaded as integers.
+(rule 5 (imm $R32 c) (imm $I32 c))
+(rule 5 (imm $R64 c) (imm $I64 c))
+
 ;; Try to match just an imm12
 (rule 4 (imm (ty_int ty) c)
   (if-let (i64_generate_imm (imm20_is_zero) imm12) (i64_sextend_u64 ty c))
@@ -2470,7 +2517,7 @@
 (rule (float_round_fcvt $F64 frm rs) (rv_fcvtdl frm (rv_fcvtld frm rs)))
 
 (decl gen_float_round (FRM FReg Type) FReg)
-(rule (gen_float_round frm rs ty)
+(rule 0 (gen_float_round frm rs ty)
   (let (;; if rs is NaN/+-Infinity/+-Zero or if the exponent is larger than # of bits
         ;; in mantissa, the result is the same as src, check for these cases first.
         (max FReg (imm ty (float_int_max ty)))
@@ -2491,6 +2538,10 @@
     ;; Check if the value cannot be rounded exactly and return the source input if so
     (gen_select_freg (cmp_eqz exact) corrected_nan rounded)))
 
+;; With Zfa we can use the dedicated `fround` instruction.
+;;
+;; This rule has priority 1, so whenever `has_zfa` holds it is chosen ahead of
+;; the priority 0 rule for `gen_float_round`, which expands the rounding into
+;; a multi-instruction sequence.
+(rule 1 (gen_float_round frm rs ty)
+  (if-let $true (has_zfa))
+  (rv_fround ty frm rs))
 
 
 

diff --git a/cranelift/codegen/src/isa/riscv64/inst/args.rs b/cranelift/codegen/src/isa/riscv64/inst/args.rs
@@ -312,6 +312,116 @@ impl IntegerCompare {
     }
 }
 
+/// Index of one of the 32 constants that the Zfa `fli` instruction can load.
+///
+/// The wrapped value is the table index in the range `0..=31`; see
+/// [`FliConstant::format`] for the constant that each index denotes.
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct FliConstant(u8);
+
+impl FliConstant {
+    /// Wraps a raw table index.
+    ///
+    /// Indices above 31 are invalid. This is only checked in debug builds.
+    pub(crate) fn new(value: u8) -> Self {
+        debug_assert!(value <= 31, "Invalid FliConstant: {}", value);
+        Self(value)
+    }
+
+    /// Looks up the table index for a constant of type `ty` whose raw bit
+    /// pattern is `imm`.
+    ///
+    /// Returns `None` when the value is not one of the loadable constants, or
+    /// when `ty` is neither `F32` nor `F64`.
+    pub(crate) fn maybe_from_u64(ty: Type, imm: u64) -> Option<Self> {
+        // Convert the value into an F64, this allows us to represent
+        // values from both f32 and f64 in the same value.
+        let value = match ty {
+            F32 => f32::from_bits(imm as u32) as f64,
+            F64 => f64::from_bits(imm),
+            // This is a fallible lookup: a type we do not know how to decode
+            // simply has no `fli` constant, so report that instead of panicking.
+            _ => return None,
+        };
+
+        Some(match (ty, value) {
+            (_, f) if f == -1.0 => Self::new(0),
+
+            // Since f64 can represent all f32 values, f32::min_positive won't be
+            // the same as f64::min_positive, so we need to check for both independently
+            (F32, f) if f == (f32::MIN_POSITIVE as f64) => Self::new(1),
+            (F64, f) if f == f64::MIN_POSITIVE => Self::new(1),
+
+            (_, f) if f == 2.0f64.powi(-16) => Self::new(2),
+            (_, f) if f == 2.0f64.powi(-15) => Self::new(3),
+            (_, f) if f == 2.0f64.powi(-8) => Self::new(4),
+            (_, f) if f == 2.0f64.powi(-7) => Self::new(5),
+            (_, f) if f == 0.0625 => Self::new(6),
+            (_, f) if f == 0.125 => Self::new(7),
+            (_, f) if f == 0.25 => Self::new(8),
+            (_, f) if f == 0.3125 => Self::new(9),
+            (_, f) if f == 0.375 => Self::new(10),
+            (_, f) if f == 0.4375 => Self::new(11),
+            (_, f) if f == 0.5 => Self::new(12),
+            (_, f) if f == 0.625 => Self::new(13),
+            (_, f) if f == 0.75 => Self::new(14),
+            (_, f) if f == 0.875 => Self::new(15),
+            (_, f) if f == 1.0 => Self::new(16),
+            (_, f) if f == 1.25 => Self::new(17),
+            (_, f) if f == 1.5 => Self::new(18),
+            (_, f) if f == 1.75 => Self::new(19),
+            (_, f) if f == 2.0 => Self::new(20),
+            (_, f) if f == 2.5 => Self::new(21),
+            (_, f) if f == 3.0 => Self::new(22),
+            (_, f) if f == 4.0 => Self::new(23),
+            (_, f) if f == 8.0 => Self::new(24),
+            (_, f) if f == 16.0 => Self::new(25),
+            (_, f) if f == 128.0 => Self::new(26),
+            (_, f) if f == 256.0 => Self::new(27),
+            (_, f) if f == 32768.0 => Self::new(28),
+            (_, f) if f == 65536.0 => Self::new(29),
+            (_, f) if f == f64::INFINITY => Self::new(30),
+
+            // NaN's are not guaranteed to preserve the sign / payload bits, so we need to check
+            // the original bits directly.
+            (F32, f) if f.is_nan() && imm == 0x7fc0_0000 => Self::new(31), // Canonical NaN
+            (F64, f) if f.is_nan() && imm == 0x7ff8_0000_0000_0000 => Self::new(31), // Canonical NaN
+            _ => return None,
+        })
+    }
+
+    /// Returns the text used for this constant when pretty-printing the
+    /// instruction.
+    ///
+    /// Panics if the wrapped index is greater than 31.
+    pub(crate) fn format(self) -> &'static str {
+        // The preferred assembly syntax for entries 1, 30, and 31 is min, inf, and nan, respectively.
+        // For entries 0 through 29 (including entry 1), the assembler will accept decimal constants
+        // in C-like syntax. Entries 2 through 5 are printed here as powers of two rather than
+        // as decimal constants.
+        match self.0 {
+            0 => "-1.0",
+            1 => "min",
+            2 => "2^-16",
+            3 => "2^-15",
+            4 => "2^-8",
+            5 => "2^-7",
+            6 => "0.0625",
+            7 => "0.125",
+            8 => "0.25",
+            9 => "0.3125",
+            10 => "0.375",
+            11 => "0.4375",
+            12 => "0.5",
+            13 => "0.625",
+            14 => "0.75",
+            15 => "0.875",
+            16 => "1.0",
+            17 => "1.25",
+            18 => "1.5",
+            19 => "1.75",
+            20 => "2.0",
+            21 => "2.5",
+            22 => "3.0",
+            23 => "4.0",
+            24 => "8.0",
+            25 => "16.0",
+            26 => "128.0",
+            27 => "256.0",
+            28 => "32768.0",
+            29 => "65536.0",
+            30 => "inf",
+            31 => "nan",
+            _ => panic!("Invalid FliConstant"),
+        }
+    }
+
+    /// Returns the raw table index.
+    pub(crate) fn bits(self) -> u8 {
+        self.0
+    }
+}
+
 impl FpuOPRRRR {
     pub(crate) fn op_name(self) -> &'static str {
         match self {
@@ -376,6 +486,8 @@ impl FpuOPRR {
             Self::FcvtWuD => "fcvt.wu.d",
             Self::FcvtDW => "fcvt.d.w",
             Self::FcvtDWU => "fcvt.d.wu",
+            Self::FroundS => "fround.s",
+            Self::FroundD => "fround.d",
         }
     }
 
@@ -392,14 +504,6 @@ impl FpuOPRR {
             _ => false,
         }
     }
-    // move from x register to float register.
-    pub(crate) fn move_x_to_f_op(ty: Type) -> Self {
-        match ty {
-            F32 => Self::FmvWX,
-            F64 => Self::FmvDX,
-            _ => unreachable!("ty:{:?}", ty),
-        }
-    }
 
     pub(crate) fn op_code(self) -> u32 {
         match self {
@@ -428,7 +532,9 @@ impl FpuOPRR {
             | FpuOPRR::FcvtWD
             | FpuOPRR::FcvtWuD
             | FpuOPRR::FcvtDW
-            | FpuOPRR::FcvtDWU => 0b1010011,
+            | FpuOPRR::FcvtDWU
+            | FpuOPRR::FroundS
+            | FpuOPRR::FroundD => 0b1010011,
         }
     }
 
@@ -460,6 +566,8 @@ impl FpuOPRR {
             FpuOPRR::FcvtDW => 0b00000,
             FpuOPRR::FcvtDWU => 0b00001,
             FpuOPRR::FsqrtD => 0b00000,
+            FpuOPRR::FroundS => 0b00100,
+            FpuOPRR::FroundD => 0b00100,
         }
     }
     pub(crate) fn funct7(self) -> u32 {
@@ -482,8 +590,8 @@ impl FpuOPRR {
             FpuOPRR::FcvtDL => 0b1101001,
             FpuOPRR::FcvtDLu => 0b1101001,
             FpuOPRR::FmvDX => 0b1111001,
-            FpuOPRR::FcvtSD => 0b0100000,
-            FpuOPRR::FcvtDS => 0b0100001,
+            FpuOPRR::FcvtSD | FpuOPRR::FroundS => 0b0100000,
+            FpuOPRR::FcvtDS | FpuOPRR::FroundD => 0b0100001,
             FpuOPRR::FclassD => 0b1110001,
             FpuOPRR::FcvtWD => 0b1100001,
             FpuOPRR::FcvtWuD => 0b1100001,

diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit.rs b/cranelift/codegen/src/isa/riscv64/inst/emit.rs
@@ -149,6 +149,7 @@ impl Inst {
             | Inst::Nop4
             | Inst::BrTable { .. }
             | Inst::Auipc { .. }
+            | Inst::Fli { .. }
             | Inst::Lui { .. }
             | Inst::LoadInlineConst { .. }
             | Inst::AluRRR { .. }
@@ -875,6 +876,9 @@ impl Inst {
                 let x: u32 = 0b0110111 | reg_to_gpr_num(rd.to_reg()) << 7 | (imm.bits() << 12);
                 sink.put4(x);
             }
+            &Inst::Fli { rd, ty, imm } => {
+                sink.put4(encode_fli(ty, imm, rd));
+            }
             &Inst::LoadInlineConst { rd, ty, imm } => {
                 let data = &imm.to_le_bytes()[..ty.bytes() as usize];
 

diff --git a/cranelift/codegen/src/isa/riscv64/inst/emit_tests.rs b/cranelift/codegen/src/isa/riscv64/inst/emit_tests.rs
@@ -2084,6 +2084,26 @@ fn test_riscv64_binemit() {
         0x22b59553,
     ));
 
+    insns.push(TestUnit::new(
+        Inst::Fli {
+            ty: F32,
+            rd: writable_fa0(),
+            imm: FliConstant::new(0),
+        },
+        "fli.s fa0,-1.0",
+        0xf0100553,
+    ));
+
+    insns.push(TestUnit::new(
+        Inst::Fli {
+            ty: F64,
+            rd: writable_fa0(),
+            imm: FliConstant::new(13),
+        },
+        "fli.d fa0,0.625",
+        0xf2168553,
+    ));
+
     let (flags, isa_flags) = make_test_flags();
     let emit_info = EmitInfo::new(flags, isa_flags);
 

diff --git a/cranelift/codegen/src/isa/riscv64/inst/encode.rs b/cranelift/codegen/src/isa/riscv64/inst/encode.rs
@@ -652,3 +652,24 @@ pub fn encode_zcbmem_load(op: ZcbMemOp, rd: WritableReg, base: Reg, imm: Uimm2)
 pub fn encode_zcbmem_store(op: ZcbMemOp, src: Reg, base: Reg, imm: Uimm2) -> u16 {
     encode_zcbmem_bits(op, src, base, imm)
 }
+
+/// Encodes an `fli.s` (`F32`) or `fli.d` (`F64`) instruction that loads the
+/// constant selected by `imm` into `rd`.
+///
+/// Panics via `unreachable!` if `ty` is neither `F32` nor `F64`.
+pub fn encode_fli(ty: Type, imm: FliConstant, rd: WritableReg) -> u32 {
+    // FLI.{S,D} reuses the opcode and funct7 of the FMV.{W,D}.X instruction.
+    // The rs1 field holds the index of the constant to be loaded, and the rs2
+    // field is fixed to 1.
+    let op = match ty {
+        F32 => FpuOPRR::FmvWX,
+        F64 => FpuOPRR::FmvDX,
+        _ => unreachable!(),
+    };
+    let frm = 0; // FRM is hard coded to 0 in both instructions
+    let rs2 = 1; // rs2 set to 1 is what differentiates FLI from FMV
+
+    let mut bits = 0;
+    bits |= unsigned_field_width(op.op_code(), 7);
+    bits |= reg_to_gpr_num(rd.to_reg()) << 7;
+    bits |= unsigned_field_width(frm, 3) << 12;
+    // The constant index occupies the rs1 field (bits 15..=19).
+    bits |= unsigned_field_width(imm.bits() as u32, 5) << 15;
+    // rs2 is a 5 bit field (bits 20..=24); bit 25 is where funct7 starts.
+    bits |= unsigned_field_width(rs2, 5) << 20;
+    bits |= unsigned_field_width(op.funct7(), 7) << 25;
+    bits
+}