From 9bbf3c21220bdabb13c7e43e8f18c343b36276e0 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 13 Apr 2021 11:34:23 -0700
Subject: [PATCH] x64: refactor EVEX encodings to use `EvexInstruction`

This change replaces the `encode_evex` function with a builder-style struct, `EvexInstruction`. This approach clarifies the code, adds documentation, and results in slight speedups when benchmarked.
---
 cranelift/codegen/src/isa/x64/inst/emit.rs    |  30 +-
 .../codegen/src/isa/x64/inst/encoding/evex.rs | 337 ++++++++++++++----
 .../codegen/src/isa/x64/inst/encoding/mod.rs  |  20 ++
 .../codegen/src/isa/x64/inst/encoding/rex.rs  |  65 +++-
 4 files changed, 365 insertions(+), 87 deletions(-)

diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 2d14b9fe5d00..a35bbe0a9994 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -6,11 +6,11 @@ use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
 use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel};
 use core::convert::TryInto;
-use cranelift_codegen_shared::isa::x86::EncodingBits;
-use encoding::evex::{encode_evex, EvexContext, EvexMasking};
+use encoding::evex::{EvexInstruction, EvexVectorLength};
 use encoding::rex::{
     emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
-    low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, RexFlags,
+    low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap,
+    RexFlags,
 };
 use log::debug;
 use regalloc::{Reg, Writable};
@@ -1407,21 +1407,21 @@ pub(crate) fn emit(
         }
 
         Inst::XmmUnaryRmREvex { op, src, dst } => {
-            let bits = match op {
-                &Avx512Opcode::Vpabsq => EncodingBits::new(&[0x66, 0x0f, 0x38, 0x1f], 0, 1),
+            let opcode = match op {
+                Avx512Opcode::Vpabsq => 0x1f,
             };
             match src {
-                RegMem::Reg { reg: src } => encode_evex(
-                    bits,
-                    dst.to_reg().get_hw_encoding(),
-                    0,
-                    src.get_hw_encoding(),
-                    EvexContext::v128(),
-                    EvexMasking::default(),
-                    sink,
-                ),
+                RegMem::Reg { reg: src } => EvexInstruction::new()
+                    .length(EvexVectorLength::V128)
+                    .prefix(LegacyPrefixes::_66)
+                    .map(OpcodeMap::_0F38)
+                    .w(true)
+                    .opcode(opcode)
+                    .reg(dst.to_reg().get_hw_encoding())
+                    .rm(src.get_hw_encoding())
+                    .encode(sink),
                 _ => todo!(),
-            }
+            };
         }
 
         Inst::XmmRmR {
diff --git a/cranelift/codegen/src/isa/x64/inst/encoding/evex.rs b/cranelift/codegen/src/isa/x64/inst/encoding/evex.rs
index 6dc6dfd89eed..0a113c19dbc3 100644
--- a/cranelift/codegen/src/isa/x64/inst/encoding/evex.rs
+++ b/cranelift/codegen/src/isa/x64/inst/encoding/evex.rs
@@ -1,64 +1,214 @@
-//! Encodes EVEX instructions. These instructions are those added by the AVX-512 extensions.
-use super::rex::encode_modrm;
+//! Encodes EVEX instructions. These instructions are those added by the AVX-512 extensions. The
+//! EVEX encoding requires a 4-byte prefix:
+//!
+//! Byte 0:  0x62
+//!         ┌───┬───┬───┬───┬───┬───┬───┬───┐
+//! Byte 1: │ R │ X │ B │ R'│ 0 │ 0 │ m │ m │
+//!         ├───┼───┼───┼───┼───┼───┼───┼───┤
+//! Byte 2: │ W │ v │ v │ v │ v │ 1 │ p │ p │
+//!         ├───┼───┼───┼───┼───┼───┼───┼───┤
+//! Byte 3: │ z │ L'│ L │ b │ V'│ a │ a │ a │
+//!         └───┴───┴───┴───┴───┴───┴───┴───┘
+//!
+//! The prefix is then followed by the opcode byte, the ModR/M byte, and other optional suffixes
+//! (e.g. SIB byte, displacements, immediates) based on the instruction (see section 2.6, Intel
+//! Software Development Manual, volume 2A).
+use super::rex::{encode_modrm, LegacyPrefixes, OpcodeMap};
 use super::CodeSink;
-use cranelift_codegen_shared::isa::x86::EncodingBits;
-
-/// Encode an EVEX instruction, including the prefixes, the instruction opcode, and the ModRM byte.
-/// This EVEX encoding function only encodes the `reg` (operand 1), `vvvv` (operand 2), `rm`
-/// (operand 3) form; other forms are possible (see section 2.6.2, Intel Software Development
-/// Manual, volume 2A), requiring refactoring of this function or separate functions for each form
-/// (e.g. as for the REX prefix).
-pub fn encode_evex<CS: CodeSink + ?Sized>(
-    enc: EncodingBits,
-    reg: impl Into<Register>,
-    vvvvv: impl Into<Register>,
-    rm: impl Into<Register>,
-    context: EvexContext,
-    masking: EvexMasking,
-    sink: &mut CS,
-) {
-    let reg = reg.into();
-    let rm = rm.into();
-    let vvvvv = vvvvv.into();
-
-    // EVEX prefix.
-    sink.put1(0x62);
-
-    debug_assert!(enc.mm() < 0b100);
-    let mut p0 = enc.mm() & 0b11;
-    p0 |= evex2(rm, reg) << 4; // bits 3:2 are always unset
-    sink.put1(p0);
-
-    let mut p1 = enc.pp() | 0b100; // bit 2 is always set
-    p1 |= (!(vvvvv.0) & 0b1111) << 3;
-    p1 |= (enc.rex_w() & 0b1) << 7;
-    sink.put1(p1);
-
-    let mut p2 = masking.aaa_bits();
-    p2 |= (!(vvvvv.0 >> 4) & 0b1) << 3;
-    p2 |= context.bits() << 4;
-    p2 |= masking.z_bit() << 7;
-    sink.put1(p2);
-
-    // Opcode.
-    sink.put1(enc.opcode_byte());
-
-    // ModR/M byte.
-    sink.put1(encode_modrm(3, reg.0 & 7, rm.0 & 7))
-}
-
-/// Encode the RXBR' bits of the EVEX P0 byte. For an explanation of these bits, see section 2.6.1
-/// in the Intel Software Development Manual, volume 2A. These bits can be used by different
-/// addressing modes (see section 2.6.2), requiring different `vex*` functions than this one.
-fn evex2(rm: Register, reg: Register) -> u8 {
-    let b = !(rm.0 >> 3) & 1;
-    let x = !(rm.0 >> 4) & 1;
-    let r = !(reg.0 >> 3) & 1;
-    let r_ = !(reg.0 >> 4) & 1;
-    0x00 | r_ | (b << 1) | (x << 2) | (r << 3)
-}
-
-#[derive(Copy, Clone)]
+use core::ops::RangeInclusive;
+
+/// Constructs an EVEX-encoded instruction using a builder pattern. This approach makes it visually
+/// easier to transform something in the manual's syntax, `EVEX.256.66.0F38.W1 1F /r`, to code:
+/// `EvexInstruction::new().length(...).prefix(...).map(...).w(true).opcode(0x1F).reg(...).rm(...)`.
+pub struct EvexInstruction {
+    bits: u32,
+    opcode: u8,
+    reg: Register,
+    rm: Register,
+}
+
+/// Because some of the bit flags in the EVEX prefix are reversed and users of `EvexInstruction` may
+/// choose to skip setting fields, here we set some sane defaults. Note that:
+/// - the first byte is always `0x62` but you will notice it at the end of the default `bits` value
+///   implemented--remember the little-endian order
+/// - some bits are always set to certain values: bits 10-11 to 0, bit 18 to 1
+/// - the other bits set correspond to reversed bits: R, X, B, R' (byte 1), vvvv (byte 2), V' (byte
+///   3).
+///
+/// See the `default_emission` test for what these defaults are equivalent to (e.g. using RAX,
+/// unsetting the W bit, etc.)
+impl Default for EvexInstruction {
+    fn default() -> Self {
+        Self {
+            bits: 0x08_7C_F0_62,
+            opcode: 0,
+            reg: Register::default(),
+            rm: Register::default(),
+        }
+    }
+}
+
+#[allow(non_upper_case_globals)] // This makes it easier to match the bit range names to the manual's names.
+impl EvexInstruction {
+    /// Construct a default EVEX instruction.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the length of the instruction. Note that there are sets of instructions (i.e. rounding,
+    /// memory broadcast) that modify the same underlying bits--at some point (TODO) we can add a
+    /// way to set those context bits and verify that both are not used (e.g. rounding AND length).
+    /// For now, this method is very convenient.
+    #[inline(always)]
+    pub fn length(mut self, length: EvexVectorLength) -> Self {
+        self.write(Self::LL, EvexContext::Other { length }.bits() as u32);
+        self
+    }
+
+    /// Set the legacy prefix byte of the instruction: None | 66 | F2 | F3. EVEX instructions
+    /// pack these into the prefix, not as separate bytes.
+    #[inline(always)]
+    pub fn prefix(mut self, prefix: LegacyPrefixes) -> Self {
+        self.write(Self::pp, prefix.bits() as u32);
+        self
+    }
+
+    /// Set the opcode map byte of the instruction: None | 0F | 0F38 | 0F3A. EVEX instructions pack
+    /// these into the prefix, not as separate bytes.
+    #[inline(always)]
+    pub fn map(mut self, map: OpcodeMap) -> Self {
+        self.write(Self::mm, map.bits() as u32);
+        self
+    }
+
+    /// Set the W bit, typically used to indicate an instruction using 64 bits of an operand (e.g.
+    /// 64 bit lanes). EVEX packs this bit in the EVEX prefix; previous encodings used the REX
+    /// prefix.
+    #[inline(always)]
+    pub fn w(mut self, w: bool) -> Self {
+        self.write(Self::W, w as u32);
+        self
+    }
+
+    /// Set the instruction opcode byte.
+    #[inline(always)]
+    pub fn opcode(mut self, opcode: u8) -> Self {
+        self.opcode = opcode;
+        self
+    }
+
+    /// Set the register to use for the `reg` bits; many instructions use this as the write operand.
+    /// Setting this affects both the ModRM byte (`reg` section) and the EVEX prefix (the extension
+    /// bits for register encodings >= 8).
+    #[inline(always)]
+    pub fn reg(mut self, reg: impl Into<Register>) -> Self {
+        self.reg = reg.into();
+        let r = !(self.reg.0 >> 3) & 1;
+        let r_ = !(self.reg.0 >> 4) & 1;
+        self.write(Self::R, r as u32);
+        self.write(Self::R_, r_ as u32);
+        self
+    }
+
+    /// Set the mask to use. See section 2.6 in the Intel Software Developer's Manual, volume 2A for
+    /// more details.
+    #[allow(dead_code)]
+    #[inline(always)]
+    pub fn mask(mut self, mask: EvexMasking) -> Self {
+        self.write(Self::aaa, mask.aaa_bits() as u32);
+        self.write(Self::z, mask.z_bit() as u32);
+        self
+    }
+
+    /// Set the `vvvvv` register; some instructions allow using this as a second, non-destructive
+    /// source register in 3-operand instructions (e.g. 2 read, 1 write).
+    #[allow(dead_code)]
+    #[inline(always)]
+    pub fn vvvvv(mut self, reg: impl Into<Register>) -> Self {
+        let reg = reg.into();
+        self.write(Self::vvvv, !(reg.0 as u32) & 0b1111);
+        self.write(Self::V_, !(reg.0 as u32 >> 4) & 0b1);
+        self
+    }
+
+    /// Set the register to use for the `rm` bits; many instructions use this as the "read from
+    /// register/memory" operand. Currently this does not support memory addressing (TODO). Setting
+    /// this affects both the ModRM byte (`rm` section) and the EVEX prefix (the extension bits for
+    /// register encodings >= 8).
+    #[inline(always)]
+    pub fn rm(mut self, reg: impl Into<Register>) -> Self {
+        self.rm = reg.into();
+        let b = !(self.rm.0 >> 3) & 1;
+        let x = !(self.rm.0 >> 4) & 1;
+        self.write(Self::X, x as u32);
+        self.write(Self::B, b as u32);
+        self
+    }
+
+    /// Emit the EVEX-encoded instruction to the code sink:
+    /// - first, the 4-byte EVEX prefix;
+    /// - then, the opcode byte;
+    /// - finally, the ModR/M byte.
+    ///
+    /// Eventually this method should support encodings of more than just the reg-reg addressing mode (TODO).
+    pub fn encode<CS: CodeSink + ?Sized>(&self, sink: &mut CS) {
+        sink.put4(self.bits);
+        sink.put1(self.opcode);
+        sink.put1(encode_modrm(3, self.reg.0 & 7, self.rm.0 & 7));
+    }
+
+    // In order to simplify the encoding of the various bit ranges in the prefix, we specify those
+    // ranges according to the table below (extracted from the Intel Software Development Manual,
+    // volume 2A). Remember that, because we pack the 4-byte prefix into a little-endian `u32`, this
+    // chart should be read from right-to-left, top-to-bottom. Note also that we start ranges at bit
+    // 8, leaving bits 0-7 for the mandatory `0x62`.
+    //         ┌───┬───┬───┬───┬───┬───┬───┬───┐
+    // Byte 1: │ R │ X │ B │ R'│ 0 │ 0 │ m │ m │
+    //         ├───┼───┼───┼───┼───┼───┼───┼───┤
+    // Byte 2: │ W │ v │ v │ v │ v │ 1 │ p │ p │
+    //         ├───┼───┼───┼───┼───┼───┼───┼───┤
+    // Byte 3: │ z │ L'│ L │ b │ V'│ a │ a │ a │
+    //         └───┴───┴───┴───┴───┴───┴───┴───┘
+
+    // Byte 1:
+    const mm: RangeInclusive<u8> = 8..=9;
+    const R_: RangeInclusive<u8> = 12..=12;
+    const B: RangeInclusive<u8> = 13..=13;
+    const X: RangeInclusive<u8> = 14..=14;
+    const R: RangeInclusive<u8> = 15..=15;
+
+    // Byte 2:
+    const pp: RangeInclusive<u8> = 16..=17;
+    const vvvv: RangeInclusive<u8> = 19..=22;
+    const W: RangeInclusive<u8> = 23..=23;
+
+    // Byte 3:
+    const aaa: RangeInclusive<u8> = 24..=26;
+    const V_: RangeInclusive<u8> = 27..=27;
+    #[allow(dead_code)] // Will be used once broadcast and rounding controls are exposed.
+    const b: RangeInclusive<u8> = 28..=28;
+    const LL: RangeInclusive<u8> = 29..=30;
+    const z: RangeInclusive<u8> = 31..=31;
+
+    // A convenience method for writing the `value` bits to the given range in `self.bits`.
+    #[inline]
+    fn write(&mut self, range: RangeInclusive<u8>, value: u32) {
+        assert!(ExactSizeIterator::len(&range) > 0);
+        let size = range.end() - range.start() + 1; // Calculate the number of bits in the range.
+        let mask: u32 = (1 << size) - 1; // Generate a bit mask.
+        debug_assert!(
+            value <= mask,
+            "The written value should have fewer than {} bits.",
+            size
+        );
+        let mask_complement = !(mask << *range.start()); // Create the bitwise complement for the clear mask.
+        self.bits &= mask_complement; // Clear the bits in `range`.
+        let value = (value & mask) << *range.start(); // Place the value in the correct location.
+        self.bits |= value; // Modify the bits in `range`.
+    }
+}
+
+#[derive(Copy, Clone, Default)]
 pub struct Register(u8);
 impl From<u8> for Register {
     fn from(reg: u8) -> Self {
@@ -90,14 +240,15 @@ pub enum EvexContext {
     },
 }
 
-impl EvexContext {
-    /// Construct an EVEX context for 128-bit SIMD instructions.
-    pub fn v128() -> Self {
+impl Default for EvexContext {
+    fn default() -> Self {
         Self::Other {
-            length: EvexVectorLength::V128,
+            length: EvexVectorLength::default(),
         }
     }
+}
 
+impl EvexContext {
     /// Encode the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte) for merging with the P2 byte.
     fn bits(&self) -> u8 {
         match self {
@@ -129,6 +280,12 @@ impl EvexVectorLength {
     }
 }
 
+impl Default for EvexVectorLength {
+    fn default() -> Self {
+        Self::V128
+    }
+}
+
 /// The EVEX format allows defining rounding control in the `L'` and `L` bits; see `EvexContext`.
 #[allow(dead_code)] // Rounding controls are not yet used.
 pub enum EvexRoundingControl {
@@ -185,3 +342,55 @@ impl EvexMasking {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::isa::x64::inst::regs;
+    use std::vec::Vec;
+
+    // As a sanity test, we verify that the output of `xed-asmparse-main 'vpabsq xmm0{k0},
+    // xmm1'` matches this EVEX encoding machinery.
+    #[test]
+    fn vpabsq() {
+        let dst = regs::xmm0();
+        let src = regs::xmm1();
+        let mut sink0 = Vec::new();
+
+        EvexInstruction::new()
+            .prefix(LegacyPrefixes::_66)
+            .map(OpcodeMap::_0F38)
+            .w(true)
+            .opcode(0x1F)
+            .reg(dst.get_hw_encoding())
+            .rm(src.get_hw_encoding())
+            .length(EvexVectorLength::V128)
+            .encode(&mut sink0);
+
+        assert_eq!(sink0, vec![0x62, 0xf2, 0xfd, 0x08, 0x1f, 0xc1]);
+    }
+
+    /// Verify that the defaults are equivalent to an instruction with a `0x00` opcode using the
+    /// "0" register (i.e. `rax`), with sane defaults for the various configurable parameters. This
+    /// test is more interesting than it may appear because some of the parameters have flipped-bit
+    /// representations (e.g. `vvvvv`) so emitting 0s as a default will not work.
+    #[test]
+    fn default_emission() {
+        let mut sink0 = Vec::new();
+        EvexInstruction::new().encode(&mut sink0);
+
+        let mut sink1 = Vec::new();
+        EvexInstruction::new()
+            .length(EvexVectorLength::V128)
+            .prefix(LegacyPrefixes::None)
+            .map(OpcodeMap::None)
+            .w(false)
+            .opcode(0x00)
+            .reg(regs::rax().get_hw_encoding())
+            .rm(regs::rax().get_hw_encoding())
+            .mask(EvexMasking::None)
+            .encode(&mut sink1);
+
+        assert_eq!(sink0, sink1);
+    }
+}
diff --git a/cranelift/codegen/src/isa/x64/inst/encoding/mod.rs b/cranelift/codegen/src/isa/x64/inst/encoding/mod.rs
index f5377a5ea9cb..4d60f255f0df 100644
--- a/cranelift/codegen/src/isa/x64/inst/encoding/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/encoding/mod.rs
@@ -1,4 +1,5 @@
 use crate::{isa::x64, machinst::MachBuffer};
+use std::vec::Vec;
 
 pub mod evex;
 pub mod rex;
@@ -35,3 +36,22 @@ impl CodeSink for MachBuffer<x64::inst::Inst> {
         self.put8(value)
     }
 }
+
+/// Provide a convenient implementation for testing.
+impl CodeSink for Vec<u8> {
+    fn put1(&mut self, v: u8) {
+        self.extend_from_slice(&[v])
+    }
+
+    fn put2(&mut self, v: u16) {
+        self.extend_from_slice(&v.to_le_bytes())
+    }
+
+    fn put4(&mut self, v: u32) {
+        self.extend_from_slice(&v.to_le_bytes())
+    }
+
+    fn put8(&mut self, v: u64) {
+        self.extend_from_slice(&v.to_le_bytes())
+    }
+}
diff --git a/cranelift/codegen/src/isa/x64/inst/encoding/rex.rs b/cranelift/codegen/src/isa/x64/inst/encoding/rex.rs
index 923f2e361562..648941790e50 100644
--- a/cranelift/codegen/src/isa/x64/inst/encoding/rex.rs
+++ b/cranelift/codegen/src/isa/x64/inst/encoding/rex.rs
@@ -153,9 +153,37 @@ impl From<(OperandSize, Reg)> for RexFlags {
     }
 }
 
+/// Allows using the same opcode byte in different "opcode maps" to allow for more instruction
+/// encodings. See appendix A in the Intel Software Developer's Manual, volume 2A, for more details.
+pub enum OpcodeMap {
+    None,
+    _0F,
+    _0F38,
+    _0F3A,
+}
+
+impl OpcodeMap {
+    /// Normally the opcode map is specified as bytes in the instruction, but some x64 encoding
+    /// formats pack this information as bits in a prefix (e.g. EVEX).
+    pub(crate) fn bits(&self) -> u8 {
+        match self {
+            OpcodeMap::None => 0b00,
+            OpcodeMap::_0F => 0b01,
+            OpcodeMap::_0F38 => 0b10,
+            OpcodeMap::_0F3A => 0b11,
+        }
+    }
+}
+
+impl Default for OpcodeMap {
+    fn default() -> Self {
+        Self::None
+    }
+}
+
 /// We may need to include one or more legacy prefix bytes before the REX prefix.  This enum
 /// covers only the small set of possibilities that we actually need.
-pub(crate) enum LegacyPrefixes {
+pub enum LegacyPrefixes {
     /// No prefix bytes.
     None,
     /// Operand Size Override -- here, denoting "16-bit operation".
@@ -173,26 +201,47 @@ pub(crate) enum LegacyPrefixes {
 }
 
 impl LegacyPrefixes {
+    /// Emit the legacy prefix as bytes (e.g. in REX instructions).
     #[inline(always)]
     pub(crate) fn emit(&self, sink: &mut MachBuffer<Inst>) {
         match self {
-            LegacyPrefixes::_66 => sink.put1(0x66),
-            LegacyPrefixes::_F0 => sink.put1(0xF0),
-            LegacyPrefixes::_66F0 => {
+            Self::_66 => sink.put1(0x66),
+            Self::_F0 => sink.put1(0xF0),
+            Self::_66F0 => {
                 // I don't think the order matters, but in any case, this is the same order that
                 // the GNU assembler uses.
                 sink.put1(0x66);
                 sink.put1(0xF0);
             }
-            LegacyPrefixes::_F2 => sink.put1(0xF2),
-            LegacyPrefixes::_F3 => sink.put1(0xF3),
-            LegacyPrefixes::_66F3 => {
+            Self::_F2 => sink.put1(0xF2),
+            Self::_F3 => sink.put1(0xF3),
+            Self::_66F3 => {
                 sink.put1(0x66);
                 sink.put1(0xF3);
             }
-            LegacyPrefixes::None => (),
+            Self::None => (),
         }
     }
+
+    /// Emit the legacy prefix as bits (e.g. for EVEX instructions).
+    #[inline(always)]
+    pub(crate) fn bits(&self) -> u8 {
+        match self {
+            Self::None => 0b00,
+            Self::_66 => 0b01,
+            Self::_F3 => 0b10,
+            Self::_F2 => 0b11,
+            _ => panic!(
+                "VEX and EVEX bits can only be extracted from single prefixes: None, 66, F3, F2"
+            ),
+        }
+    }
+}
+
+impl Default for LegacyPrefixes {
+    fn default() -> Self {
+        Self::None
+    }
 }
 
 /// This is the core 'emit' function for instructions that reference memory.