From 0acc1451eaecfcc73021fc619ea88b0e866bccb2 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Thu, 15 Apr 2021 11:53:58 -0700 Subject: [PATCH] x64: lower iabs.i64x2 using a single AVX512 instruction when possible (#2819) * x64: add EVEX encoding mechanism Also, includes an empty stub module for the VEX encoding. * x64: lower abs.i64x2 to VPABSQ when available * x64: refactor EVEX encodings to use `EvexInstruction` This change replaces the `encode_evex` function with a builder-style struct, `EvexInstruction`. This approach clarifies the code, adds documentation, and results in slight speedups when benchmarked. * x64: rename encoding CodeSink to ByteSink --- cranelift/codegen/src/isa/x64/inst/args.rs | 4 - cranelift/codegen/src/isa/x64/inst/emit.rs | 22 +- .../codegen/src/isa/x64/inst/emit_tests.rs | 7 + .../codegen/src/isa/x64/inst/encoding/evex.rs | 396 ++++++++++++++++++ .../codegen/src/isa/x64/inst/encoding/mod.rs | 56 +++ .../codegen/src/isa/x64/inst/encoding/rex.rs | 65 ++- .../codegen/src/isa/x64/inst/encoding/vex.rs | 2 + cranelift/codegen/src/isa/x64/inst/mod.rs | 30 +- cranelift/codegen/src/isa/x64/lower.rs | 40 +- 9 files changed, 590 insertions(+), 32 deletions(-) create mode 100644 cranelift/codegen/src/isa/x64/inst/encoding/evex.rs create mode 100644 cranelift/codegen/src/isa/x64/inst/encoding/vex.rs diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 2eebf4140ef5..b54f1b6126fe 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -460,9 +460,7 @@ pub(crate) enum InstructionSet { BMI1, #[allow(dead_code)] // never constructed (yet). BMI2, - #[allow(dead_code)] AVX512F, - #[allow(dead_code)] AVX512VL, } @@ -995,13 +993,11 @@ impl fmt::Display for SseOpcode { #[derive(Clone)] pub enum Avx512Opcode { - #[allow(dead_code)] Vpabsq, } impl Avx512Opcode { /// Which `InstructionSet`s support the opcode? 
- #[allow(dead_code)] pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> { match self { Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL], diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 3bd6a58e7c29..a35bbe0a9994 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -6,9 +6,11 @@ use crate::isa::x64::inst::args::*; use crate::isa::x64::inst::*; use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel}; use core::convert::TryInto; +use encoding::evex::{EvexInstruction, EvexVectorLength}; use encoding::rex::{ emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc, - low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, RexFlags, + low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, OpcodeMap, + RexFlags, }; use log::debug; use regalloc::{Reg, Writable}; @@ -1404,6 +1406,24 @@ pub(crate) fn emit( }; } + Inst::XmmUnaryRmREvex { op, src, dst } => { + let opcode = match op { + Avx512Opcode::Vpabsq => 0x1f, + }; + match src { + RegMem::Reg { reg: src } => EvexInstruction::new() + .length(EvexVectorLength::V128) + .prefix(LegacyPrefixes::_66) + .map(OpcodeMap::_0F38) + .w(true) + .opcode(opcode) + .reg(dst.to_reg().get_hw_encoding()) + .rm(src.get_hw_encoding()) + .encode(sink), + _ => todo!(), + }; + } + Inst::XmmRmR { op, src: src_e, diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index f730d25e935d..f03762b97bab 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -3865,6 +3865,12 @@ fn test_x64_emit() { "cvtdq2pd %xmm2, %xmm8", )); + insns.push(( + Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, RegMem::reg(xmm2), w_xmm8), + "6272FD081FC2", + "vpabsq %xmm2, 
%xmm8", + )); + // Xmm to int conversions, and conversely. insns.push(( @@ -4276,6 +4282,7 @@ fn test_x64_emit() { let mut isa_flag_builder = x64::settings::builder(); isa_flag_builder.enable("has_ssse3").unwrap(); isa_flag_builder.enable("has_sse41").unwrap(); + isa_flag_builder.enable("has_avx512f").unwrap(); let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder); let rru = regs::create_reg_universe_systemv(&flags); diff --git a/cranelift/codegen/src/isa/x64/inst/encoding/evex.rs b/cranelift/codegen/src/isa/x64/inst/encoding/evex.rs new file mode 100644 index 000000000000..9029b9f43e08 --- /dev/null +++ b/cranelift/codegen/src/isa/x64/inst/encoding/evex.rs @@ -0,0 +1,396 @@ +//! Encodes EVEX instructions. These instructions are those added by the AVX-512 extensions. The +//! EVEX encoding requires a 4-byte prefix: +//! +//! Byte 0: 0x62 +//! ┌───┬───┬───┬───┬───┬───┬───┬───┐ +//! Byte 1: │ R │ X │ B │ R'│ 0 │ 0 │ m │ m │ +//! ├───┼───┼───┼───┼───┼───┼───┼───┤ +//! Byte 2: │ W │ v │ v │ v │ v │ 1 │ p │ p │ +//! ├───┼───┼───┼───┼───┼───┼───┼───┤ +//! Byte 3: │ z │ L'│ L │ b │ V'│ a │ a │ a │ +//! └───┴───┴───┴───┴───┴───┴───┴───┘ +//! +//! The prefix is then followeded by the opcode byte, the ModR/M byte, and other optional suffixes +//! (e.g. SIB byte, displacements, immediates) based on the instruction (see section 2.6, Intel +//! Software Development Manual, volume 2A). +use super::rex::{encode_modrm, LegacyPrefixes, OpcodeMap}; +use super::ByteSink; +use core::ops::RangeInclusive; + +/// Constructs an EVEX-encoded instruction using a builder pattern. This approach makes it visually +/// easier to transform something the manual's syntax, `EVEX.256.66.0F38.W1 1F /r` to code: +/// `EvexInstruction::new().length(...).prefix(...).map(...).w(true).opcode(0x1F).reg(...).rm(...)`. 
+pub struct EvexInstruction { + bits: u32, + opcode: u8, + reg: Register, + rm: Register, +} + +/// Because some of the bit flags in the EVEX prefix are reversed and users of `EvexInstruction` may +/// choose to skip setting fields, here we set some sane defaults. Note that: +/// - the first byte is always `0x62` but you will notice it at the end of the default `bits` value +/// implemented--remember the little-endian order +/// - some bits are always set to certain values: bits 10-11 to 0, bit 18 to 1 +/// - the other bits set correspond to reversed bits: R, X, B, R' (byte 1), vvvv (byte 2), V' (byte +/// 3). +/// +/// See the `default_emission` test for what these defaults are equivalent to (e.g. using RAX, +/// unsetting the W bit, etc.) +impl Default for EvexInstruction { + fn default() -> Self { + Self { + bits: 0x08_7C_F0_62, + opcode: 0, + reg: Register::default(), + rm: Register::default(), + } + } +} + +#[allow(non_upper_case_globals)] // This makes it easier to match the bit range names to the manual's names. +impl EvexInstruction { + /// Construct a default EVEX instruction. + pub fn new() -> Self { + Self::default() + } + + /// Set the length of the instruction. Note that there are sets of instructions (i.e. rounding, + /// memory broadcast) that modify the same underlying bits--at some point (TODO) we can add a + /// way to set those context bits and verify that both are not used (e.g. rounding AND length). + /// For now, this method is very convenient. + #[inline(always)] + pub fn length(mut self, length: EvexVectorLength) -> Self { + self.write(Self::LL, EvexContext::Other { length }.bits() as u32); + self + } + + /// Set the legacy prefix byte of the instruction: None | 66 | F2 | F3. EVEX instructions + /// pack these into the prefix, not as separate bytes. 
+ #[inline(always)] + pub fn prefix(mut self, prefix: LegacyPrefixes) -> Self { + self.write(Self::pp, prefix.bits() as u32); + self + } + + /// Set the opcode map byte of the instruction: None | 0F | 0F38 | 0F3A. EVEX instructions pack + /// these into the prefix, not as separate bytes. + #[inline(always)] + pub fn map(mut self, map: OpcodeMap) -> Self { + self.write(Self::mm, map.bits() as u32); + self + } + + /// Set the W bit, typically used to indicate an instruction using 64 bits of an operand (e.g. + /// 64 bit lanes). EVEX packs this bit in the EVEX prefix; previous encodings used the REX + /// prefix. + #[inline(always)] + pub fn w(mut self, w: bool) -> Self { + self.write(Self::W, w as u32); + self + } + + /// Set the instruction opcode byte. + #[inline(always)] + pub fn opcode(mut self, opcode: u8) -> Self { + self.opcode = opcode; + self + } + + /// Set the register to use for the `reg` bits; many instructions use this as the write operand. + /// Setting this affects both the ModRM byte (`reg` section) and the EVEX prefix (the extension + /// bits for register encodings > 8). + #[inline(always)] + pub fn reg(mut self, reg: impl Into) -> Self { + self.reg = reg.into(); + let r = !(self.reg.0 >> 3) & 1; + let r_ = !(self.reg.0 >> 4) & 1; + self.write(Self::R, r as u32); + self.write(Self::R_, r_ as u32); + self + } + + /// Set the mask to use. See section 2.6 in the Intel Software Developer's Manual, volume 2A for + /// more details. + #[allow(dead_code)] + #[inline(always)] + pub fn mask(mut self, mask: EvexMasking) -> Self { + self.write(Self::aaa, mask.aaa_bits() as u32); + self.write(Self::z, mask.z_bit() as u32); + self + } + + /// Set the `vvvvv` register; some instructions allow using this as a second, non-destructive + /// source register in 3-operand instructions (e.g. 2 read, 1 write). 
+ #[allow(dead_code)] + #[inline(always)] + pub fn vvvvv(mut self, reg: impl Into) -> Self { + let reg = reg.into(); + self.write(Self::vvvv, !(reg.0 as u32) & 0b1111); + self.write(Self::V_, !(reg.0 as u32 >> 4) & 0b1); + self + } + + /// Set the register to use for the `rm` bits; many instructions use this as the "read from + /// register/memory" operand. Currently this does not support memory addressing (TODO). Setting + /// this affects both the ModRM byte (`rm` section) and the EVEX prefix (the extension bits for + /// register encodings > 8). + #[inline(always)] + pub fn rm(mut self, reg: impl Into) -> Self { + self.rm = reg.into(); + let b = !(self.rm.0 >> 3) & 1; + let x = !(self.rm.0 >> 4) & 1; + self.write(Self::X, x as u32); + self.write(Self::B, b as u32); + self + } + + /// Emit the EVEX-encoded instruction to the code sink: + /// - first, the 4-byte EVEX prefix; + /// - then, the opcode byte; + /// - finally, the ModR/M byte. + /// + /// Eventually this method should support encodings of more than just the reg-reg addressing mode (TODO). + pub fn encode(&self, sink: &mut CS) { + sink.put4(self.bits); + sink.put1(self.opcode); + sink.put1(encode_modrm(3, self.reg.0 & 7, self.rm.0 & 7)); + } + + // In order to simplify the encoding of the various bit ranges in the prefix, we specify those + // ranges according to the table below (extracted from the Intel Software Development Manual, + // volume 2A). Remember that, because we pack the 4-byte prefix into a little-endian `u32`, this + // chart should be read from right-to-left, top-to-bottom. Note also that we start ranges at bit + // 8, leaving bits 0-7 for the mandatory `0x62`. 
+ // ┌───┬───┬───┬───┬───┬───┬───┬───┐ + // Byte 1: │ R │ X │ B │ R'│ 0 │ 0 │ m │ m │ + // ├───┼───┼───┼───┼───┼───┼───┼───┤ + // Byte 2: │ W │ v │ v │ v │ v │ 1 │ p │ p │ + // ├───┼───┼───┼───┼───┼───┼───┼───┤ + // Byte 3: │ z │ L'│ L │ b │ V'│ a │ a │ a │ + // └───┴───┴───┴───┴───┴───┴───┴───┘ + + // Byte 1: + const mm: RangeInclusive = 8..=9; + const R_: RangeInclusive = 12..=12; + const B: RangeInclusive = 13..=13; + const X: RangeInclusive = 14..=14; + const R: RangeInclusive = 15..=15; + + // Byte 2: + const pp: RangeInclusive = 16..=17; + const vvvv: RangeInclusive = 19..=22; + const W: RangeInclusive = 23..=23; + + // Byte 3: + const aaa: RangeInclusive = 24..=26; + const V_: RangeInclusive = 27..=27; + #[allow(dead_code)] // Will be used once broadcast and rounding controls are exposed. + const b: RangeInclusive = 28..=28; + const LL: RangeInclusive = 29..=30; + const z: RangeInclusive = 31..=31; + + // A convenience method for writing the `value` bits to the given range in `self.bits`. + #[inline] + fn write(&mut self, range: RangeInclusive, value: u32) { + assert!(ExactSizeIterator::len(&range) > 0); + let size = range.end() - range.start() + 1; // Calculate the number of bits in the range. + let mask: u32 = (1 << size) - 1; // Generate a bit mask. + debug_assert!( + value <= mask, + "The written value should have fewer than {} bits.", + size + ); + let mask_complement = !(mask << *range.start()); // Create the bitwise complement for the clear mask. + self.bits &= mask_complement; // Clear the bits in `range`; otherwise the OR below may allow previously-set bits to slip through. + let value = value << *range.start(); // Place the value in the correct location (assumes `value <= mask`). + self.bits |= value; // Modify the bits in `range`. 
+ } +} + +#[derive(Copy, Clone, Default)] +pub struct Register(u8); +impl From for Register { + fn from(reg: u8) -> Self { + debug_assert!(reg < 16); + Self(reg) + } +} + +/// Defines the EVEX context for the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte). Table 2-36 in +/// section 2.6.10 (Intel Software Development Manual, volume 2A) describes how these bits can be +/// used together for certain classes of instructions; i.e., special care should be taken to ensure +/// that instructions use an applicable correct `EvexContext`. Table 2-39 contains cases where +/// opcodes can result in an #UD. +#[allow(dead_code)] // Rounding and broadcast modes are not yet used. +pub enum EvexContext { + RoundingRegToRegFP { + rc: EvexRoundingControl, + }, + NoRoundingFP { + sae: bool, + length: EvexVectorLength, + }, + MemoryOp { + broadcast: bool, + length: EvexVectorLength, + }, + Other { + length: EvexVectorLength, + }, +} + +impl Default for EvexContext { + fn default() -> Self { + Self::Other { + length: EvexVectorLength::default(), + } + } +} + +impl EvexContext { + /// Encode the `L'`, `L`, and `b` bits (bits 6:4 of EVEX P2 byte) for merging with the P2 byte. + fn bits(&self) -> u8 { + match self { + Self::RoundingRegToRegFP { rc } => 0b001 | rc.bits() << 1, + Self::NoRoundingFP { sae, length } => (*sae as u8) | length.bits() << 1, + Self::MemoryOp { broadcast, length } => (*broadcast as u8) | length.bits() << 1, + Self::Other { length } => length.bits() << 1, + } + } +} + +/// The EVEX format allows choosing a vector length in the `L'` and `L` bits; see `EvexContext`. +#[allow(dead_code)] // Wider-length vectors are not yet used. +pub enum EvexVectorLength { + V128, + V256, + V512, +} + +impl EvexVectorLength { + /// Encode the `L'` and `L` bits for merging with the P2 byte. + fn bits(&self) -> u8 { + match self { + Self::V128 => 0b00, + Self::V256 => 0b01, + Self::V512 => 0b10, + // 0b11 is reserved (#UD). 
+ } + } +} + +impl Default for EvexVectorLength { + fn default() -> Self { + Self::V128 + } +} + +/// The EVEX format allows defining rounding control in the `L'` and `L` bits; see `EvexContext`. +#[allow(dead_code)] // Rounding controls are not yet used. +pub enum EvexRoundingControl { + RNE, + RD, + RU, + RZ, +} + +impl EvexRoundingControl { + /// Encode the `L'` and `L` bits for merging with the P2 byte. + fn bits(&self) -> u8 { + match self { + Self::RNE => 0b00, + Self::RD => 0b01, + Self::RU => 0b10, + Self::RZ => 0b11, + } + } +} + +/// Defines the EVEX masking behavior; masking support is described in section 2.6.4 of the Intel +/// Software Development Manual, volume 2A. +#[allow(dead_code)] // Masking is not yet used. +pub enum EvexMasking { + None, + Merging { k: u8 }, + Zeroing { k: u8 }, +} + +impl Default for EvexMasking { + fn default() -> Self { + EvexMasking::None + } +} + +impl EvexMasking { + /// Encode the `z` bit for merging with the P2 byte. + fn z_bit(&self) -> u8 { + match self { + Self::None | Self::Merging { .. } => 0, + Self::Zeroing { .. } => 1, + } + } + + /// Encode the `aaa` bits for merging with the P2 byte. + fn aaa_bits(&self) -> u8 { + match self { + Self::None => 0b000, + Self::Merging { k } | Self::Zeroing { k } => { + debug_assert!(*k <= 7); + *k + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::isa::x64::inst::regs; + use std::vec::Vec; + + // As a sanity test, we verify that the output of `xed-asmparse-main 'vpabsq xmm0{k0}, + // xmm1'` matches this EVEX encoding machinery. 
+ #[test] + fn vpabsq() { + let dst = regs::xmm0(); + let src = regs::xmm1(); + let mut sink0 = Vec::new(); + + EvexInstruction::new() + .prefix(LegacyPrefixes::_66) + .map(OpcodeMap::_0F38) + .w(true) + .opcode(0x1F) + .reg(dst.get_hw_encoding()) + .rm(src.get_hw_encoding()) + .length(EvexVectorLength::V128) + .encode(&mut sink0); + + assert_eq!(sink0, vec![0x62, 0xf2, 0xfd, 0x08, 0x1f, 0xc1]); + } + + /// Verify that the defaults are equivalent to an instruction with a `0x00` opcode using the + /// "0" register (i.e. `rax`), with sane defaults for the various configurable parameters. This + /// test is more interesting than it may appear because some of the parameters have flipped-bit + /// representations (e.g. `vvvvv`) so emitting 0s as a default will not work. + #[test] + fn default_emission() { + let mut sink0 = Vec::new(); + EvexInstruction::new().encode(&mut sink0); + + let mut sink1 = Vec::new(); + EvexInstruction::new() + .length(EvexVectorLength::V128) + .prefix(LegacyPrefixes::None) + .map(OpcodeMap::None) + .w(false) + .opcode(0x00) + .reg(regs::rax().get_hw_encoding()) + .rm(regs::rax().get_hw_encoding()) + .mask(EvexMasking::None) + .encode(&mut sink1); + + assert_eq!(sink0, sink1); + } +} diff --git a/cranelift/codegen/src/isa/x64/inst/encoding/mod.rs b/cranelift/codegen/src/isa/x64/inst/encoding/mod.rs index 7fd3aeeae7d9..a269e586094f 100644 --- a/cranelift/codegen/src/isa/x64/inst/encoding/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/encoding/mod.rs @@ -1 +1,57 @@ +use crate::{isa::x64, machinst::MachBuffer}; +use std::vec::Vec; + +pub mod evex; pub mod rex; +pub mod vex; + +pub trait ByteSink { + /// Add 1 byte to the code section. + fn put1(&mut self, _: u8); + + /// Add 2 bytes to the code section. + fn put2(&mut self, _: u16); + + /// Add 4 bytes to the code section. + fn put4(&mut self, _: u32); + + /// Add 8 bytes to the code section. 
+ fn put8(&mut self, _: u64); +} + +impl ByteSink for MachBuffer { + fn put1(&mut self, value: u8) { + self.put1(value) + } + + fn put2(&mut self, value: u16) { + self.put2(value) + } + + fn put4(&mut self, value: u32) { + self.put4(value) + } + + fn put8(&mut self, value: u64) { + self.put8(value) + } +} + +/// Provide a convenient implementation for testing. +impl ByteSink for Vec { + fn put1(&mut self, v: u8) { + self.extend_from_slice(&[v]) + } + + fn put2(&mut self, v: u16) { + self.extend_from_slice(&v.to_le_bytes()) + } + + fn put4(&mut self, v: u32) { + self.extend_from_slice(&v.to_le_bytes()) + } + + fn put8(&mut self, v: u64) { + self.extend_from_slice(&v.to_le_bytes()) + } +} diff --git a/cranelift/codegen/src/isa/x64/inst/encoding/rex.rs b/cranelift/codegen/src/isa/x64/inst/encoding/rex.rs index 923f2e361562..648941790e50 100644 --- a/cranelift/codegen/src/isa/x64/inst/encoding/rex.rs +++ b/cranelift/codegen/src/isa/x64/inst/encoding/rex.rs @@ -153,9 +153,37 @@ impl From<(OperandSize, Reg)> for RexFlags { } } +/// Allows using the same opcode byte in different "opcode maps" to allow for more instruction +/// encodings. See appendix A in the Intel Software Developer's Manual, volume 2A, for more details. +pub enum OpcodeMap { + None, + _0F, + _0F38, + _0F3A, +} + +impl OpcodeMap { + /// Normally the opcode map is specified as bytes in the instruction, but some x64 encoding + /// formats pack this information as bits in a prefix (e.g. EVEX). + pub(crate) fn bits(&self) -> u8 { + match self { + OpcodeMap::None => 0b00, + OpcodeMap::_0F => 0b01, + OpcodeMap::_0F38 => 0b10, + OpcodeMap::_0F3A => 0b11, + } + } +} + +impl Default for OpcodeMap { + fn default() -> Self { + Self::None + } +} + /// We may need to include one or more legacy prefix bytes before the REX prefix. This enum /// covers only the small set of possibilities that we actually need. -pub(crate) enum LegacyPrefixes { +pub enum LegacyPrefixes { /// No prefix bytes. 
None, /// Operand Size Override -- here, denoting "16-bit operation". @@ -173,26 +201,47 @@ pub(crate) enum LegacyPrefixes { } impl LegacyPrefixes { + /// Emit the legacy prefix as bytes (e.g. in REX instructions). #[inline(always)] pub(crate) fn emit(&self, sink: &mut MachBuffer) { match self { - LegacyPrefixes::_66 => sink.put1(0x66), - LegacyPrefixes::_F0 => sink.put1(0xF0), - LegacyPrefixes::_66F0 => { + Self::_66 => sink.put1(0x66), + Self::_F0 => sink.put1(0xF0), + Self::_66F0 => { // I don't think the order matters, but in any case, this is the same order that // the GNU assembler uses. sink.put1(0x66); sink.put1(0xF0); } - LegacyPrefixes::_F2 => sink.put1(0xF2), - LegacyPrefixes::_F3 => sink.put1(0xF3), - LegacyPrefixes::_66F3 => { + Self::_F2 => sink.put1(0xF2), + Self::_F3 => sink.put1(0xF3), + Self::_66F3 => { sink.put1(0x66); sink.put1(0xF3); } - LegacyPrefixes::None => (), + Self::None => (), } } + + /// Emit the legacy prefix as bits (e.g. for EVEX instructions). + #[inline(always)] + pub(crate) fn bits(&self) -> u8 { + match self { + Self::None => 0b00, + Self::_66 => 0b01, + Self::_F3 => 0b10, + Self::_F2 => 0b11, + _ => panic!( + "VEX and EVEX bits can only be extracted from single prefixes: None, 66, F3, F2" + ), + } + } +} + +impl Default for LegacyPrefixes { + fn default() -> Self { + Self::None + } } /// This is the core 'emit' function for instructions that reference memory. diff --git a/cranelift/codegen/src/isa/x64/inst/encoding/vex.rs b/cranelift/codegen/src/isa/x64/inst/encoding/vex.rs new file mode 100644 index 000000000000..f2f3feebbae6 --- /dev/null +++ b/cranelift/codegen/src/isa/x64/inst/encoding/vex.rs @@ -0,0 +1,2 @@ +//! Encodes VEX instructions. These instructions are those added by the Advanced Vector Extensions +//! (AVX). 
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 0e8b8d9f1739..cfb0351bf38c 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -225,6 +225,12 @@ pub enum Inst { dst: Writable, }, + XmmUnaryRmREvex { + op: Avx512Opcode, + src: RegMem, + dst: Writable, + }, + /// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq XmmMovRM { op: SseOpcode, @@ -571,6 +577,8 @@ impl Inst { | Inst::XmmRmRImm { op, .. } | Inst::XmmToGpr { op, .. } | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()], + + Inst::XmmUnaryRmREvex { op, .. } => op.available_from(), } } } @@ -705,6 +713,12 @@ impl Inst { Inst::XmmUnaryRmR { op, src, dst } } + pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable) -> Inst { + src.assert_regclass_is(RegClass::V128); + debug_assert!(dst.to_reg().get_class() == RegClass::V128); + Inst::XmmUnaryRmREvex { op, src, dst } + } + pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable) -> Self { src.assert_regclass_is(RegClass::V128); debug_assert!(dst.to_reg().get_class() == RegClass::V128); @@ -1391,6 +1405,13 @@ impl PrettyPrint for Inst { show_ireg_sized(dst.to_reg(), mb_rru, 8), ), + Inst::XmmUnaryRmREvex { op, src, dst, .. } => format!( + "{} {}, {}", + ljustify(op.to_string()), + src.show_rru_sized(mb_rru, 8), + show_ireg_sized(dst.to_reg(), mb_rru, 8), + ), + Inst::XmmMovRM { op, src, dst, .. } => format!( "{} {}, {}", ljustify(op.to_string()), @@ -1863,7 +1884,9 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(Writable::from_reg(regs::rdx())); } }, - Inst::UnaryRmR { src, dst, .. } | Inst::XmmUnaryRmR { src, dst, .. } => { + Inst::UnaryRmR { src, dst, .. } + | Inst::XmmUnaryRmR { src, dst, .. } + | Inst::XmmUnaryRmREvex { src, dst, .. 
} => { src.get_regs_as_uses(collector); collector.add_def(*dst); } @@ -2210,6 +2233,11 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { ref mut dst, .. } + | Inst::XmmUnaryRmREvex { + ref mut src, + ref mut dst, + .. + } | Inst::UnaryRmR { ref mut src, ref mut dst, diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 26c0b89740ce..6f675b923246 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -1855,25 +1855,29 @@ fn lower_insn_to_regs>( let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); let ty = ty.unwrap(); if ty == types::I64X2 { - // This lowering could be a single instruction with AVX512F/VL's VPABSQ instruction. - // Instead, we use a separate register, `tmp`, to contain the results of `0 - src` - // and then blend in those results with `BLENDVPD` if the MSB of `tmp` was set to 1 - // (i.e. if `tmp` was negative or, conversely, if `src` was originally positive). + if isa_flags.use_avx512f_simd() || isa_flags.use_avx512vl_simd() { + ctx.emit(Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, src, dst)); + } else { + // If `VPABSQ` from AVX512 is unavailable, we use a separate register, `tmp`, to + // contain the results of `0 - src` and then blend in those results with + // `BLENDVPD` if the MSB of `tmp` was set to 1 (i.e. if `tmp` was negative or, + // conversely, if `src` was originally positive). - // Emit all 0s into the `tmp` register. - let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); - // Subtract the lanes from 0 and set up `dst`. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp)); - ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty)); - // Choose the subtracted lanes when `tmp` has an MSB of 1. BLENDVPD's semantics - // require the "choice" mask to be in XMM0. 
- ctx.emit(Inst::gen_move( - Writable::from_reg(regs::xmm0()), - tmp.to_reg(), - ty, - )); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst)); + // Emit all 0s into the `tmp` register. + let tmp = ctx.alloc_tmp(ty).only_reg().unwrap(); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + // Subtract the lanes from 0 and set up `dst`. + ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp)); + ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty)); + // Choose the subtracted lanes when `tmp` has an MSB of 1. BLENDVPD's semantics + // require the "choice" mask to be in XMM0. + ctx.emit(Inst::gen_move( + Writable::from_reg(regs::xmm0()), + tmp.to_reg(), + ty, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst)); + } } else if ty.is_vector() { let opcode = match ty { types::I8X16 => SseOpcode::Pabsb,