diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 2eebf4140ef5..b54f1b6126fe 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -460,9 +460,7 @@ pub(crate) enum InstructionSet {
     BMI1,
     #[allow(dead_code)] // never constructed (yet).
     BMI2,
-    #[allow(dead_code)]
     AVX512F,
-    #[allow(dead_code)]
     AVX512VL,
 }
 
@@ -995,13 +993,11 @@ impl fmt::Display for SseOpcode {
 
 #[derive(Clone)]
 pub enum Avx512Opcode {
-    #[allow(dead_code)]
     Vpabsq,
 }
 
 impl Avx512Opcode {
     /// Which `InstructionSet`s support the opcode?
-    #[allow(dead_code)]
     pub(crate) fn available_from(&self) -> SmallVec<[InstructionSet; 2]> {
         match self {
             Avx512Opcode::Vpabsq => smallvec![InstructionSet::AVX512F, InstructionSet::AVX512VL],
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index f16f7d66c1eb..fef47ebb8644 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -6,6 +6,8 @@ use crate::isa::x64::inst::args::*;
 use crate::isa::x64::inst::*;
 use crate::machinst::{inst_common, MachBuffer, MachInstEmit, MachLabel};
 use core::convert::TryInto;
+use cranelift_codegen_shared::isa::x86::EncodingBits;
+use encoding::evex::{encode_evex, EvexContext, EvexMasking};
 use encoding::rex::{
     emit_simm, emit_std_enc_enc, emit_std_enc_mem, emit_std_reg_mem, emit_std_reg_reg, int_reg_enc,
     low8_will_sign_extend_to_32, low8_will_sign_extend_to_64, reg_enc, LegacyPrefixes, RexFlags,
@@ -1409,6 +1411,24 @@ pub(crate) fn emit(
             };
         }
 
+        Inst::XmmUnaryRmREvex { op, src, dst } => {
+            let bits = match op {
+                &Avx512Opcode::Vpabsq => EncodingBits::new(&[0x66, 0x0f, 0x38, 0x1f], 0, 1),
+            };
+            match src {
+                RegMem::Reg { reg: src } => encode_evex(
+                    bits,
+                    dst.to_reg().get_hw_encoding(),
+                    0,
+                    src.get_hw_encoding(),
+                    EvexContext::v128(),
+                    EvexMasking::default(),
+                    sink,
+                ),
+                _ => todo!(),
+            }
+        }
+
         Inst::XmmRmR {
             op,
             src: src_e,
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index f730d25e935d..f03762b97bab 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3865,6 +3865,12 @@ fn test_x64_emit() {
         "cvtdq2pd %xmm2, %xmm8",
     ));
 
+    insns.push((
+        Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, RegMem::reg(xmm2), w_xmm8),
+        "6272FD081FC2",
+        "vpabsq %xmm2, %xmm8",
+    ));
+
     // Xmm to int conversions, and conversely.
 
     insns.push((
@@ -4276,6 +4282,7 @@ fn test_x64_emit() {
     let mut isa_flag_builder = x64::settings::builder();
     isa_flag_builder.enable("has_ssse3").unwrap();
     isa_flag_builder.enable("has_sse41").unwrap();
+    isa_flag_builder.enable("has_avx512f").unwrap();
     let isa_flags = x64::settings::Flags::new(&flags, isa_flag_builder);
 
     let rru = regs::create_reg_universe_systemv(&flags);
diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
index 099c97553152..71b97bfda14b 100644
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -225,6 +225,13 @@ pub enum Inst {
         dst: Writable<Reg>,
     },
 
+    /// XMM (vector) unary op using an EVEX encoding (AVX512): vpabsq
+    XmmUnaryRmREvex {
+        op: Avx512Opcode,
+        src: RegMem,
+        dst: Writable<Reg>,
+    },
+
     /// XMM (scalar or vector) unary op (from xmm to reg/mem): stores, movd, movq
     XmmMovRM {
         op: SseOpcode,
@@ -571,6 +578,8 @@ impl Inst {
             | Inst::XmmRmRImm { op, .. }
             | Inst::XmmToGpr { op, .. }
             | Inst::XmmUnaryRmR { op, .. } => smallvec![op.available_from()],
+
+            Inst::XmmUnaryRmREvex { op, .. } => op.available_from(),
         }
     }
 }
@@ -705,6 +714,12 @@ impl Inst {
         Inst::XmmUnaryRmR { op, src, dst }
     }
 
+    pub(crate) fn xmm_unary_rm_r_evex(op: Avx512Opcode, src: RegMem, dst: Writable<Reg>) -> Inst {
+        src.assert_regclass_is(RegClass::V128);
+        debug_assert!(dst.to_reg().get_class() == RegClass::V128);
+        Inst::XmmUnaryRmREvex { op, src, dst }
+    }
+
     pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Self {
         src.assert_regclass_is(RegClass::V128);
         debug_assert!(dst.to_reg().get_class() == RegClass::V128);
@@ -1400,6 +1415,13 @@ impl PrettyPrint for Inst {
                 show_ireg_sized(dst.to_reg(), mb_rru, 8),
             ),
 
+            Inst::XmmUnaryRmREvex { op, src, dst, .. } => format!(
+                "{} {}, {}",
+                ljustify(op.to_string()),
+                src.show_rru_sized(mb_rru, 8),
+                show_ireg_sized(dst.to_reg(), mb_rru, 8),
+            ),
+
             Inst::XmmMovRM { op, src, dst, .. } => format!(
                 "{} {}, {}",
                 ljustify(op.to_string()),
@@ -1872,7 +1894,9 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
                 collector.add_def(Writable::from_reg(regs::rdx()));
             }
         },
-        Inst::UnaryRmR { src, dst, .. } | Inst::XmmUnaryRmR { src, dst, .. } => {
+        Inst::UnaryRmR { src, dst, .. }
+        | Inst::XmmUnaryRmR { src, dst, .. }
+        | Inst::XmmUnaryRmREvex { src, dst, .. } => {
             src.get_regs_as_uses(collector);
             collector.add_def(*dst);
         }
@@ -2219,6 +2243,11 @@ fn x64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
             ref mut dst,
             ..
         }
+        | Inst::XmmUnaryRmREvex {
+            ref mut src,
+            ref mut dst,
+            ..
+        }
         | Inst::UnaryRmR {
             ref mut src,
             ref mut dst,
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index dc812a9e8f4f..7589f21672f6 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1855,25 +1855,30 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();
             if ty == types::I64X2 {
-                // This lowering could be a single instruction with AVX512F/VL's VPABSQ instruction.
-                // Instead, we use a separate register, `tmp`, to contain the results of `0 - src`
-                // and then blend in those results with `BLENDVPD` if the MSB of `tmp` was set to 1
-                // (i.e. if `tmp` was negative or, conversely, if `src` was originally positive).
+                // The 128-bit `VPABSQ` requires both AVX512F and AVX512VL, so check for both.
+                if isa_flags.use_avx512f_simd() && isa_flags.use_avx512vl_simd() {
+                    ctx.emit(Inst::xmm_unary_rm_r_evex(Avx512Opcode::Vpabsq, src, dst));
+                } else {
+                    // If `VPABSQ` from AVX512 is unavailable, we use a separate register, `tmp`, to
+                    // contain the results of `0 - src` and then blend in those results with
+                    // `BLENDVPD` if the MSB of `tmp` was set to 1 (i.e. if `tmp` was negative or,
+                    // conversely, if `src` was originally positive).
 
-                // Emit all 0s into the `tmp` register.
-                let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
-                // Subtract the lanes from 0 and set up `dst`.
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp));
-                ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty));
-                // Choose the subtracted lanes when `tmp` has an MSB of 1. BLENDVPD's semantics
-                // require the "choice" mask to be in XMM0.
-                ctx.emit(Inst::gen_move(
-                    Writable::from_reg(regs::xmm0()),
-                    tmp.to_reg(),
-                    ty,
-                ));
-                ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
+                    // Emit all 0s into the `tmp` register.
+                    let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+                    // Subtract the lanes from 0 and set up `dst`.
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp));
+                    ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty));
+                    // Choose the subtracted lanes when `tmp` has an MSB of 1. BLENDVPD's semantics
+                    // require the "choice" mask to be in XMM0.
+                    ctx.emit(Inst::gen_move(
+                        Writable::from_reg(regs::xmm0()),
+                        tmp.to_reg(),
+                        ty,
+                    ));
+                    ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
+                }
             } else if ty.is_vector() {
                 let opcode = match ty {
                     types::I8X16 => SseOpcode::Pabsb,