From d1e92af5caf4ff203e540b4618309f9d1674d884 Mon Sep 17 00:00:00 2001 From: Ralf Jung Date: Wed, 24 Jul 2024 20:55:43 +0200 Subject: [PATCH] nontemporal_store: make sure that the intrinsic is truly just a hint --- .../src/intrinsics/mod.rs | 3 +- compiler/rustc_codegen_gcc/src/builder.rs | 2 ++ compiler/rustc_codegen_llvm/src/builder.rs | 30 ++++++++++++++----- library/core/src/intrinsics.rs | 10 +++---- tests/codegen/intrinsics/nontemporal.rs | 19 ++++++++++-- 5 files changed, 49 insertions(+), 15 deletions(-) diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index b21c559e6686c..29deac6073035 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -725,7 +725,8 @@ fn codegen_regular_intrinsic_call<'tcx>( // Cranelift treats stores as volatile by default // FIXME correctly handle unaligned_volatile_store - // FIXME actually do nontemporal stores if requested + // FIXME actually do nontemporal stores if requested (but do not just emit MOVNT on x86; + // see the LLVM backend for details) let dest = CPlace::for_ptr(Pointer::new(ptr), val.layout()); dest.write_cvalue(fx, val); } diff --git a/compiler/rustc_codegen_gcc/src/builder.rs b/compiler/rustc_codegen_gcc/src/builder.rs index b9e4bd79fe1e6..45fee0eff9b8b 100644 --- a/compiler/rustc_codegen_gcc/src/builder.rs +++ b/compiler/rustc_codegen_gcc/src/builder.rs @@ -1128,6 +1128,8 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> { self.llbb().add_assignment(self.location, aligned_destination, val); // TODO(antoyo): handle align and flags. // NOTE: dummy value here since it's never used. FIXME(antoyo): API should not return a value here? + // When adding support for NONTEMPORAL, make sure to not just emit MOVNT on x86; see the + // LLVM backend for details. self.cx.context.new_rvalue_zero(self.type_i32()) } diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index 72ff9ea118e28..836da66032732 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -725,13 +725,29 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { llvm::LLVMSetVolatile(store, llvm::True); } if flags.contains(MemFlags::NONTEMPORAL) { - // According to LLVM [1] building a nontemporal store must - // *always* point to a metadata value of the integer 1. - // - // [1]: https://llvm.org/docs/LangRef.html#store-instruction - let one = self.cx.const_i32(1); - let node = llvm::LLVMMDNodeInContext(self.cx.llcx, &one, 1); - llvm::LLVMSetMetadata(store, llvm::MD_nontemporal as c_uint, node); + // Make sure that the current target architectures supports "sane" non-temporal + // stores, i.e., non-temporal stores that are equivalent to regular stores except + // for performance. LLVM doesn't seem to care about this, and will happily treat + // `!nontemporal` stores as-if they were normal stores (for reordering optimizations + // etc) even on x86, despite later lowering them to MOVNT which do *not* behave like + // regular stores but require special fences. + // So we keep a list of architectures where `!nontemporal` is known to be truly just + // a hint, and use regular stores everywhere else. + // (In the future, we could alternatively ensure that an sfence gets emitted after a sequence of movnt + // before any kind of synchronizing operation. But it's not clear how to do that with LLVM.) + const WELL_BEHAVED_NONTEMPORAL_ARCHS: &[&str] = &["aarch64", "arm"]; + + let use_nontemporal = + WELL_BEHAVED_NONTEMPORAL_ARCHS.contains(&&*self.cx.tcx.sess.target.arch); + if use_nontemporal { + // According to LLVM [1] building a nontemporal store must + // *always* point to a metadata value of the integer 1. + // + // [1]: https://llvm.org/docs/LangRef.html#store-instruction + let one = self.cx.const_i32(1); + let node = llvm::LLVMMDNodeInContext(self.cx.llcx, &one, 1); + llvm::LLVMSetMetadata(store, llvm::MD_nontemporal as c_uint, node); + } } store } diff --git a/library/core/src/intrinsics.rs b/library/core/src/intrinsics.rs index c4c6388338949..7bf6b030564a9 100644 --- a/library/core/src/intrinsics.rs +++ b/library/core/src/intrinsics.rs @@ -2405,12 +2405,12 @@ extern "rust-intrinsic" { #[rustc_nounwind] pub fn catch_unwind(try_fn: fn(*mut u8), data: *mut u8, catch_fn: fn(*mut u8, *mut u8)) -> i32; - /// Emits a `!nontemporal` store according to LLVM (see their docs). - /// Probably will never become stable. + /// Emits a `nontemporal` store, which gives a hint to the CPU that the data should not be held + /// in cache. Except for performance, this is fully equivalent to `ptr.write(val)`. /// - /// Do NOT use this intrinsic; "nontemporal" operations do not exist in our memory model! - /// It exists to support current stdarch, but the plan is to change stdarch and remove this intrinsic. - /// See for some more discussion. + /// Not all architectures provide such an operation. For instance, x86 does not: while `MOVNT` + /// exists, that operation is *not* equivalent to `ptr.write(val)` (`MOVNT` writes can be reordered + /// in ways that are not allowed for regular writes). #[rustc_nounwind] pub fn nontemporal_store(ptr: *mut T, val: T); diff --git a/tests/codegen/intrinsics/nontemporal.rs b/tests/codegen/intrinsics/nontemporal.rs index 076d6d6d9da49..828cb7e828761 100644 --- a/tests/codegen/intrinsics/nontemporal.rs +++ b/tests/codegen/intrinsics/nontemporal.rs @@ -1,13 +1,28 @@ //@ compile-flags: -O +//@ compile-flags: --target aarch64-unknown-linux-gnu +//@ needs-llvm-components: aarch64 -#![feature(core_intrinsics)] +#![feature(no_core, lang_items, intrinsics)] +#![no_core] #![crate_type = "lib"] +#[lang = "sized"] +pub trait Sized {} +#[lang = "copy"] +pub trait Copy {} + +impl Copy for u32 {} +impl Copy for *mut T {} + +extern "rust-intrinsic" { + pub fn nontemporal_store(ptr: *mut T, val: T); +} + #[no_mangle] pub fn a(a: &mut u32, b: u32) { // CHECK-LABEL: define{{.*}}void @a // CHECK: store i32 %b, ptr %a, align 4, !nontemporal unsafe { - std::intrinsics::nontemporal_store(a, b); + nontemporal_store(a, b); } }