From f55358e20c625cfe697e887d3b9a0ee59c47d892 Mon Sep 17 00:00:00 2001
From: Taiki Endo
Date: Sun, 4 Jun 2023 22:59:50 +0900
Subject: [PATCH] Optimize atomic float on NVPTX

---
 README.md        |   2 +-
 build.rs         |  33 +++-
 src/imp/float.rs |   5 +-
 src/imp/mod.rs   |  21 +++
 src/imp/nvptx.rs | 391 +++++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs       |   5 +-
 tools/build.sh   |   8 +
 7 files changed, 458 insertions(+), 7 deletions(-)
 create mode 100644 src/imp/nvptx.rs

diff --git a/README.md b/README.md
index dbc347e1..fa714718 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ See the [`atomic128` module's readme](https://github.com/taiki-e/portable-atomic
 - **`float`**<br>
   Provide `AtomicF{32,64}`.
-  Note that most of `fetch_*` operations of atomic floats are implemented using CAS loops, which can be slower than equivalent operations of atomic integers. ([GPU targets have atomic instructions for float, so we plan to use these instructions for GPU targets in the future.](https://github.com/taiki-e/portable-atomic/issues/34))
+  Note that most `fetch_*` operations on atomic floats are implemented using CAS loops, which can be slower than the equivalent operations on atomic integers. (GPU targets have native atomic instructions for floats, so we use those instructions on GPU targets on nightly.)
 - **`std`**<br>
   Use `std`.

diff --git a/build.rs b/build.rs
index 93935122..2f44606e 100644
--- a/build.rs
+++ b/build.rs
@@ -162,7 +162,8 @@ fn main() {
     // https://github.com/rust-lang/rust/pull/111331 merged in Rust 1.71 (nightly-2023-05-09).
     if !no_asm
         && (target_arch == "powerpc64" && version.probe(60, 2022, 2, 12)
-            || target_arch == "s390x" && version.probe(71, 2023, 5, 8))
+            || target_arch == "s390x" && version.probe(71, 2023, 5, 8)
+            || target_arch == "nvptx64")
         && is_allowed_feature("asm_experimental_arch")
     {
         println!("cargo:rustc-cfg=portable_atomic_unstable_asm_experimental_arch");
@@ -328,6 +329,36 @@ fn main() {
                 false,
             );
         }
+        "nvptx64" => {
+            let mut has_sm_70 = false;
+            if let Some(rustflags) = env::var_os("CARGO_ENCODED_RUSTFLAGS") {
+                for mut flag in rustflags.to_string_lossy().split('\x1f') {
+                    flag = strip_prefix(flag, "-C").unwrap_or(flag);
+                    if let Some(flag) = strip_prefix(flag, "target-feature=") {
+                        for s in flag.split(',') {
+                            // TODO: Handle cases where a specific target feature
+                            // implicitly enables another target feature.
+                            match (s.as_bytes().first(), s.get(1..)) {
+                                (Some(b'+'), Some(f)) => {
+                                    if let Some(sm) = strip_prefix(f, "sm_") {
+                                        if let Ok(sm) = sm.parse::<u32>() {
+                                            if sm >= 70 {
+                                                has_sm_70 = true;
+                                            }
+                                        }
+                                    }
+                                }
+                                (Some(b'-'), Some(_f)) => {
+                                    // TODO
+                                }
+                                _ => {}
+                            }
+                        }
+                    }
+                }
+            }
+            target_feature_if("sm_70", has_sm_70, &version, None, false);
+        }
         _ => {}
     }
 }

diff --git a/src/imp/float.rs b/src/imp/float.rs
index 6d6ac4b0..6786459e 100644
--- a/src/imp/float.rs
+++ b/src/imp/float.rs
@@ -5,9 +5,8 @@
 // Note that most of `fetch_*` operations of atomic floats are implemented using
 // CAS loops, which can be slower than equivalent operations of atomic integers.
 //
-// GPU targets have atomic instructions for float, so GPU targets will use
-// architecture-specific implementations instead of this implementation in the
-// future: https://github.com/taiki-e/portable-atomic/issues/34
+// GPU targets have native atomic instructions for floats, so we use those
+// instructions on GPU targets on nightly (see nvptx.rs).
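For readers who have not seen the CAS-loop fallback that the comments above refer to, the following is a minimal, hypothetical sketch (not code from this crate) of how a float `fetch_add` can be built from `compare_exchange_weak` on an `AtomicU32` holding the float's bit pattern:

```rust
use core::sync::atomic::{AtomicU32, Ordering};

// Hypothetical helper for illustration only: fetch_add for an f32 stored as
// its bit pattern in an AtomicU32, retried in a compare-and-swap (CAS) loop.
fn fetch_add_f32_via_cas(bits: &AtomicU32, val: f32, order: Ordering) -> f32 {
    let mut current = bits.load(Ordering::Relaxed);
    loop {
        let new = f32::from_bits(current) + val;
        match bits.compare_exchange_weak(current, new.to_bits(), order, Ordering::Relaxed) {
            // Success: return the previous value, as fetch_add does.
            Ok(prev) => return f32::from_bits(prev),
            // Another thread updated the value first; retry with what it wrote.
            Err(actual) => current = actual,
        }
    }
}
```

Under contention every failed CAS costs another retry, which is why the single-instruction `atom.add` path added for NVPTX below can be noticeably faster.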
#![cfg_attr( all(target_pointer_width = "16", not(feature = "fallback")), diff --git a/src/imp/mod.rs b/src/imp/mod.rs index 2f52f9f2..1eb362bb 100644 --- a/src/imp/mod.rs +++ b/src/imp/mod.rs @@ -135,6 +135,14 @@ mod arm_linux; #[cfg(target_arch = "msp430")] pub(crate) mod msp430; +#[cfg(feature = "float")] +#[cfg(all( + target_arch = "nvptx64", + any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"), + portable_atomic_unstable_asm_experimental_arch, +))] +pub(crate) mod nvptx; + // atomic load/store for RISC-V without A-extension #[cfg(any(test, not(feature = "critical-section")))] #[cfg_attr(portable_atomic_no_cfg_target_has_atomic, cfg(any(test, portable_atomic_no_atomic_cas)))] @@ -219,8 +227,21 @@ mod interrupt; // Atomic float implementations #[cfg(feature = "float")] +#[cfg(not(all( + target_arch = "nvptx64", + any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"), + portable_atomic_unstable_asm_experimental_arch, +)))] pub(crate) mod float; +#[cfg(feature = "float")] +#[cfg(all( + target_arch = "nvptx64", + any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70"), + portable_atomic_unstable_asm_experimental_arch, +))] +pub(crate) use nvptx as float; + // ----------------------------------------------------------------------------- #[cfg(not(any( diff --git a/src/imp/nvptx.rs b/src/imp/nvptx.rs new file mode 100644 index 00000000..02b8dfcd --- /dev/null +++ b/src/imp/nvptx.rs @@ -0,0 +1,391 @@ +// Atomic float implementation on NVPTX. +// +// Refs: +// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld +// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom +// - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar +// - User Guide for NVPTX Back-end (LLVM documentation) https://llvm.org/docs/NVPTXUsage.html +// - https://github.com/NVIDIA/libcudacxx/blob/1.9.0-rc1/include/cuda/std/detail/libcxx/include/support/atomic/atomic_cuda_generated.h +// +// This module is currently enabled on sm_70+. +// TODO: Support pre-sm_70 + +use core::{arch::asm, sync::atomic::Ordering}; + +// NVPTX's seqcst atomic op is preceding seqcst fence + acquire op. +macro_rules! fence_sc { + () => { + "fence.sc.gl;" + }; +} + +macro_rules! atomic_rmw { + ($op:ident, $order:ident) => { + match $order { + Ordering::Relaxed => $op!("relaxed", ""), + Ordering::Acquire => $op!("acquire", ""), + Ordering::Release => $op!("release", ""), + Ordering::AcqRel => $op!("acqrel", ""), + Ordering::SeqCst => $op!("acquire", fence_sc!()), + _ => unreachable!("{:?}", $order), + } + }; +} + +macro_rules! atomic_float { + ( + $atomic_type:ident, $float_type:ident, $atomic_int_type:ident, $int_type:ident, + $val_reg:ident, $align:expr + ) => { + #[repr(C, align($align))] + pub(crate) struct $atomic_type { + v: core::cell::UnsafeCell<$float_type>, + } + + // Send is implicitly implemented. + // SAFETY: any data races are prevented by atomic operations. 
+ unsafe impl Sync for $atomic_type {} + + impl $atomic_type { + #[inline] + pub(crate) const fn new(v: $float_type) -> Self { + Self { v: core::cell::UnsafeCell::new(v) } + } + + #[inline] + pub(crate) fn is_lock_free() -> bool { + true + } + #[inline] + pub(crate) const fn is_always_lock_free() -> bool { + true + } + + #[inline] + pub(crate) fn get_mut(&mut self) -> &mut $float_type { + // SAFETY: the mutable reference guarantees unique ownership. + // (UnsafeCell::get_mut requires Rust 1.50) + unsafe { &mut *self.v.get() } + } + + #[inline] + pub(crate) fn into_inner(self) -> $float_type { + self.v.into_inner() + } + + #[inline] + #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] + pub(crate) fn load(&self, order: Ordering) -> $float_type { + let src = self.v.get(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! atomic_load { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("ld.", $sem, ".gpu.", stringify!($float_type), " {out}, [{src}];"), + src = in(reg64) src, + out = out($val_reg) out, + options(nostack), + ) + }; + } + match order { + Ordering::Relaxed => atomic_load!("relaxed", ""), + Ordering::Acquire => atomic_load!("acquire", ""), + Ordering::SeqCst => atomic_load!("acquire", fence_sc!()), + _ => unreachable!("{:?}", order), + } + } + out + } + + #[inline] + #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] + pub(crate) fn store(&self, val: $float_type, order: Ordering) { + let dst = self.v.get(); + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! atomic_store { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("st.", $sem, ".gpu.", stringify!($float_type), " [{dst}], {val};"), + dst = in(reg64) dst, + val = in($val_reg) val, + options(nostack), + ) + }; + } + match order { + Ordering::Relaxed => atomic_store!("relaxed", ""), + Ordering::Release => atomic_store!("release", ""), + Ordering::SeqCst => atomic_store!("relaxed", fence_sc!()), + _ => unreachable!("{:?}", order), + } + } + } + + #[inline] + pub(crate) fn swap(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.v.get(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! swap { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("atom.", $sem, ".gpu.exch.", stringify!($float_type), " {out}, [{dst}], {val};"), + dst = in(reg64) dst, + val = in($val_reg) val, + out = out($val_reg) out, + options(nostack), + ) + }; + } + atomic_rmw!(swap, order); + } + out + } + + #[inline] + #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] + pub(crate) fn compare_exchange( + &self, + old: $float_type, + new: $float_type, + success: Ordering, + failure: Ordering, + ) -> Result<$float_type, $float_type> { + let order = crate::utils::upgrade_success_ordering(success, failure); + let dst = self.v.get(); + let out: $float_type; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! 
cmpxchg { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("atom.", $sem, ".gpu.cas.", stringify!($float_type), " {out}, [{dst}], {old}, {new};"), + dst = in(reg64) dst, + old = in($val_reg) old, + new = in($val_reg) new, + out = out($val_reg) out, + options(nostack), + ) + }; + } + atomic_rmw!(cmpxchg, order); + } + if out.to_bits() == old.to_bits() { + Ok(out) + } else { + Err(out) + } + } + + #[inline] + #[cfg_attr(all(debug_assertions, not(portable_atomic_no_track_caller)), track_caller)] + pub(crate) fn compare_exchange_weak( + &self, + current: $float_type, + new: $float_type, + success: Ordering, + failure: Ordering, + ) -> Result<$float_type, $float_type> { + self.compare_exchange(current, new, success, failure) + } + + #[inline] + pub(crate) fn fetch_add(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.v.get(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! add { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("atom.", $sem, ".gpu.add.", stringify!($float_type), " {out}, [{dst}], {val};"), + dst = in(reg64) dst, + val = in($val_reg) val, + out = out($val_reg) out, + options(nostack), + ) + }; + } + atomic_rmw!(add, order); + } + out + } + + #[inline] + pub(crate) fn fetch_sub(&self, val: $float_type, order: Ordering) -> $float_type { + // There is no atom.sub, so add `-val`. + self.fetch_add(-val, order) + } + + #[allow(dead_code)] // TODO + #[inline] + pub(crate) fn fetch_and(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.v.get(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! and { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("atom.", $sem, ".gpu.and.", stringify!($float_type), " {out}, [{dst}], {val};"), + dst = in(reg64) dst, + val = in($val_reg) val, + out = out($val_reg) out, + options(nostack), + ) + }; + } + atomic_rmw!(and, order); + } + out + } + + #[allow(dead_code)] // TODO + #[inline] + pub(crate) fn fetch_or(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.v.get(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! or { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("atom.", $sem, ".gpu.or.", stringify!($float_type), " {out}, [{dst}], {val};"), + dst = in(reg64) dst, + val = in($val_reg) val, + out = out($val_reg) out, + options(nostack), + ) + }; + } + atomic_rmw!(or, order); + } + out + } + + #[allow(dead_code)] // TODO + #[inline] + pub(crate) fn fetch_xor(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.v.get(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! 
xor { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("atom.", $sem, ".gpu.xor.", stringify!($float_type), " {out}, [{dst}], {val};"), + dst = in(reg64) dst, + val = in($val_reg) val, + out = out($val_reg) out, + options(nostack), + ) + }; + } + atomic_rmw!(xor, order); + } + out + } + + #[inline] + pub(crate) fn fetch_max(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.v.get(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! max { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("atom.", $sem, ".gpu.max.", stringify!($float_type), " {out}, [{dst}], {val};"), + dst = in(reg64) dst, + val = in($val_reg) val, + out = out($val_reg) out, + options(nostack), + ) + }; + } + atomic_rmw!(max, order); + } + out + } + + #[inline] + pub(crate) fn fetch_min(&self, val: $float_type, order: Ordering) -> $float_type { + let dst = self.v.get(); + let out; + // SAFETY: any data races are prevented by atomic intrinsics and the raw + // pointer passed in is valid because we got it from a reference. + unsafe { + macro_rules! min { + ($sem:tt, $fence_sc:expr) => { + asm!( + $fence_sc, + concat!("atom.", $sem, ".gpu.min.", stringify!($float_type), " {out}, [{dst}], {val};"), + dst = in(reg64) dst, + val = in($val_reg) val, + out = out($val_reg) out, + options(nostack), + ) + }; + } + atomic_rmw!(min, order); + } + out + } + + #[inline] + pub(crate) fn fetch_neg(&self, order: Ordering) -> $float_type { + const NEG_MASK: $int_type = !0 / 2 + 1; + // TODO: use self.fetch_xor + $float_type::from_bits(self.as_bits().fetch_xor(NEG_MASK, order)) + } + + #[inline] + pub(crate) fn fetch_abs(&self, order: Ordering) -> $float_type { + const ABS_MASK: $int_type = !0 / 2; + // TODO: use self.fetch_and + $float_type::from_bits(self.as_bits().fetch_and(ABS_MASK, order)) + } + + const_fn! { + const_if: #[cfg(not(portable_atomic_no_const_raw_ptr_deref))]; + #[inline] + pub(crate) const fn as_bits(&self) -> &crate::$atomic_int_type { + // SAFETY: $atomic_type and $atomic_int_type have the same layout, + // and there is no concurrent access to the value that does not go through this method. + unsafe { &*(self as *const $atomic_type as *const crate::$atomic_int_type) } + } + } + + #[inline] + pub(crate) const fn as_ptr(&self) -> *mut $float_type { + self.v.get() + } + } + }; +} + +atomic_float!(AtomicF32, f32, AtomicU32, u32, reg32, 4); +atomic_float!(AtomicF64, f64, AtomicU64, u64, reg64, 8); diff --git a/src/lib.rs b/src/lib.rs index c532670d..a575af51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -62,7 +62,7 @@ See the [`atomic128` module's readme](https://github.com/taiki-e/portable-atomic - **`float`**
   Provide `AtomicF{32,64}`.
-  Note that most of `fetch_*` operations of atomic floats are implemented using CAS loops, which can be slower than equivalent operations of atomic integers. ([GPU targets have atomic instructions for float, so we plan to use these instructions for GPU targets in the future.](https://github.com/taiki-e/portable-atomic/issues/34))
+  Note that most `fetch_*` operations on atomic floats are implemented using CAS loops, which can be slower than the equivalent operations on atomic integers. (GPU targets have native atomic instructions for floats, so we use those instructions on GPU targets on nightly.)
 - **`std`**<br>
Use `std`. @@ -237,7 +237,7 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ... )] // asm_experimental_arch // AVR, MSP430, and Xtensa are tier 3 platforms and require nightly anyway. -// On tier 2 platforms (powerpc64 and s390x), we use cfg set by build script to +// On tier 2 platforms (powerpc64, s390x, and nvptx64), we use cfg set by build script to // determine whether this feature is available or not. #![cfg_attr( all( @@ -248,6 +248,7 @@ RUSTFLAGS="--cfg portable_atomic_no_outline_atomics" cargo ... all(target_arch = "xtensa", portable_atomic_unsafe_assume_single_core), all(target_arch = "powerpc64", portable_atomic_unstable_asm_experimental_arch), all(target_arch = "s390x", portable_atomic_unstable_asm_experimental_arch), + all(target_arch = "nvptx64", portable_atomic_unstable_asm_experimental_arch), ), ), feature(asm_experimental_arch) diff --git a/tools/build.sh b/tools/build.sh index 81538eae..4becdf53 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -103,6 +103,9 @@ default_targets=( # s390x # rustc --print target-list | grep -E '^s390x' s390x-unknown-linux-gnu + + # nvptx64 + nvptx64-nvidia-cuda ) known_cfgs=( # Public APIs @@ -603,6 +606,11 @@ build() { RUSTFLAGS="${target_rustflags} -C target-cpu=z15" \ x_cargo "${args[@]}" "$@" ;; + nvptx64-*) + CARGO_TARGET_DIR="${target_dir}/sm_70" \ + RUSTFLAGS="${target_rustflags} -C target-feature=+sm_70" \ + x_cargo "${args[@]}" "$@" + ;; esac }
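The `-C target-feature=+sm_70` flag passed in tools/build.sh above is what the new `"nvptx64"` arm in build.rs looks for. As a rough, self-contained sketch of that detection (using the standard library's `str::strip_prefix` instead of the crate's own `strip_prefix` helper, and a made-up function name), assuming Cargo's `CARGO_ENCODED_RUSTFLAGS` convention of `'\x1f'`-separated flags:

```rust
// Hypothetical stand-alone version of the sm_70 detection in build.rs above.
// `encoded_rustflags` plays the role of the CARGO_ENCODED_RUSTFLAGS value,
// in which Cargo separates individual flags with '\x1f'.
fn rustflags_enable_sm_70(encoded_rustflags: &str) -> bool {
    let mut has_sm_70 = false;
    for mut flag in encoded_rustflags.split('\x1f') {
        flag = flag.strip_prefix("-C").unwrap_or(flag);
        if let Some(features) = flag.strip_prefix("target-feature=") {
            for feature in features.split(',') {
                // "+sm_NN" enables the feature; like the build.rs arm above,
                // this sketch ignores "-sm_NN" and implied target features.
                if let Some(sm) = feature.strip_prefix("+sm_") {
                    if let Ok(sm) = sm.parse::<u32>() {
                        if sm >= 70 {
                            has_sm_70 = true;
                        }
                    }
                }
            }
        }
    }
    has_sm_70
}

fn main() {
    assert!(rustflags_enable_sm_70("-Ctarget-feature=+sm_70"));
    assert!(!rustflags_enable_sm_70("-Ctarget-feature=+sm_60,+other"));
}
```

When the check succeeds, build.rs calls `target_feature_if("sm_70", ...)`, which is what allows the `any(target_feature = "sm_70", portable_atomic_target_feature = "sm_70")` gate in src/imp/mod.rs to select the nvptx module.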
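Because `atomic_float!` hides the generated code behind a macro, it may also help to see roughly what one expansion looks like. The sketch below is illustrative only (a hypothetical free function, not part of the patch); it mirrors the `fetch_add` arm and the SeqCst mapping from `atomic_rmw!` (a preceding `fence.sc` followed by an acquire-ordered `atom.add`), and it requires nightly, `#![feature(asm_experimental_arch)]`, and an nvptx64 target with sm_70 or later:

```rust
use core::arch::asm;

/// Roughly what `AtomicF32::fetch_add` with `Ordering::SeqCst` expands to in
/// src/imp/nvptx.rs: a `fence.sc` followed by an acquire-ordered `atom.add`.
///
/// # Safety
/// `dst` must be non-null, properly aligned, and valid for reads and writes.
unsafe fn fetch_add_f32_seqcst(dst: *mut f32, val: f32) -> f32 {
    let out: f32;
    asm!(
        // SeqCst RMW = preceding seqcst fence + acquire RMW (see atomic_rmw!).
        "fence.sc.gl;",
        "atom.acquire.gpu.add.f32 {out}, [{dst}], {val};",
        dst = in(reg64) dst,
        val = in(reg32) val,
        out = out(reg32) out,
        options(nostack),
    );
    out
}
```

From the caller's side nothing changes: `AtomicF32::fetch_add(x, Ordering::Relaxed)` keeps its existing signature, but on sm_70+ it now compiles to a single `atom.add.f32` instead of a CAS loop.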