Skip to content

Commit

Permalink
aarch64: Revert "Use CASP instead of LDXP/STXP for store/RMW if avail…
Browse files Browse the repository at this point in the history
…able" on Apple hardware

As of Apple M1/M1 Pro, on Apple hardware, CAS loop-based RMW is much
slower than LL/SC loop-based RMW.
So, revert 93e6ec5 on Apple hardware
for now.

```
Benchmarking bench_portable_atomic_arch/u128_concurrent_swap: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 7.9s, enable flat sampling, or reduce sample count to 50.
Benchmarking bench_portable_atomic_arch/u128_concurrent_swap: Collecting 100
bench_portable_atomic_arch/u128_concurrent_swap
                        time:   [1.4365 ms 1.5230 ms 1.5948 ms]
                        change: [+378.49% +402.44% +424.27%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 12 outliers among 100 measurements (12.00%)
  8 (8.00%) low severe
  4 (4.00%) low mild
Benchmarking bench_portable_atomic_arch/u128_concurrent_store_swap: Warming u
Benchmarking bench_portable_atomic_arch/u128_concurrent_store_swap: Collectin
bench_portable_atomic_arch/u128_concurrent_store_swap
                        time:   [286.21 µs 292.15 µs 296.65 µs]
                        change: [+96.504% +102.53% +108.63%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 11 outliers among 100 measurements (11.00%)
  6 (6.00%) low severe
  2 (2.00%) low mild
  1 (1.00%) high mild
  2 (2.00%) high severe
Benchmarking bench_portable_atomic_arch/u128_concurrent_fetch_add: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 8.8s, enable flat sampling, or reduce sample count to 50.
Benchmarking bench_portable_atomic_arch/u128_concurrent_fetch_add: Collecting
bench_portable_atomic_arch/u128_concurrent_fetch_add
                        time:   [1.6351 ms 1.6787 ms 1.7170 ms]
                        change: [+279.29% +294.80% +309.89%] (p = 0.00 < 0.05)
                        Performance has regressed.
Found 13 outliers among 100 measurements (13.00%)
  8 (8.00%) low severe
  4 (4.00%) low mild
  1 (1.00%) high mild
```
  • Loading branch information
taiki-e committed Mar 25, 2023
1 parent d97ddb7 commit ceccecb
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 116 deletions.
6 changes: 6 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,12 @@ fn main() {
// As of rustc 1.68, target_feature "lse2" is not available on rustc side:
// https://github.com/rust-lang/rust/blob/1.68.0/compiler/rustc_codegen_ssa/src/target_features.rs#L58
target_feature_if("lse2", is_macos, &version, None, false);

// As of Apple M1/M1 Pro, on Apple hardware, CAS loop-based RMW is much slower than LL/SC
// loop-based RMW.
if is_macos || target_os == "ios" || target_os == "tvos" || target_os == "watchos" {
println!("cargo:rustc-cfg=portable_atomic_ll_sc_rmw");
}
}
"arm" => {
// #[cfg(target_feature = "v7")] and others don't work on stable.
Expand Down
256 changes: 140 additions & 116 deletions src/imp/atomic128/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
// at run-time, otherwise, use LDXP/STXP loop.
// If FEAT_LSE is available at compile-time, we use CASP for load/store/CAS/RMW.
// If FEAT_LSE2 is available at compile-time, we use LDP/STP for load/store.
// When portable_atomic_ll_sc_rmw cfg is set, use LDXP/STXP loop instead of CASP
// loop for RMW. (by default, it is set on Apple hardware; see build script for details)
//
// Note: FEAT_LSE2 doesn't imply FEAT_LSE.
//
Expand Down Expand Up @@ -535,13 +537,19 @@ use self::atomic_compare_exchange as atomic_compare_exchange_weak;

#[inline]
unsafe fn atomic_swap(dst: *mut u128, val: u128, order: Ordering) -> u128 {
#[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))]
#[cfg(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
))]
// SAFETY: the caller must uphold the safety contract.
// cfg guarantee that the CPU supports FEAT_LSE.
unsafe {
_atomic_swap_casp(dst, val, order)
}
#[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))]
#[cfg(not(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
)))]
// SAFETY: the caller must uphold the safety contract.
unsafe {
_atomic_swap_ldxp_stxp(dst, val, order)
Expand Down Expand Up @@ -631,12 +639,14 @@ unsafe fn _atomic_swap_ldxp_stxp(dst: *mut u128, val: u128, order: Ordering) ->
/// - prev_lo/prev_hi pair: previous value loaded by ll (read-only for `$op`)
/// - new_lo/new_hi pair: new value that will be stored by sc
macro_rules! atomic_rmw_ll_sc_3 {
($name:ident as $name_no_lse:ident, options($($options:tt)*), $($op:tt)*) => {
($name:ident as $reexport_name:ident, options($($options:tt)*), $($op:tt)*) => {
// If FEAT_LSE is available at compile-time, we use CAS based Atomic RMW
// generated by atomic_rmw_by_atomic_update! macro.
#[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))]
use $name as $name_no_lse;
#[cfg(any(test, not(any(target_feature = "lse", portable_atomic_target_feature = "lse"))))]
#[cfg(not(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
)))]
use $name as $reexport_name;
#[inline]
unsafe fn $name(dst: *mut u128, val: u128, order: Ordering) -> u128 {
debug_assert!(dst as usize % 16 == 0);
Expand Down Expand Up @@ -672,7 +682,7 @@ macro_rules! atomic_rmw_ll_sc_3 {
#[cfg(test)]
paste::paste! {
// Helper to test $op separately.
unsafe fn [<$name_no_lse _op>](dst: *mut u128, val: u128) -> u128 {
unsafe fn [<$reexport_name _op>](dst: *mut u128, val: u128) -> u128 {
// SAFETY: the caller must uphold the safety contract.
unsafe {
$name(dst, val, Ordering::Relaxed)
Expand All @@ -689,7 +699,12 @@ macro_rules! atomic_rmw_ll_sc_3 {
/// - x6/x7 pair: previous value loaded (read-only for `$op`)
/// - x4/x5 pair: new value that will be stored
macro_rules! atomic_rmw_cas_3 {
($name:ident, $($op:tt)*) => {
($name:ident as $reexport_name:ident, $($op:tt)*) => {
#[cfg(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
))]
use $name as $reexport_name;
#[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))]
#[inline]
unsafe fn $name(dst: *mut u128, val: u128, order: Ordering) -> u128 {
Expand Down Expand Up @@ -738,102 +753,6 @@ macro_rules! atomic_rmw_cas_3 {
}
};
}
atomic_rmw_ll_sc_3! {
_atomic_add_ldxp_stxp as atomic_add,
// Do not use `preserves_flags` because ADDS and ADCS modify the condition flags.
options(nostack),
concat!(
"adds ",
select_le_or_be!("{new_lo}, {prev_lo}, {val_lo}", "{new_hi}, {prev_hi}, {val_hi}")
),
concat!(
"adc ",
select_le_or_be!("{new_hi}, {prev_hi}, {val_hi}", "{new_lo}, {prev_lo}, {val_lo}")
),
}
atomic_rmw_cas_3! {
atomic_add,
concat!(
"adds ",
select_le_or_be!("x4, x6, {val_lo}", "x5, x7, {val_hi}")
),
concat!(
"adc ",
select_le_or_be!("x5, x7, {val_hi}", "x4, x6, {val_lo}")
),
}
atomic_rmw_ll_sc_3! {
_atomic_sub_ldxp_stxp as atomic_sub,
// Do not use `preserves_flags` because SUBS and SBCS modify the condition flags.
options(nostack),
concat!(
"subs ",
select_le_or_be!("{new_lo}, {prev_lo}, {val_lo}", "{new_hi}, {prev_hi}, {val_hi}")
),
concat!(
"sbc ",
select_le_or_be!("{new_hi}, {prev_hi}, {val_hi}", "{new_lo}, {prev_lo}, {val_lo}")
),
}
atomic_rmw_cas_3! {
atomic_sub,
concat!(
"subs ",
select_le_or_be!("x4, x6, {val_lo}", "x5, x7, {val_hi}")
),
concat!(
"sbc ",
select_le_or_be!("x5, x7, {val_hi}", "x4, x6, {val_lo}")
),
}
atomic_rmw_ll_sc_3! {
_atomic_and_ldxp_stxp as atomic_and,
options(nostack, preserves_flags),
"and {new_lo}, {prev_lo}, {val_lo}",
"and {new_hi}, {prev_hi}, {val_hi}",
}
atomic_rmw_cas_3! {
atomic_and,
"and x4, x6, {val_lo}",
"and x5, x7, {val_hi}",
}
atomic_rmw_ll_sc_3! {
_atomic_nand_ldxp_stxp as atomic_nand,
options(nostack, preserves_flags),
"and {new_lo}, {prev_lo}, {val_lo}",
"mvn {new_lo}, {new_lo}",
"and {new_hi}, {prev_hi}, {val_hi}",
"mvn {new_hi}, {new_hi}",
}
atomic_rmw_cas_3! {
atomic_nand,
"and x4, x6, {val_lo}",
"mvn x4, x4",
"and x5, x7, {val_hi}",
"mvn x5, x5",
}
atomic_rmw_ll_sc_3! {
_atomic_or_ldxp_stxp as atomic_or,
options(nostack, preserves_flags),
"orr {new_lo}, {prev_lo}, {val_lo}",
"orr {new_hi}, {prev_hi}, {val_hi}",
}
atomic_rmw_cas_3! {
atomic_or,
"orr x4, x6, {val_lo}",
"orr x5, x7, {val_hi}",
}
atomic_rmw_ll_sc_3! {
_atomic_xor_ldxp_stxp as atomic_xor,
options(nostack, preserves_flags),
"eor {new_lo}, {prev_lo}, {val_lo}",
"eor {new_hi}, {prev_hi}, {val_hi}",
}
atomic_rmw_cas_3! {
atomic_xor,
"eor x4, x6, {val_lo}",
"eor x5, x7, {val_hi}",
}

/// Atomic RMW by LL/SC loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
Expand All @@ -842,12 +761,14 @@ atomic_rmw_cas_3! {
/// - prev_lo/prev_hi pair: previous value loaded by ll (read-only for `$op`)
/// - new_lo/new_hi pair: new value that will be stored by sc
macro_rules! atomic_rmw_ll_sc_2 {
($name:ident as $name_no_lse:ident, options($($options:tt)*), $($op:tt)*) => {
($name:ident as $reexport_name:ident, options($($options:tt)*), $($op:tt)*) => {
// If FEAT_LSE is available at compile-time, we use CAS based Atomic RMW
// generated by atomic_rmw_by_atomic_update! macro.
#[cfg(not(any(target_feature = "lse", portable_atomic_target_feature = "lse")))]
use $name as $name_no_lse;
#[cfg(any(test, not(any(target_feature = "lse", portable_atomic_target_feature = "lse"))))]
#[cfg(not(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
)))]
use $name as $reexport_name;
#[inline]
unsafe fn $name(dst: *mut u128, order: Ordering) -> u128 {
debug_assert!(dst as usize % 16 == 0);
Expand Down Expand Up @@ -880,7 +801,7 @@ macro_rules! atomic_rmw_ll_sc_2 {
#[cfg(test)]
paste::paste! {
// Helper to test $op separately.
unsafe fn [<$name_no_lse _op>](dst: *mut u128) -> u128 {
unsafe fn [<$reexport_name _op>](dst: *mut u128) -> u128 {
// SAFETY: the caller must uphold the safety contract.
unsafe {
$name(dst, Ordering::Relaxed)
Expand All @@ -896,7 +817,12 @@ macro_rules! atomic_rmw_ll_sc_2 {
/// - x6/x7 pair: previous value loaded (read-only for `$op`)
/// - x4/x5 pair: new value that will be stored
macro_rules! atomic_rmw_cas_2 {
($name:ident, $($op:tt)*) => {
($name:ident as $reexport_name:ident, $($op:tt)*) => {
#[cfg(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
))]
use $name as $reexport_name;
#[cfg(any(target_feature = "lse", portable_atomic_target_feature = "lse"))]
#[inline]
unsafe fn $name(dst: *mut u128, order: Ordering) -> u128 {
Expand Down Expand Up @@ -942,14 +868,112 @@ macro_rules! atomic_rmw_cas_2 {
}
};
}

atomic_rmw_ll_sc_3! {
_atomic_add_ldxp_stxp as atomic_add,
// Do not use `preserves_flags` because ADDS and ADCS modify the condition flags.
options(nostack),
concat!(
"adds ",
select_le_or_be!("{new_lo}, {prev_lo}, {val_lo}", "{new_hi}, {prev_hi}, {val_hi}")
),
concat!(
"adc ",
select_le_or_be!("{new_hi}, {prev_hi}, {val_hi}", "{new_lo}, {prev_lo}, {val_lo}")
),
}
atomic_rmw_cas_3! {
_atomic_add_casp as atomic_add,
concat!(
"adds ",
select_le_or_be!("x4, x6, {val_lo}", "x5, x7, {val_hi}")
),
concat!(
"adc ",
select_le_or_be!("x5, x7, {val_hi}", "x4, x6, {val_lo}")
),
}
atomic_rmw_ll_sc_3! {
_atomic_sub_ldxp_stxp as atomic_sub,
// Do not use `preserves_flags` because SUBS and SBCS modify the condition flags.
options(nostack),
concat!(
"subs ",
select_le_or_be!("{new_lo}, {prev_lo}, {val_lo}", "{new_hi}, {prev_hi}, {val_hi}")
),
concat!(
"sbc ",
select_le_or_be!("{new_hi}, {prev_hi}, {val_hi}", "{new_lo}, {prev_lo}, {val_lo}")
),
}
atomic_rmw_cas_3! {
_atomic_sub_casp as atomic_sub,
concat!(
"subs ",
select_le_or_be!("x4, x6, {val_lo}", "x5, x7, {val_hi}")
),
concat!(
"sbc ",
select_le_or_be!("x5, x7, {val_hi}", "x4, x6, {val_lo}")
),
}
atomic_rmw_ll_sc_3! {
_atomic_and_ldxp_stxp as atomic_and,
options(nostack, preserves_flags),
"and {new_lo}, {prev_lo}, {val_lo}",
"and {new_hi}, {prev_hi}, {val_hi}",
}
atomic_rmw_cas_3! {
_atomic_and_casp as atomic_and,
"and x4, x6, {val_lo}",
"and x5, x7, {val_hi}",
}
atomic_rmw_ll_sc_3! {
_atomic_nand_ldxp_stxp as atomic_nand,
options(nostack, preserves_flags),
"and {new_lo}, {prev_lo}, {val_lo}",
"mvn {new_lo}, {new_lo}",
"and {new_hi}, {prev_hi}, {val_hi}",
"mvn {new_hi}, {new_hi}",
}
atomic_rmw_cas_3! {
_atomic_nand_casp as atomic_nand,
"and x4, x6, {val_lo}",
"mvn x4, x4",
"and x5, x7, {val_hi}",
"mvn x5, x5",
}
atomic_rmw_ll_sc_3! {
_atomic_or_ldxp_stxp as atomic_or,
options(nostack, preserves_flags),
"orr {new_lo}, {prev_lo}, {val_lo}",
"orr {new_hi}, {prev_hi}, {val_hi}",
}
atomic_rmw_cas_3! {
_atomic_or_casp as atomic_or,
"orr x4, x6, {val_lo}",
"orr x5, x7, {val_hi}",
}
atomic_rmw_ll_sc_3! {
_atomic_xor_ldxp_stxp as atomic_xor,
options(nostack, preserves_flags),
"eor {new_lo}, {prev_lo}, {val_lo}",
"eor {new_hi}, {prev_hi}, {val_hi}",
}
atomic_rmw_cas_3! {
_atomic_xor_casp as atomic_xor,
"eor x4, x6, {val_lo}",
"eor x5, x7, {val_hi}",
}

atomic_rmw_ll_sc_2! {
_atomic_not_ldxp_stxp as atomic_not,
options(nostack, preserves_flags),
"mvn {new_lo}, {prev_lo}",
"mvn {new_hi}, {prev_hi}",
}
atomic_rmw_cas_2! {
atomic_not,
_atomic_not_casp as atomic_not,
"mvn x4, x6",
"mvn x5, x7",
}
Expand All @@ -961,7 +985,7 @@ atomic_rmw_ll_sc_2! {
concat!("ngc ", select_le_or_be!("{new_hi}, {prev_hi}", "{new_lo}, {prev_lo}")),
}
atomic_rmw_cas_2! {
atomic_neg,
_atomic_neg_casp as atomic_neg,
concat!("negs ", select_le_or_be!("x4, x6", "x5, x7")),
concat!("ngc ", select_le_or_be!("x5, x7", "x4, x6")),
}
Expand All @@ -976,7 +1000,7 @@ atomic_rmw_ll_sc_3! {
"csel {new_lo}, {prev_lo}, {val_lo}, lt", // select lo 64-bit
}
atomic_rmw_cas_3! {
atomic_max,
_atomic_max_casp as atomic_max,
select_le_or_be!("cmp {val_lo}, x6", "cmp {val_hi}, x7"),
select_le_or_be!("sbcs xzr, {val_hi}, x7", "sbcs xzr, {val_lo}, x6"),
"csel x5, x7, {val_hi}, lt", // select hi 64-bit
Expand All @@ -993,7 +1017,7 @@ atomic_rmw_ll_sc_3! {
"csel {new_lo}, {prev_lo}, {val_lo}, lo", // select lo 64-bit
}
atomic_rmw_cas_3! {
atomic_umax,
_atomic_umax_casp as atomic_umax,
select_le_or_be!("cmp {val_lo}, x6", "cmp {val_hi}, x7"),
select_le_or_be!("sbcs xzr, {val_hi}, x7", "sbcs xzr, {val_lo}, x6"),
"csel x5, x7, {val_hi}, lo", // select hi 64-bit
Expand All @@ -1010,7 +1034,7 @@ atomic_rmw_ll_sc_3! {
"csel {new_lo}, {prev_lo}, {val_lo}, ge", // select lo 64-bit
}
atomic_rmw_cas_3! {
atomic_min,
_atomic_min_casp as atomic_min,
select_le_or_be!("cmp {val_lo}, x6", "cmp {val_hi}, x7"),
select_le_or_be!("sbcs xzr, {val_hi}, x7", "sbcs xzr, {val_lo}, x6"),
"csel x5, x7, {val_hi}, ge", // select hi 64-bit
Expand All @@ -1027,7 +1051,7 @@ atomic_rmw_ll_sc_3! {
"csel {new_lo}, {prev_lo}, {val_lo}, hs", // select lo 64-bit
}
atomic_rmw_cas_3! {
atomic_umin,
_atomic_umin_casp as atomic_umin,
select_le_or_be!("cmp {val_lo}, x6", "cmp {val_hi}, x7"),
select_le_or_be!("sbcs xzr, {val_hi}, x7", "sbcs xzr, {val_lo}, x6"),
"csel x5, x7, {val_hi}, hs", // select hi 64-bit
Expand Down

0 comments on commit ceccecb

Please sign in to comment.