Improve inlining hints for better performance.

Affects rustc versions 1.81.0 and later, which changed the compiler's inlining thresholds in a way that dramatically decreased performance. Closes #111.
Alexhuszagh · Sep 9, 2024 · f282542 · f282542
1 parent b0c9e64
commit f282542
Show file tree

Hide file tree

Showing 51 changed files with 420 additions and 415 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 
 - Updated the MSRV to 1.63.0 (1.65.0 for development).
+- Improved performance by working around compiler inlining regressions in rustc 1.81.0 and above.
 
 ### Removed
 

diff --git a/lexical-benchmark/README.md b/lexical-benchmark/README.md
@@ -12,6 +12,6 @@ The benchmark requires the following:
 3. An installation of [Python3](https://www.python.org/downloads/).
 4. An installation of [Rust](https://doc.rust-lang.org/1.0.0/book/installing-rust.html).
 5. An installation of Google [Benchmark](https://github.com/google/benchmark).
-5. An installation of [CMake](https://cmake.org/download/).
+6. An installation of [CMake](https://cmake.org/download/).
 
 The use of a Rust version >= 1.59.0, with the feature `asm`, is highly recommended for better metrics and/or performance.
diff --git a/lexical-core/src/lib.rs b/lexical-core/src/lib.rs
@@ -498,7 +498,7 @@ macro_rules! to_lexical_impl {
                 }
             }
 
-            #[cfg_attr(not(feature = "compact"), inline)]
+            #[cfg_attr(not(feature = "compact"), inline(always))]
             fn to_lexical_with_options<'a, const FORMAT: u128>(
                 self,
                 bytes: &'a mut [u8],

diff --git a/lexical-parse-float/src/bellerophon.rs b/lexical-parse-float/src/bellerophon.rs
@@ -177,7 +177,7 @@ const fn error_halfscale() -> u32 {
 }
 
 /// Determine if the number of errors is tolerable for float precision.
-#[cfg_attr(not(feature = "compact"), inline)]
+#[cfg_attr(not(feature = "compact"), inline(always))]
 fn error_is_accurate<F: RawFloat>(errors: u32, fp: &ExtendedFloat80) -> bool {
     // Check we can't have a literal 0 denormal float.
     debug_assert!(fp.exp >= -64);
@@ -283,7 +283,7 @@ fn error_is_accurate<F: RawFloat>(errors: u32, fp: &ExtendedFloat80) -> bool {
 /// itself is 0.
 ///
 /// Get the number of bits shifted.
-#[cfg_attr(not(feature = "compact"), inline)]
+#[cfg_attr(not(feature = "compact"), inline(always))]
 pub fn normalize(fp: &mut ExtendedFloat80) -> i32 {
     // Note:
     // Using the ctlz intrinsic via leading_zeros is way faster (~10x)
@@ -318,7 +318,7 @@ pub fn normalize(fp: &mut ExtendedFloat80) -> i32 {
 ///     1. Non-signed multiplication of mantissas (requires 2x as many bits as input).
 ///     2. Normalization of the result (not done here).
 ///     3. Addition of exponents.
-#[cfg_attr(not(feature = "compact"), inline)]
+#[cfg_attr(not(feature = "compact"), inline(always))]
 pub fn mul(x: &ExtendedFloat80, y: &ExtendedFloat80) -> ExtendedFloat80 {
     // Logic check, values must be decently normalized prior to multiplication.
     debug_assert!(x.mant >> 32 != 0);
@@ -370,7 +370,7 @@ pub struct BellerophonPowers {
 
 /// Allow indexing of the precomputed values (the accessors use regular, bounds-checked indexing).
 impl BellerophonPowers {
-    #[inline]
+    #[inline(always)]
     pub const fn get_small(&self, index: usize) -> ExtendedFloat80 {
         let mant = self.small[index];
         let exp = (1 - 64) + ((self.log2 * index as i64) >> self.log2_shift);
@@ -380,7 +380,7 @@ impl BellerophonPowers {
         }
     }
 
-    #[inline]
+    #[inline(always)]
     pub const fn get_large(&self, index: usize) -> ExtendedFloat80 {
         let mant = self.large[index];
         let biased_e = index as i64 * self.step as i64 - self.bias as i64;
@@ -391,7 +391,7 @@ impl BellerophonPowers {
         }
     }
 
-    #[inline]
+    #[inline(always)]
     pub const fn get_small_int(&self, index: usize) -> u64 {
         self.small_int[index]
     }