Remove most #[inline] annotations

This commit goes through and deletes almost all `#[inline]` annotations in this crate. It looks like before this commit basically every single function is `#[inline]`, but this is generally not necessary for performance and can have a severe impact on compile times in both debug and release modes, most severely in release mode. Some `#[inline]` annotations are definitely necessary, however. Most functions in this crate are already candidates for inlining because they're generic, but functions like `Group` and `BitMask` aren't candidates for inlining without `#[inline]`. Additionally LLVM is by no means perfect, so some `#[inline]` may still be necessary to get some further speedups. The procedure used to generate this commit looked like: * Remove all `#[inline]` annotations. * Run `cargo bench`, comparing against the `master` branch, and add `#[inline]` to hot spots as necessary. * A [PR] was made against rust-lang/rust to [evaluate the impact][run1] on the compiler for more performance data. * Using this data, `perf diff` was used locally to determine further hot spots and more `#[inline]` annotations were added. * A [second round of benchmarking][run2] was done The numbers are at the point where I think this should land in the crate and get published to move into the standard library. There are up to 20% wins in compile time for hashmap-heavy crates (like Cargo) and milder wins (up to 10%) for a number of other large crates. The regressions are all in the 1-3% range and are largely on benchmarks taking a few handful of milliseconds anyway, which I'd personally say is a worthwhile tradeoff. For comparison, the benchmarks of this crate before and after this commit look like so: name baseline ns/iter new ns/iter diff ns/iter diff % speedup insert_ahash_highbits 7,137 9,044 1,907 26.72% x 0.79 insert_ahash_random 7,575 9,789 2,214 29.23% x 0.77 insert_ahash_serial 9,833 9,476 -357 -3.63% x 1.04 insert_erase_ahash_highbits 15,824 19,164 3,340 21.11% x 0.83 insert_erase_ahash_random 16,933 20,353 3,420 20.20% x 0.83 insert_erase_ahash_serial 20,857 27,675 6,818 32.69% x 0.75 insert_erase_std_highbits 35,117 38,385 3,268 9.31% x 0.91 insert_erase_std_random 35,357 37,236 1,879 5.31% x 0.95 insert_erase_std_serial 30,617 34,136 3,519 11.49% x 0.90 insert_std_highbits 15,675 18,180 2,505 15.98% x 0.86 insert_std_random 16,566 17,803 1,237 7.47% x 0.93 insert_std_serial 14,612 16,025 1,413 9.67% x 0.91 iter_ahash_highbits 1,715 1,640 -75 -4.37% x 1.05 iter_ahash_random 1,721 1,634 -87 -5.06% x 1.05 iter_ahash_serial 1,723 1,636 -87 -5.05% x 1.05 iter_std_highbits 1,715 1,634 -81 -4.72% x 1.05 iter_std_random 1,715 1,637 -78 -4.55% x 1.05 iter_std_serial 1,722 1,637 -85 -4.94% x 1.05 lookup_ahash_highbits 4,565 5,809 1,244 27.25% x 0.79 lookup_ahash_random 4,632 4,047 -585 -12.63% x 1.14 lookup_ahash_serial 4,612 4,906 294 6.37% x 0.94 lookup_fail_ahash_highbits 4,206 3,976 -230 -5.47% x 1.06 lookup_fail_ahash_random 4,327 4,211 -116 -2.68% x 1.03 lookup_fail_ahash_serial 8,999 4,386 -4,613 -51.26% x 2.05 lookup_fail_std_highbits 13,284 13,342 58 0.44% x 1.00 lookup_fail_std_random 13,172 13,614 442 3.36% x 0.97 lookup_fail_std_serial 11,240 11,539 299 2.66% x 0.97 lookup_std_highbits 13,075 13,333 258 1.97% x 0.98 lookup_std_random 13,257 13,193 -64 -0.48% x 1.00 lookup_std_serial 10,782 10,917 135 1.25% x 0.99 The summary of this from what I can tell is that the microbenchmarks are sort of all over the place, but they're neither consistently regressing nor improving, as expected. In general I would be surprised if there's much of a significant performance regression attributed to this commit, and `#[inline]` can always be selectively added back in easily without adding it to every function in the crate. [PR]: rust-lang/rust#64846 [run1]: rust-lang/rust#64846 (comment) [run2]: rust-lang/rust#64846 (comment)
rust-lang · Oct 1, 2019 · f1666de · f1666de
1 parent 7c93e56
commit f1666de
Show file tree

Hide file tree

Showing 11 changed files with 0 additions and 294 deletions.
diff --git a/src/external_trait_impls/rayon/map.rs b/src/external_trait_impls/rayon/map.rs
@@ -22,7 +22,6 @@ pub struct ParIter<'a, K, V, S> {
 impl<'a, K: Sync, V: Sync, S: Sync> ParallelIterator for ParIter<'a, K, V, S> {
     type Item = (&'a K, &'a V);
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -39,7 +38,6 @@ impl<'a, K: Sync, V: Sync, S: Sync> ParallelIterator for ParIter<'a, K, V, S> {
 }
 
 impl<K, V, S> Clone for ParIter<'_, K, V, S> {
-    #[inline]
     fn clone(&self) -> Self {
         ParIter { map: self.map }
     }
@@ -65,7 +63,6 @@ pub struct ParKeys<'a, K, V, S> {
 impl<'a, K: Sync, V: Sync, S: Sync> ParallelIterator for ParKeys<'a, K, V, S> {
     type Item = &'a K;
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -79,7 +76,6 @@ impl<'a, K: Sync, V: Sync, S: Sync> ParallelIterator for ParKeys<'a, K, V, S> {
 }
 
 impl<K, V, S> Clone for ParKeys<'_, K, V, S> {
-    #[inline]
     fn clone(&self) -> Self {
         ParKeys { map: self.map }
     }
@@ -105,7 +101,6 @@ pub struct ParValues<'a, K, V, S> {
 impl<'a, K: Sync, V: Sync, S: Sync> ParallelIterator for ParValues<'a, K, V, S> {
     type Item = &'a V;
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -119,7 +114,6 @@ impl<'a, K: Sync, V: Sync, S: Sync> ParallelIterator for ParValues<'a, K, V, S>
 }
 
 impl<K, V, S> Clone for ParValues<'_, K, V, S> {
-    #[inline]
     fn clone(&self) -> Self {
         ParValues { map: self.map }
     }
@@ -147,7 +141,6 @@ pub struct ParIterMut<'a, K, V, S> {
 impl<'a, K: Send + Sync, V: Send, S: Send> ParallelIterator for ParIterMut<'a, K, V, S> {
     type Item = (&'a K, &'a mut V);
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -185,7 +178,6 @@ pub struct ParValuesMut<'a, K, V, S> {
 impl<'a, K: Send, V: Send, S: Send> ParallelIterator for ParValuesMut<'a, K, V, S> {
     type Item = &'a mut V;
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -220,7 +212,6 @@ pub struct IntoParIter<K, V, S> {
 impl<K: Send, V: Send, S: Send> ParallelIterator for IntoParIter<K, V, S> {
     type Item = (K, V);
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -249,7 +240,6 @@ pub struct ParDrain<'a, K, V, S> {
 impl<K: Send, V: Send, S: Send> ParallelIterator for ParDrain<'_, K, V, S> {
     type Item = (K, V);
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -268,28 +258,24 @@ impl<K: fmt::Debug + Eq + Hash, V: fmt::Debug, S: BuildHasher> fmt::Debug
 
 impl<K: Sync, V: Sync, S: Sync> HashMap<K, V, S> {
     /// Visits (potentially in parallel) immutably borrowed keys in an arbitrary order.
-    #[inline]
     pub fn par_keys(&self) -> ParKeys<'_, K, V, S> {
         ParKeys { map: self }
     }
 
     /// Visits (potentially in parallel) immutably borrowed values in an arbitrary order.
-    #[inline]
     pub fn par_values(&self) -> ParValues<'_, K, V, S> {
         ParValues { map: self }
     }
 }
 
 impl<K: Send, V: Send, S: Send> HashMap<K, V, S> {
     /// Visits (potentially in parallel) mutably borrowed values in an arbitrary order.
-    #[inline]
     pub fn par_values_mut(&mut self) -> ParValuesMut<'_, K, V, S> {
         ParValuesMut { map: self }
     }
 
     /// Consumes (potentially in parallel) all values in an arbitrary order,
     /// while preserving the map's allocated memory for reuse.
-    #[inline]
     pub fn par_drain(&mut self) -> ParDrain<'_, K, V, S> {
         ParDrain { map: self }
     }
@@ -317,7 +303,6 @@ impl<K: Send, V: Send, S: Send> IntoParallelIterator for HashMap<K, V, S> {
     type Item = (K, V);
     type Iter = IntoParIter<K, V, S>;
 
-    #[inline]
     fn into_par_iter(self) -> Self::Iter {
         IntoParIter { map: self }
     }
@@ -327,7 +312,6 @@ impl<'a, K: Sync, V: Sync, S: Sync> IntoParallelIterator for &'a HashMap<K, V, S
     type Item = (&'a K, &'a V);
     type Iter = ParIter<'a, K, V, S>;
 
-    #[inline]
     fn into_par_iter(self) -> Self::Iter {
         ParIter { map: self }
     }
@@ -337,7 +321,6 @@ impl<'a, K: Send + Sync, V: Send, S: Send> IntoParallelIterator for &'a mut Hash
     type Item = (&'a K, &'a mut V);
     type Iter = ParIterMut<'a, K, V, S>;
 
-    #[inline]
     fn into_par_iter(self) -> Self::Iter {
         ParIterMut { map: self }
     }

diff --git a/src/external_trait_impls/rayon/raw.rs b/src/external_trait_impls/rayon/raw.rs
@@ -18,7 +18,6 @@ pub struct RawParIter<T> {
 impl<T> ParallelIterator for RawParIter<T> {
     type Item = Bucket<T>;
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -36,15 +35,13 @@ struct ParIterProducer<T> {
 impl<T> UnindexedProducer for ParIterProducer<T> {
     type Item = Bucket<T>;
 
-    #[inline]
     fn split(self) -> (Self, Option<Self>) {
         let (left, right) = self.iter.split();
         let left = ParIterProducer { iter: left };
         let right = right.map(|right| ParIterProducer { iter: right });
         (left, right)
     }
 
-    #[inline]
     fn fold_with<F>(self, folder: F) -> F
     where
         F: Folder<Self::Item>,
@@ -61,7 +58,6 @@ pub struct RawIntoParIter<T> {
 impl<T: Send> ParallelIterator for RawIntoParIter<T> {
     type Item = T;
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -92,7 +88,6 @@ unsafe impl<T> Send for RawParDrain<'_, T> {}
 impl<T: Send> ParallelIterator for RawParDrain<'_, T> {
     type Item = T;
 
-    #[inline]
     fn drive_unindexed<C>(self, consumer: C) -> C::Result
     where
         C: UnindexedConsumer<Self::Item>,
@@ -123,7 +118,6 @@ struct ParDrainProducer<T> {
 impl<T: Send> UnindexedProducer for ParDrainProducer<T> {
     type Item = T;
 
-    #[inline]
     fn split(self) -> (Self, Option<Self>) {
         let (left, right) = self.iter.clone().split();
         mem::forget(self);
@@ -132,7 +126,6 @@ impl<T: Send> UnindexedProducer for ParDrainProducer<T> {
         (left, right)
     }
 
-    #[inline]
     fn fold_with<F>(mut self, mut folder: F) -> F
     where
         F: Folder<Self::Item>,
@@ -153,7 +146,6 @@ impl<T: Send> UnindexedProducer for ParDrainProducer<T> {
 }
 
 impl<T> Drop for ParDrainProducer<T> {
-    #[inline]
     fn drop(&mut self) {
         // Drop all remaining elements
         if mem::needs_drop::<T>() {
@@ -168,22 +160,19 @@ impl<T> Drop for ParDrainProducer<T> {
 
 impl<T> RawTable<T> {
     /// Returns a parallel iterator over the elements in a `RawTable`.
-    #[inline]
     pub fn par_iter(&self) -> RawParIter<T> {
         RawParIter {
             iter: unsafe { self.iter().iter },
         }
     }
 
     /// Returns a parallel iterator over the elements in a `RawTable`.
-    #[inline]
     pub fn into_par_iter(self) -> RawIntoParIter<T> {
         RawIntoParIter { table: self }
     }
 
     /// Returns a parallel iterator which consumes all elements of a `RawTable`
     /// without freeing its memory allocation.
-    #[inline]
     pub fn par_drain(&mut self) -> RawParDrain<'_, T> {
         RawParDrain {
             table: NonNull::from(self),

diff --git a/src/external_trait_impls/rayon/set.rs b/src/external_trait_impls/rayon/set.rs
@@ -214,14 +214,12 @@ where
 {
     /// Visits (potentially in parallel) the values representing the difference,
     /// i.e. the values that are in `self` but not in `other`.
-    #[inline]
     pub fn par_difference<'a>(&'a self, other: &'a Self) -> ParDifference<'a, T, S> {
         ParDifference { a: self, b: other }
     }
 
     /// Visits (potentially in parallel) the values representing the symmetric
     /// difference, i.e. the values that are in `self` or in `other` but not in both.
-    #[inline]
     pub fn par_symmetric_difference<'a>(
         &'a self,
         other: &'a Self,
@@ -231,14 +229,12 @@ where
 
     /// Visits (potentially in parallel) the values representing the
     /// intersection, i.e. the values that are both in `self` and `other`.
-    #[inline]
     pub fn par_intersection<'a>(&'a self, other: &'a Self) -> ParIntersection<'a, T, S> {
         ParIntersection { a: self, b: other }
     }
 
     /// Visits (potentially in parallel) the values representing the union,
     /// i.e. all the values in `self` or `other`, without duplicates.
-    #[inline]
     pub fn par_union<'a>(&'a self, other: &'a Self) -> ParUnion<'a, T, S> {
         ParUnion { a: self, b: other }
     }
@@ -287,7 +283,6 @@ where
 {
     /// Consumes (potentially in parallel) all values in an arbitrary order,
     /// while preserving the set's allocated memory for reuse.
-    #[inline]
     pub fn par_drain(&mut self) -> ParDrain<'_, T, S> {
         ParDrain { set: self }
     }
@@ -297,7 +292,6 @@ impl<T: Send, S: Send> IntoParallelIterator for HashSet<T, S> {
     type Item = T;
     type Iter = IntoParIter<T, S>;
 
-    #[inline]
     fn into_par_iter(self) -> Self::Iter {
         IntoParIter { set: self }
     }
@@ -307,7 +301,6 @@ impl<'a, T: Sync, S: Sync> IntoParallelIterator for &'a HashSet<T, S> {
     type Item = &'a T;
     type Iter = ParIter<'a, T, S>;
 
-    #[inline]
     fn into_par_iter(self) -> Self::Iter {
         ParIter { set: self }
     }

diff --git a/src/external_trait_impls/serde.rs b/src/external_trait_impls/serde.rs
@@ -4,7 +4,6 @@ mod size_hint {
     /// This presumably exists to prevent denial of service attacks.
     ///
     /// Original discussion: https://github.com/serde-rs/serde/issues/1114.
-    #[inline]
     pub(super) fn cautious(hint: Option<usize>) -> usize {
         cmp::min(hint.unwrap_or(0), 4096)
     }
@@ -27,7 +26,6 @@ mod map {
         V: Serialize,
         H: BuildHasher,
     {
-        #[inline]
         fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
         where
             S: Serializer,
@@ -62,7 +60,6 @@ mod map {
                     formatter.write_str("a map")
                 }
 
-                #[inline]
                 fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
                 where
                     A: MapAccess<'de>,
@@ -104,7 +101,6 @@ mod set {
         T: Serialize + Eq + Hash,
         H: BuildHasher,
     {
-        #[inline]
         fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
         where
             S: Serializer,
@@ -137,7 +133,6 @@ mod set {
                     formatter.write_str("a sequence")
                 }
 
-                #[inline]
                 fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
                 where
                     A: SeqAccess<'de>,
@@ -178,7 +173,6 @@ mod set {
                     formatter.write_str("a sequence")
                 }
 
-                #[inline]
                 fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
                 where
                     A: SeqAccess<'de>,