Rollup merge of rust-lang#48639 - varkor:sort_by_key-cached, r=bluss

Add slice::sort_by_cached_key as a memoised sort_by_key At present, `slice::sort_by_key` calls its key function twice for each comparison that is made. When the key function is expensive (which can often be the case when `sort_by_key` is chosen over `sort_by`), this can lead to very suboptimal behaviour. To address this, I've introduced a new slice method, `sort_by_cached_key`, which has identical semantic behaviour to `sort_by_key`, except that it guarantees the key function will only be called once per element. Where there are `n` elements and the key function is `O(m)`: - `slice::sort_by_cached_key` time complexity is `O(m n log m n)`, compared to `slice::sort_by_key`'s `O(m n + n log n)`. - `slice::sort_by_cached_key` space complexity remains at `O(n + m)`. (Technically, it now reserves a slice of size `n`, whereas before it reserved a slice of size `n/2`.) `slice::sort_unstable_by_key` has not been given an analogue, as it is important that unstable sorts are in-place, which is not a property that is guaranteed here. However, this also means that `slice::sort_unstable_by_key` is likely to be slower than `slice::sort_by_cached_key` when the key function does not have negligible complexity. We might want to explore this trade-off further in the future. Benchmarks (for a vector of 100 `i32`s): ``` # Lexicographic: `|x| x.to_string()` test bench_sort_by_key ... bench: 112,638 ns/iter (+/- 19,563) test bench_sort_by_cached_key ... bench: 15,038 ns/iter (+/- 4,814) # Identity: `|x| *x` test bench_sort_by_key ... bench: 1,346 ns/iter (+/- 238) test bench_sort_by_cached_key ... bench: 1,839 ns/iter (+/- 765) # Power: `|x| x.pow(31)` test bench_sort_by_key ... bench: 3,624 ns/iter (+/- 738) test bench_sort_by_cached_key ... bench: 1,997 ns/iter (+/- 311) # Abs: `|x| x.abs()` test bench_sort_by_key ... bench: 1,546 ns/iter (+/- 174) test bench_sort_by_cached_key ... bench: 1,668 ns/iter (+/- 790) ``` (So it seems functions that are single operations do perform slightly worse with this method, but for pretty much any more complex key, you're better off with this optimisation.) I've definitely found myself using expensive keys in the past and wishing this optimisation was made (e.g. for rust-lang#47415). This feels like both desirable and expected behaviour, at the small cost of slightly more stack allocation and minute degradation in performance for extremely trivial keys. Resolves rust-lang#34447.
kennytm · Mar 27, 2018 · 42de36d · 42de36d
2 parents 31ae4f9 + 9c7b69e
commit 42de36d
Show file tree

Hide file tree

Showing 5 changed files with 111 additions and 13 deletions.
diff --git a/src/liballoc/benches/lib.rs b/src/liballoc/benches/lib.rs
@@ -13,6 +13,7 @@
 #![cfg_attr(stage0, feature(i128_type))]
 #![feature(rand)]
 #![feature(repr_simd)]
+#![feature(slice_sort_by_cached_key)]
 #![feature(test)]
 
 extern crate rand;

diff --git a/src/liballoc/benches/slice.rs b/src/liballoc/benches/slice.rs
@@ -284,6 +284,17 @@ macro_rules! sort_expensive {
     }
 }
 
+macro_rules! sort_lexicographic {
+    ($f:ident, $name:ident, $gen:expr, $len:expr) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            let v = $gen($len);
+            b.iter(|| v.clone().$f(|x| x.to_string()));
+            b.bytes = $len * mem::size_of_val(&$gen(1)[0]) as u64;
+        }
+    }
+}
+
 sort!(sort, sort_small_ascending, gen_ascending, 10);
 sort!(sort, sort_small_descending, gen_descending, 10);
 sort!(sort, sort_small_random, gen_random, 10);
@@ -312,6 +323,10 @@ sort!(sort_unstable, sort_unstable_large_big, gen_big_random, 10000);
 sort_strings!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
 sort_expensive!(sort_unstable_by, sort_unstable_large_expensive, gen_random, 10000);
 
+sort_lexicographic!(sort_by_key, sort_by_key_lexicographic, gen_random, 10000);
+sort_lexicographic!(sort_unstable_by_key, sort_unstable_by_key_lexicographic, gen_random, 10000);
+sort_lexicographic!(sort_by_cached_key, sort_by_cached_key_lexicographic, gen_random, 10000);
+
 macro_rules! reverse {
     ($name:ident, $ty:ty, $f:expr) => {
         #[bench]

diff --git a/src/liballoc/slice.rs b/src/liballoc/slice.rs
@@ -102,6 +102,7 @@ use core::mem::size_of;
 use core::mem;
 use core::ptr;
 use core::slice as core_slice;
+use core::{u8, u16, u32};
 
 use borrow::{Borrow, BorrowMut, ToOwned};
 use boxed::Box;
@@ -1302,7 +1303,8 @@ impl<T> [T] {
 
     /// Sorts the slice with a key extraction function.
     ///
-    /// This sort is stable (i.e. does not reorder equal elements) and `O(n log n)` worst-case.
+    /// This sort is stable (i.e. does not reorder equal elements) and `O(m n log(m n))`
+    /// worst-case, where the key function is `O(m)`.
     ///
     /// When applicable, unstable sorting is preferred because it is generally faster than stable
     /// sorting and it doesn't allocate auxiliary memory.
@@ -1328,12 +1330,82 @@ impl<T> [T] {
     /// ```
     #[stable(feature = "slice_sort_by_key", since = "1.7.0")]
     #[inline]
-    pub fn sort_by_key<B, F>(&mut self, mut f: F)
-        where F: FnMut(&T) -> B, B: Ord
+    pub fn sort_by_key<K, F>(&mut self, mut f: F)
+        where F: FnMut(&T) -> K, K: Ord
     {
         merge_sort(self, |a, b| f(a).lt(&f(b)));
     }
 
+    /// Sorts the slice with a key extraction function.
+    ///
+    /// During sorting, the key function is called only once per element.
+    ///
+    /// This sort is stable (i.e. does not reorder equal elements) and `O(m n + n log n)`
+    /// worst-case, where the key function is `O(m)`.
+    ///
+    /// For simple key functions (e.g. functions that are property accesses or
+    /// basic operations), [`sort_by_key`](#method.sort_by_key) is likely to be
+    /// faster.
+    ///
+    /// # Current implementation
+    ///
+    /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
+    /// which combines the fast average case of randomized quicksort with the fast worst case of
+    /// heapsort, while achieving linear time on slices with certain patterns. It uses some
+    /// randomization to avoid degenerate cases, but with a fixed seed to always provide
+    /// deterministic behavior.
+    ///
+    /// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the
+    /// length of the slice.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(slice_sort_by_cached_key)]
+    /// let mut v = [-5i32, 4, 32, -3, 2];
+    ///
+    /// v.sort_by_cached_key(|k| k.to_string());
+    /// assert!(v == [-3, -5, 2, 32, 4]);
+    /// ```
+    ///
+    /// [pdqsort]: https://github.com/orlp/pdqsort
+    #[unstable(feature = "slice_sort_by_cached_key", issue = "34447")]
+    #[inline]
+    pub fn sort_by_cached_key<K, F>(&mut self, f: F)
+        where F: FnMut(&T) -> K, K: Ord
+    {
+        // Helper macro for indexing our vector by the smallest possible type, to reduce allocation.
+        macro_rules! sort_by_key {
+            ($t:ty, $slice:ident, $f:ident) => ({
+                let mut indices: Vec<_> =
+                    $slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect();
+                // The elements of `indices` are unique, as they are indexed, so any sort will be
+                // stable with respect to the original slice. We use `sort_unstable` here because
+                // it requires less memory allocation.
+                indices.sort_unstable();
+                for i in 0..$slice.len() {
+                    let mut index = indices[i].1;
+                    while (index as usize) < i {
+                        index = indices[index as usize].1;
+                    }
+                    indices[i].1 = index;
+                    $slice.swap(i, index as usize);
+                }
+            })
+        }
+
+        let sz_u8    = mem::size_of::<(K, u8)>();
+        let sz_u16   = mem::size_of::<(K, u16)>();
+        let sz_u32   = mem::size_of::<(K, u32)>();
+        let sz_usize = mem::size_of::<(K, usize)>();
+
+        let len = self.len();
+        if sz_u8  < sz_u16   && len <= ( u8::MAX as usize) { return sort_by_key!( u8, self, f) }
+        if sz_u16 < sz_u32   && len <= (u16::MAX as usize) { return sort_by_key!(u16, self, f) }
+        if sz_u32 < sz_usize && len <= (u32::MAX as usize) { return sort_by_key!(u32, self, f) }
+        sort_by_key!(usize, self, f)
+    }
+
     /// Sorts the slice, but may not preserve the order of equal elements.
     ///
     /// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
@@ -1410,7 +1482,7 @@ impl<T> [T] {
     /// elements.
     ///
     /// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
-    /// and `O(n log n)` worst-case.
+    /// and `O(m n log(m n))` worst-case, where the key function is `O(m)`.
     ///
     /// # Current implementation
     ///
@@ -1420,9 +1492,6 @@ impl<T> [T] {
     /// randomization to avoid degenerate cases, but with a fixed seed to always provide
     /// deterministic behavior.
     ///
-    /// It is typically faster than stable sorting, except in a few special cases, e.g. when the
-    /// slice consists of several concatenated sorted sequences.
-    ///
     /// # Examples
     ///
     /// ```
@@ -1435,9 +1504,8 @@ impl<T> [T] {
     /// [pdqsort]: https://github.com/orlp/pdqsort
     #[stable(feature = "sort_unstable", since = "1.20.0")]
     #[inline]
-    pub fn sort_unstable_by_key<B, F>(&mut self, f: F)
-        where F: FnMut(&T) -> B,
-              B: Ord
+    pub fn sort_unstable_by_key<K, F>(&mut self, f: F)
+        where F: FnMut(&T) -> K, K: Ord
     {
         core_slice::SliceExt::sort_unstable_by_key(self, f);
     }

diff --git a/src/liballoc/tests/lib.rs b/src/liballoc/tests/lib.rs
@@ -23,6 +23,7 @@
 #![feature(pattern)]
 #![feature(placement_in_syntax)]
 #![feature(rand)]
+#![feature(slice_sort_by_cached_key)]
 #![feature(splice)]
 #![feature(str_escape)]
 #![feature(string_retain)]

diff --git a/src/liballoc/tests/slice.rs b/src/liballoc/tests/slice.rs
@@ -425,6 +425,14 @@ fn test_sort() {
                 v.sort_by(|a, b| b.cmp(a));
                 assert!(v.windows(2).all(|w| w[0] >= w[1]));
 
+                // Sort in lexicographic order.
+                let mut v1 = orig.clone();
+                let mut v2 = orig.clone();
+                v1.sort_by_key(|x| x.to_string());
+                v2.sort_by_cached_key(|x| x.to_string());
+                assert!(v1.windows(2).all(|w| w[0].to_string() <= w[1].to_string()));
+                assert!(v1 == v2);
+
                 // Sort with many pre-sorted runs.
                 let mut v = orig.clone();
                 v.sort();
@@ -477,24 +485,29 @@ fn test_sort_stability() {
             // the second item represents which occurrence of that
             // number this element is, i.e. the second elements
             // will occur in sorted order.
-            let mut v: Vec<_> = (0..len)
+            let mut orig: Vec<_> = (0..len)
                 .map(|_| {
                     let n = thread_rng().gen::<usize>() % 10;
                     counts[n] += 1;
                     (n, counts[n])
                 })
                 .collect();
 
-            // only sort on the first element, so an unstable sort
+            let mut v = orig.clone();
+            // Only sort on the first element, so an unstable sort
             // may mix up the counts.
             v.sort_by(|&(a, _), &(b, _)| a.cmp(&b));
 
-            // this comparison includes the count (the second item
+            // This comparison includes the count (the second item
             // of the tuple), so elements with equal first items
             // will need to be ordered with increasing
             // counts... i.e. exactly asserting that this sort is
             // stable.
             assert!(v.windows(2).all(|w| w[0] <= w[1]));
+
+            let mut v = orig.clone();
+            v.sort_by_cached_key(|&(x, _)| x);
+            assert!(v.windows(2).all(|w| w[0] <= w[1]));
         }
     }
 }