From ffcc04310d364e9f0b3729b0d3bf60c1f1f15299 Mon Sep 17 00:00:00 2001
From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com>
Date: Thu, 3 Mar 2022 21:01:56 -0600
Subject: [PATCH] Separate extrema and serialization in all cases

---
 base/sort.jl | 101 ++++++++++++++++++++++++++-------------------------
 1 file changed, 51 insertions(+), 50 deletions(-)

diff --git a/base/sort.jl b/base/sort.jl
index 51ccfe679a5fd..0f088bda0a9d4 100644
--- a/base/sort.jl
+++ b/base/sort.jl
@@ -722,6 +722,16 @@ end
 
 maybe_unsigned(x::Integer) = x # this is necessary to avoid calling unsigned on BigInt
 maybe_unsigned(x::Union{Int8, Int16, Int32, Int64, Int128}) = unsigned(x)
+function _extrema(v::AbstractArray, lo::Integer, hi::Integer, o::Ordering)
+    mn = mx = v[lo]
+    while lo < hi
+        lo += 1
+        vi = v[lo]
+        lt(o, vi, mn) && (mn = vi)
+        lt(o, mx, vi) && (mx = vi)
+    end
+    mn, mx
+end
 function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::AdaptiveSort, o::Ordering)
     # if the sorting task is unserializable, then we can't radix sort or sort_int_range!
     # so we skip straight to the fallback algorithm which is comparison based.
@@ -735,57 +745,55 @@ function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::AdaptiveSort, o::
     # only count sort on a short range can compete with insertion sort fo ln < 30
     # and the optimization is not worth the detection cost, so we use inserstion sort.
     ln < 30 && return sort!(v, lo, hi, SMALL_ALGORITHM, o)
-    if (eltype(v) == Int128 || eltype(v) == UInt128) && o isa DirectOrdering
-        # This avoids an unneccessary serialization which can have a 0–50% runtime impact.
-        mn, mx = extrema(v)
-        rln = maybe_unsigned(mx-mn)
-         # the range can be big and count sort will still outperform comparison sort
-        if rln < 5ln-100
-            return sort_int_range!(v, rln+1, mn, o === Forward ? identity : reverse, lo, hi)
-        else
-            return sort!(v, lo, hi, a.fallback, o)
+
+    # UInt128 does not support fast bitshifting so we never
+    # dispatch to radix sort but we may still perform countsort
+    if sizeof(U) > 8
+        if eltype(v) <: Integer && o isa DirectOrdering
+            mn, mx = _extrema(v, lo, hi, Forward)
+            rln = maybe_unsigned(mx-mn)
+            rln < 5ln-100 && # count sort will outperform comparison sort if rln is small
+                return sort_int_range!(v, rln+1, mn, o === Forward ? identity : reverse, lo, hi)
         end
+        return sort!(v, lo, hi, a.fallback, o)
     end
-    u, mn, mx = Serial.serialize!(v, lo, hi, o)
-
-    # Arbitrary types and orders may serialize to UInt128 (i.e. those not caught in the
-    # special case above), but should still never dispatch to radix sort because bit
-    # shifting 128 bits is too slow to be competitive with comparrison based sorting
-    # even if only some of the bits are actually used.
-    should_radix = sizeof(U) <= 8
-
-    # rln is an abbrevation for range_ln. Like ln, rln == length(mn:mx)-1
-    rln = maybe_unsigned(mx-mn)
-    # rln has to be small for cout sort to outperform radix sort
-    if rln < (should_radix ? ln÷2 : 5ln-100)
-        sort_int_range!(u, rln+1, mn, identity, lo, hi)
-        return Serial.deserialize!(v, u, lo, hi, o)
-    end
-    if !should_radix
-        sort!(u, lo, hi, ln < 70 ? SMALL_ALGORITHM : a.fallback, Forward)
-        return Serial.deserialize!(v, u, lo, hi, o)
+
+    mn, mx = _extrema(v, lo, hi, o)
+    if eltype(v) <: Integer && o isa DirectOrdering
+        F = o === Forward
+        rln = maybe_unsigned(F ? mx-mn : mn-mx)
+        if rln < ln÷2 # count sort will be superior if rln is very small
+            return sort_int_range!(v, rln+1, F ? mn : mx, F ? identity : reverse, lo, hi)
+        end
     end
 
+    umn, umx = Serial.serialize(mn, o), Serial.serialize(mx, o)
+    #umn, umx = umx < umn ? (umx, umn) : (umn, umx)
+    urln = maybe_unsigned(umx-umn)
+
     # if rln is small, then once we subtract out mn, we'll get a vector like
     # UInt16[0x001a, 0x0015, 0x0006, 0x001b, 0x0008, 0x000c, 0x0001, 0x000e, 0x001c, 0x0009]
     # where we only need to radix over the last few bits (bits = 5, in the example).
-    bits = unsigned(8sizeof(rln) - leading_zeros(rln))
+    bits = unsigned(8sizeof(urln) - leading_zeros(urln))
 
     # radix sort runs in O(bits * ln), insertion sort runs in O(ln^2). Radix sort has a
     # constant factor that is three times higher, so radix runtime is 3bits * ln and
     # insertion runtime is ln^2. Emperically, insertion is faster than radix iff ln < 3bits.
-    if ln < 3bits
-        # at ln = 64*3-1, QuickSort is about 20% faster than InsertionSort. The window
-        # where QuickSort is superior is the triangle contained by (ln=128, bits=43),
-        # (ln=191, bits=64), and (ln=128, bits=64). This is a small window, spanning only
-        # .015 square orders of magnitude, and the 20% performance gap is only present at
-        # the apex. At the centroid, the gap is about 6%. On the other hand there are
-        # theoretical/compilation benefits to avoiding the fallback entierly for small
-        # serializable types and orderings, so we unconditionaly use InsertionSort.
-        sort!(u, lo, hi, SMALL_ALGORITHM, Forward)
+    ln < 3bits && return sort!(v, lo, hi, SMALL_ALGORITHM, o)
+    # at ln = 64*3-1, QuickSort is about 20% faster than InsertionSort. The window
+    # where QuickSort is superior is the triangle contained by (ln=128, bits=43),
+    # (ln=191, bits=64), and (ln=128, bits=64). This is a small window, spanning only
+    # .015 square orders of magnitude, and the 20% performance gap is only present at
+    # the apex. At the centroid, the gap is about 6%. On the other hand there are
+    # theoretical/compilation benefits to avoiding the fallback entirely for small
+    # serializable types and orderings, so we unconditionally use InsertionSort.
+
+    u = Serial.serialize!(v, lo, hi, o)
+
+    if urln < ln÷2 # count sort will be superior if urln is very small
+        sort_int_range!(u, urln+1, umn, identity, lo, hi)
         return Serial.deserialize!(v, u, lo, hi, o)
     end
-
     # At this point, we are comitted to radix sort.
 
     # chunk_size is the number of bits to radix over at once.
@@ -804,7 +812,7 @@ function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::AdaptiveSort, o::
     # Float32[2.012, 400.0, 12.345] serializes to UInt32[0x3fff3b63, 0x3c37ffff, 0x414570a4]
     # which is reduced to UInt32[0x03c73b64, 0x00000000, 0x050d70a5] using only 26 bits.
     # the overhead for this subtraction is small enough that it is worthwhile in many cases.
-    @inbounds for i in lo:hi u[i] -= mn end # this line is faster than u[lo:hi] .-= mn
+    @inbounds for i in lo:hi u[i] -= umn end # this line is faster than u[lo:hi] .-= umn
 
     t = similar(u)
     # This if else chain is to avoid dynamic dispatch for small cases.
@@ -826,7 +834,7 @@ function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::AdaptiveSort, o::
     else
         radix_sort!(u, lo, hi, bits, Val(chunk_size), t)
     end
-    Serial.deserialize!(v, u2, lo, hi, o, mn)
+    Serial.deserialize!(v, u2, lo, hi, o, umn)
 end
 
 ## generic sorting methods ##
@@ -1356,17 +1364,10 @@ Serializable(T::Type, order::ReverseOrdering) = Serializable(T, order.fwd)
-# Also return the extrema of its output.
+# Return only the serialized vector; the caller computes the extrema separately.
 function serialize!(v::AbstractVector, lo::Integer, hi::Integer, order::Ordering)
     u = reinterpret(Serializable(eltype(v), order), v)
-    @inbounds u[lo] = mn = mx = serialize(v[lo], order)
-    i = lo # rename lo -> i for clarity only
-    @inbounds while i < hi
-        i += 1
-
-        ui = u[i] = serialize(v[i], order)
-
-        mx = max(ui, mx)
-        mn = min(ui, mn)
+    @inbounds for i in lo:hi
+        u[i] = serialize(v[i], order)
     end
-    u, mn, mx
+    u
 end
 
 function deserialize!(v::AbstractVector, u::AbstractVector{U},