From 5dfd6c959fb0f08e4fa9dd838d9b480d37ca8998 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner <60898866+LilithHafner@users.noreply.github.com> Date: Sat, 2 Apr 2022 21:26:38 -0500 Subject: [PATCH] Add radix sort (#44230) * Add radix sort --- base/sort.jl | 351 ++++++++++++++++++++++++++++++++++++++++++++---- test/sorting.jl | 157 ++++++++++++++++++++-- 2 files changed, 468 insertions(+), 40 deletions(-) diff --git a/base/sort.jl b/base/sort.jl index 981eea35d96ab..c8cacb9770c54 100644 --- a/base/sort.jl +++ b/base/sort.jl @@ -10,8 +10,8 @@ using .Base: copymutable, LinearIndices, length, (:), iterate, AbstractVector, @inbounds, AbstractRange, @eval, @inline, Vector, @noinline, AbstractMatrix, AbstractUnitRange, isless, identity, eltype, >, <, <=, >=, |, +, -, *, !, extrema, sub_with_overflow, add_with_overflow, oneunit, div, getindex, setindex!, - length, resize!, fill, Missing, require_one_based_indexing, keytype, - UnitRange, max, min + length, resize!, fill, Missing, require_one_based_indexing, keytype, UnitRange, + min, max, reinterpret, signed, unsigned, Signed, Unsigned, typemin, xor, Type, BitSigned using .Base: >>>, !== @@ -426,6 +426,22 @@ struct InsertionSortAlg <: Algorithm end struct QuickSortAlg <: Algorithm end struct MergeSortAlg <: Algorithm end +""" + AdaptiveSort(fallback) + +Indicate that a sorting function should use the fastest available algorithm. + +Adaptive sort will use the algorithm specified by `fallback` for types and orders that are +not [`UIntMappable`](@ref). Otherwise, it will typically use: + * Insertion sort for short vectors + * Radix sort for long vectors + * Counting sort for vectors of integers spanning a short range + +Adaptive sort is guaranteed to be stable if the fallback algorithm is stable. +""" +struct AdaptiveSort{Fallback <: Algorithm} <: Algorithm + fallback::Fallback +end """ PartialQuickSort{T <: Union{Integer,OrdinalRange}} @@ -451,7 +467,7 @@ end Indicate that a sorting function should use the insertion sort algorithm. Insertion sort traverses the collection one element at a time, inserting each element into its correct, sorted position in -the output list. +the output vector. Characteristics: * *stable*: preserves the ordering of elements which @@ -495,8 +511,8 @@ Characteristics: """ const MergeSort = MergeSortAlg() -const DEFAULT_UNSTABLE = QuickSort -const DEFAULT_STABLE = MergeSort +const DEFAULT_UNSTABLE = AdaptiveSort(QuickSort) +const DEFAULT_STABLE = AdaptiveSort(MergeSort) const SMALL_ALGORITHM = InsertionSort const SMALL_THRESHOLD = 20 @@ -652,13 +668,202 @@ function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::PartialQuickSort, return v end +# This is a stable least significant bit first radix sort. +# +# That is, it first sorts the entire vector by the last chunk_size bits, then by the second +# to last chunk_size bits, and so on. Stability means that it will not reorder two elements +# that compare equal. This is essential so that the order introduced by earlier, +# less significant passes is preserved by later passes. +# +# Each pass divides the input into 2^chunk_size == mask+1 buckets. To do this, it +# * counts the number of entries that fall into each bucket +# * uses those counts to compute the indices to move elements of those buckets into +# * moves elements into the computed indices in the swap array +# * switches the swap and working array +# +# In the case of an odd number of passes, the returned vector will === the input vector t, +# not v. This is one of the many reasons radix_sort! is not exported. 
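As a concrete illustration of the passes described above, here is a minimal, self-contained sketch of one stable counting pass over an 8-bit chunk. It is not part of the patch: the helper name `single_radix_pass` and the fixed chunk size of 8 are assumptions for illustration; it only mirrors the bucketing logic of `radix_sort!` below on a toy input.

# One stable counting pass over the 8-bit chunk selected by `shift`.
function single_radix_pass(v::Vector{UInt16}, shift::Integer)
    mask = UInt(1) << 8 - 0x1             # 0xff, one 8-bit chunk
    counts = zeros(UInt, Int(mask) + 2)   # counts[2:end] will hold bucket sizes
    for x in v
        counts[(x >> shift)&mask + 2] += 1
    end
    counts[1] = 1                         # target index of the first bucket
    cumsum!(counts, counts)               # turn bucket sizes into target indices
    t = similar(v)
    for x in v                            # move each element into its bucket's
        i = (x >> shift)&mask + 1         # next free slot in t, preserving the
        t[counts[i]] = x                  # order of equal chunks (stability)
        counts[i] += 1
    end
    t
end

# single_radix_pass(UInt16[0x0201, 0x0102, 0x0202, 0x0101], 0) groups by the low
# byte, giving [0x0201, 0x0101, 0x0102, 0x0202]; a second pass with shift = 8 then
# completes the sort, because each pass preserves the order produced by earlier passes.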
+function radix_sort!(v::AbstractVector{U}, lo::Integer, hi::Integer, bits::Unsigned, + t::AbstractVector{U}, chunk_size=radix_chunk_size_heuristic(lo, hi, bits)) where U <: Unsigned + # bits is unsigned for performance reasons. + mask = UInt(1) << chunk_size - 0x1 + counts = Vector{UInt}(undef, mask+2) + + @inbounds for shift in 0:chunk_size:bits-1 + + # counts[2:mask+2] will store the number of elements that fall into each bucket. + # if chunk_size = 8, counts[2] is bucket 0x00 and counts[257] is bucket 0xff. + counts .= 0 + for k in lo:hi + x = v[k] # lookup the element + i = (x >> shift)&mask + 2 # compute its bucket's index for this pass + counts[i] += 1 # increment that bucket's count + end + + counts[1] = lo # set target index for the first bucket + cumsum!(counts, counts) # set target indices for subsequent buckets + # counts[1:mask+1] now stores indices where the first member of each bucket + # belongs, not the number of elements in each bucket. We will put the first element + # of bucket 0x00 in t[counts[1]], the next element of bucket 0x00 in t[counts[1]+1], + # and the last element of bucket 0x00 in t[counts[2]-1]. + + for k in lo:hi + x = v[k] # lookup the element + i = (x >> shift)&mask + 1 # compute its bucket's index for this pass + j = counts[i] # lookup the target index + t[j] = x # put the element where it belongs + counts[i] = j + 1 # increment the target index for the next + end # ↳ element in this bucket + + v, t = t, v # swap the now sorted destination vector t back into primary vector v + + end + + v +end +function radix_chunk_size_heuristic(lo::Integer, hi::Integer, bits::Unsigned) + # chunk_size is the number of bits to radix over at once. + # We need to allocate an array of size 2^chunk size, and on the other hand the higher + # the chunk size the fewer passes we need. Theoretically, chunk size should be based on + # the Lambert W function applied to length. Empirically, we use this heuristic: + guess = min(10, log(maybe_unsigned(hi-lo))*3/4+3) + # TODO the maximum chunk size should be based on archetecture cache size. + + # We need iterations * chunk size ≥ bits, and these cld's + # make an effort to get iterations * chunk size ≈ bits + UInt8(cld(bits, cld(bits, guess))) +end + +# For AbstractVector{Bool}, counting sort is always best. +# This is an implementation of counting sort specialized for Bools. +function sort!(v::AbstractVector{<:Bool}, lo::Integer, hi::Integer, a::AdaptiveSort, o::Ordering) + first = lt(o, false, true) ? false : lt(o, true, false) ? true : return v + count = 0 + @inbounds for i in lo:hi + if v[i] == first + count += 1 + end + end + @inbounds v[lo:lo+count-1] .= first + @inbounds v[lo+count:hi] .= !first + v +end + +maybe_unsigned(x::Integer) = x # this is necessary to avoid calling unsigned on BigInt +maybe_unsigned(x::BitSigned) = unsigned(x) +function _extrema(v::AbstractArray, lo::Integer, hi::Integer, o::Ordering) + mn = mx = v[lo] + @inbounds for i in (lo+1):hi + vi = v[i] + lt(o, vi, mn) && (mn = vi) + lt(o, mx, vi) && (mx = vi) + end + mn, mx +end +function sort!(v::AbstractVector, lo::Integer, hi::Integer, a::AdaptiveSort, o::Ordering) + # if the sorting task is not UIntMappable, then we can't radix sort or sort_int_range! + # so we skip straight to the fallback algorithm which is comparison based. 
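+    # (this happens, for example, when sorting with an arbitrary `by` transformation,
+    # which produces a `By` ordering, or when sorting an element type for which no
+    # uint mapping is defined)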
+ U = UIntMappable(eltype(v), o) + U === nothing && return sort!(v, lo, hi, a.fallback, o) + + # to avoid introducing excessive detection costs for the trivial sorting problem + # and to avoid overflow, we check for small inputs before any other runtime checks + hi <= lo && return v + lenm1 = maybe_unsigned(hi-lo) # adding 1 would risk overflow + # only count sort on a short range can compete with insertion sort when lenm1 < 40 + # and the optimization is not worth the detection cost, so we use insertion sort. + lenm1 < 40 && return sort!(v, lo, hi, SMALL_ALGORITHM, o) + + # For most arrays, a presorted check is cheap (overhead < 5%) and for most large + # arrays it is essentially free (<1%). Insertion sort runs in a fast O(n) on presorted + # input and this guarantees presorted input will always be efficiently handled + issorted(view(v, lo:hi), o) && return v + + # For large arrays, a reverse-sorted check is essentially free (overhead < 1%) + if lenm1 >= 500 && issorted(view(v, lo:hi), ReverseOrdering(o)) + reverse!(view(v, lo:hi)) + return v + end + + # UInt128 does not support fast bit shifting so we never + # dispatch to radix sort but we may still perform count sort + if sizeof(U) > 8 + if eltype(v) <: Integer && o isa DirectOrdering + v_min, v_max = _extrema(v, lo, hi, Forward) + v_range = maybe_unsigned(v_max-v_min) + v_range == 0 && return v # all same + + # we know lenm1 ≥ 40, so this will never underflow. + # if lenm1 > 3.7e18 (59 exabytes), then this may incorrectly dispatch to fallback + if v_range < 5lenm1-100 # count sort will outperform comparison sort if v's range is small + return sort_int_range!(v, Int(v_range+1), v_min, o === Forward ? identity : reverse, lo, hi) + end + end + return sort!(v, lo, hi, a.fallback, o) + end + + v_min, v_max = _extrema(v, lo, hi, o) + lt(o, v_min, v_max) || return v # all same + if eltype(v) <: Integer && o isa DirectOrdering + R = o === Reverse + v_range = maybe_unsigned(R ? v_min-v_max : v_max-v_min) + if v_range < div(lenm1, 2) # count sort will be superior if v's range is very small + return sort_int_range!(v, Int(v_range+1), R ? v_max : v_min, R ? reverse : identity, lo, hi) + end + end + + u_min, u_max = uint_map(v_min, o), uint_map(v_max, o) + u_range = maybe_unsigned(u_max-u_min) + if u_range < div(lenm1, 2) # count sort will be superior if u's range is very small + u = uint_map!(v, lo, hi, o) + sort_int_range!(u, Int(u_range+1), u_min, identity, lo, hi) + return uint_unmap!(v, u, lo, hi, o) + end + + # if u's range is small, then once we subtract out v_min, we'll get a vector like + # UInt16[0x001a, 0x0015, 0x0006, 0x001b, 0x0008, 0x000c, 0x0001, 0x000e, 0x001c, 0x0009] + # where we only need to radix over the last few bits (5, in the example). + bits = unsigned(8sizeof(u_range) - leading_zeros(u_range)) + + # radix sort runs in O(bits * lenm1), insertion sort runs in O(lenm1^2). Radix sort + # has a constant factor that is three times higher, so radix runtime is 3bits * lenm1 + # and insertion runtime is lenm1^2. Empirically, insertion is faster than radix iff + # lenm1 < 3bits. + # Insertion < Radix + # lenm1^2 < 3 * bits * lenm1 + # lenm1 < 3bits + if lenm1 < 3bits + # at lenm1 = 64*3-1, QuickSort is about 20% faster than InsertionSort. + alg = a.fallback === QuickSort && lenm1 > 120 ? QuickSort : SMALL_ALGORITHM + return sort!(v, lo, hi, alg, o) + end + + # At this point, we are committed to radix sort. + u = uint_map!(v, lo, hi, o) + + # we subtract u_min to avoid radixing over unnecessary bits. 
For example, + # Int32[3, -1, 2] uint_maps to UInt32[0x80000003, 0x7fffffff, 0x80000002] + # which uses all 32 bits, but once we subtract u_min = 0x7fffffff, we are left with + # UInt32[0x00000004, 0x00000000, 0x00000003] which uses only 3 bits, and + # Float32[2.012, 400.0, 12.345] uint_maps to UInt32[0x3fff3b63, 0x3c37ffff, 0x414570a4] + # which is reduced to UInt32[0x03c73b64, 0x00000000, 0x050d70a5] using only 26 bits. + # the overhead for this subtraction is small enough that it is worthwhile in many cases. + + # this is faster than u[lo:hi] .-= u_min as of v1.9.0-DEV.100 + @inbounds for i in lo:hi + u[i] -= u_min + end + + u2 = radix_sort!(u, lo, hi, bits, similar(u)) + uint_unmap!(v, u2, lo, hi, o, u_min) +end ## generic sorting methods ## defalg(v::AbstractArray) = DEFAULT_STABLE defalg(v::AbstractArray{<:Union{Number, Missing}}) = DEFAULT_UNSTABLE -defalg(v::AbstractArray{Missing}) = DEFAULT_UNSTABLE -defalg(v::AbstractArray{Union{}}) = DEFAULT_UNSTABLE +defalg(v::AbstractArray{Missing}) = DEFAULT_UNSTABLE # for method disambiguation +defalg(v::AbstractArray{Union{}}) = DEFAULT_UNSTABLE # for method disambiguation function sort!(v::AbstractVector, alg::Algorithm, order::Ordering) inds = axes(v,1) @@ -711,31 +916,20 @@ function sort!(v::AbstractVector; by=identity, rev::Union{Bool,Nothing}=nothing, order::Ordering=Forward) - ordr = ord(lt,by,rev,order) - if (ordr === Forward || ordr === Reverse) && eltype(v)<:Integer - n = length(v) - if n > 1 - min, max = extrema(v) - (diff, o1) = sub_with_overflow(max, min) - (rangelen, o2) = add_with_overflow(diff, oneunit(diff)) - if !o1 && !o2 && rangelen < div(n,2) - return sort_int_range!(v, rangelen, min, ordr === Reverse ? reverse : identity) - end - end - end - sort!(v, alg, ordr) + sort!(v, alg, ord(lt,by,rev,order)) end # sort! for vectors of few unique integers -function sort_int_range!(x::AbstractVector{<:Integer}, rangelen, minval, maybereverse) +function sort_int_range!(x::AbstractVector{<:Integer}, rangelen, minval, maybereverse, + lo=firstindex(x), hi=lastindex(x)) offs = 1 - minval counts = fill(0, rangelen) - @inbounds for i = eachindex(x) + @inbounds for i = lo:hi counts[x[i] + offs] += 1 end - idx = firstindex(x) + idx = lo @inbounds for i = maybereverse(1:rangelen) lastidx = idx + counts[i] - 1 val = i-offs @@ -1109,15 +1303,104 @@ function sort!(A::AbstractArray; A end + +## uint mapping to allow radix sorting primitives other than UInts ## + +""" + UIntMappable(T::Type, order::Ordering) + +Return `typeof(uint_map(x::T, order))` if [`uint_map`](@ref) and +[`uint_unmap`](@ref) are implemented. + +If either is not implemented, return `nothing`. +""" +UIntMappable(T::Type, order::Ordering) = nothing + +""" + uint_map(x, order::Ordering)::Unsigned + +Map `x` to an un unsigned integer, maintaining sort order. + +The map should be reversible with [`uint_unmap`](@ref), so `isless(order, a, b)` must be +a linear ordering for `a, b <: typeof(x)`. Satisfies +`isless(order, a, b) === (uint_map(a, order) < uint_map(b, order))` +and `x === uint_unmap(typeof(x), uint_map(x, order), order)` + +See also: [`UIntMappable`](@ref) [`uint_unmap`](@ref) +""" +function uint_map end + +""" + uint_unmap(T::Type, u::Unsigned, order::Ordering) + +Reconstruct the unique value `x::T` that uint_maps to `u`. Satisfies +`x === uint_unmap(T, uint_map(x::T, order), order)` for all `x <: T`. 
+ +See also: [`uint_map`](@ref) [`UIntMappable`](@ref) +""" +function uint_unmap end + + +### Primitive Types + +# Integers +uint_map(x::Unsigned, ::ForwardOrdering) = x +uint_unmap(::Type{T}, u::T, ::ForwardOrdering) where T <: Unsigned = u + +uint_map(x::Signed, ::ForwardOrdering) = + unsigned(xor(x, typemin(x))) +uint_unmap(::Type{T}, u::Unsigned, ::ForwardOrdering) where T <: Signed = + xor(signed(u), typemin(T)) + +# unsigned(Int) is not available during bootstrapping. +for (U, S) in [(UInt8, Int8), (UInt16, Int16), (UInt32, Int32), (UInt64, Int64), (UInt128, Int128)] + @eval UIntMappable(::Type{<:Union{$U, $S}}, ::ForwardOrdering) = $U +end + +# Floats are not UIntMappable under regular orderings because they fail on NaN edge cases. +# uint mappings for floats are defined in Float, where the Left and Right orderings +# guarantee that there are no NaN values + +# Chars +uint_map(x::Char, ::ForwardOrdering) = reinterpret(UInt32, x) +uint_unmap(::Type{Char}, u::UInt32, ::ForwardOrdering) = reinterpret(Char, u) +UIntMappable(::Type{Char}, ::ForwardOrdering) = UInt32 + +### Reverse orderings +uint_map(x, rev::ReverseOrdering) = ~uint_map(x, rev.fwd) +uint_unmap(T::Type, u::Unsigned, rev::ReverseOrdering) = uint_unmap(T, ~u, rev.fwd) +UIntMappable(T::Type, order::ReverseOrdering) = UIntMappable(T, order.fwd) + + +### Vectors + +# Convert v to unsigned integers in place, maintaining sort order. +function uint_map!(v::AbstractVector, lo::Integer, hi::Integer, order::Ordering) + u = reinterpret(UIntMappable(eltype(v), order), v) + @inbounds for i in lo:hi + u[i] = uint_map(v[i], order) + end + u +end + +function uint_unmap!(v::AbstractVector, u::AbstractVector{U}, lo::Integer, hi::Integer, + order::Ordering, offset::U=zero(U)) where U <: Unsigned + @inbounds for i in lo:hi + v[i] = uint_unmap(eltype(v), u[i]+offset, order) + end + v +end + + ## fast clever sorting for floats ## module Float using ..Sort using ...Order -using ..Base: @inbounds, AbstractVector, Vector, last, axes, Missing +using ..Base: @inbounds, AbstractVector, Vector, last, axes, Missing, Type, reinterpret import Core.Intrinsics: slt_int -import ..Sort: sort! 
+import ..Sort: sort!, UIntMappable, uint_map, uint_unmap import ...Order: lt, DirectOrdering const Floats = Union{Float32,Float64} @@ -1140,6 +1423,18 @@ right(o::Perm) = Perm(right(o.order), o.data) lt(::Left, x::T, y::T) where {T<:Floats} = slt_int(y, x) lt(::Right, x::T, y::T) where {T<:Floats} = slt_int(x, y) +uint_map(x::Float32, ::Left) = ~reinterpret(UInt32, x) +uint_unmap(::Type{Float32}, u::UInt32, ::Left) = reinterpret(Float32, ~u) +uint_map(x::Float32, ::Right) = reinterpret(UInt32, x) +uint_unmap(::Type{Float32}, u::UInt32, ::Right) = reinterpret(Float32, u) +UIntMappable(::Type{Float32}, ::Union{Left, Right}) = UInt32 + +uint_map(x::Float64, ::Left) = ~reinterpret(UInt64, x) +uint_unmap(::Type{Float64}, u::UInt64, ::Left) = reinterpret(Float64, ~u) +uint_map(x::Float64, ::Right) = reinterpret(UInt64, x) +uint_unmap(::Type{Float64}, u::UInt64, ::Right) = reinterpret(Float64, u) +UIntMappable(::Type{Float64}, ::Union{Left, Right}) = UInt64 + isnan(o::DirectOrdering, x::Floats) = (x!=x) isnan(o::DirectOrdering, x::Missing) = false isnan(o::Perm, i::Integer) = isnan(o.order,o.data[i]) @@ -1221,6 +1516,10 @@ issignleft(o::ReverseOrdering, x::Floats) = lt(o, x, -zero(x)) issignleft(o::Perm, i::Integer) = issignleft(o.order, o.data[i]) function fpsort!(v::AbstractVector, a::Algorithm, o::Ordering) + # fpsort!'s optimizations speed up comparisons, of which there are O(nlogn). + # The overhead is O(n). For n < 10, it's not worth it. + length(v) < 10 && return sort!(v, first(axes(v,1)), last(axes(v,1)), SMALL_ALGORITHM, o) + i, j = lo, hi = specials2end!(v,a,o) @inbounds while true while i <= j && issignleft(o,v[i]); i += 1; end @@ -1240,7 +1539,7 @@ fpsort!(v::AbstractVector, a::Sort.PartialQuickSort, o::Ordering) = sort!(v::FPSortable, a::Algorithm, o::DirectOrdering) = fpsort!(v, a, o) -sort!(v::AbstractVector{<:Integer}, a::Algorithm, o::Perm{<:DirectOrdering,<:FPSortable}) = +sort!(v::AbstractVector{<:Union{Signed, Unsigned}}, a::Algorithm, o::Perm{<:DirectOrdering,<:FPSortable}) = fpsort!(v, a, o) end # module Sort.Float diff --git a/test/sorting.jl b/test/sorting.jl index 86479eca6cc78..2cb4eec93b380 100644 --- a/test/sorting.jl +++ b/test/sorting.jl @@ -265,7 +265,8 @@ Base.step(r::ConstantRange) = 0 @test searchsortedlast(r, UInt(1), Forward) == 5 a = rand(1:10000, 1000) - for alg in [InsertionSort, MergeSort] + for alg in [InsertionSort, MergeSort, Base.DEFAULT_STABLE] + b = sort(a, alg=alg) @test issorted(b) @@ -330,15 +331,17 @@ Base.step(r::ConstantRange) = 0 end @testset "unstable algorithms" begin - b = sort(a, alg=QuickSort) - @test issorted(b) - @test last(b) == last(sort(a, alg=PartialQuickSort(length(a)))) - b = sort(a, alg=QuickSort, rev=true) - @test issorted(b, rev=true) - @test last(b) == last(sort(a, alg=PartialQuickSort(length(a)), rev=true)) - b = sort(a, alg=QuickSort, by=x->1/x) - @test issorted(b, by=x->1/x) - @test last(b) == last(sort(a, alg=PartialQuickSort(length(a)), by=x->1/x)) + for alg in [QuickSort, Base.DEFAULT_UNSTABLE] + b = sort(a, alg=alg) + @test issorted(b) + @test last(b) == last(sort(a, alg=PartialQuickSort(length(a)))) + b = sort(a, alg=alg, rev=true) + @test issorted(b, rev=true) + @test last(b) == last(sort(a, alg=PartialQuickSort(length(a)), rev=true)) + b = sort(a, alg=alg, by=x->1/x) + @test issorted(b, by=x->1/x) + @test last(b) == last(sort(a, alg=PartialQuickSort(length(a)), by=x->1/x)) + end end end @testset "insorted" begin @@ -464,7 +467,7 @@ end @test c == v # stable algorithms - for alg in [MergeSort] + for alg in [MergeSort, 
Base.DEFAULT_STABLE] p = sortperm(v, alg=alg, rev=rev) p2 = sortperm(float(v), alg=alg, rev=rev) @test p == p2 @@ -477,7 +480,7 @@ end end # unstable algorithms - for alg in [QuickSort, PartialQuickSort(1:n)] + for alg in [QuickSort, PartialQuickSort(1:n), Base.DEFAULT_UNSTABLE] p = sortperm(v, alg=alg, rev=rev) p2 = sortperm(float(v), alg=alg, rev=rev) @test p == p2 @@ -509,8 +512,9 @@ end v = randn_with_nans(n,0.1) # TODO: alg = PartialQuickSort(n) fails here - for alg in [InsertionSort, QuickSort, MergeSort], + for alg in [InsertionSort, QuickSort, MergeSort, Base.DEFAULT_UNSTABLE, Base.DEFAULT_STABLE], rev in [false,true] + alg === InsertionSort && n >= 3000 && continue # test float sorting with NaNs s = sort(v, alg=alg, rev=rev) @test issorted(s, rev=rev) @@ -570,7 +574,7 @@ end @test all(issorted, [sp[inds.==x] for x in 1:200]) end - for alg in [InsertionSort, MergeSort] + for alg in [InsertionSort, MergeSort, Base.DEFAULT_STABLE] sp = sortperm(inds, alg=alg) @test all(issorted, [sp[inds.==x] for x in 1:200]) end @@ -685,4 +689,129 @@ end @test searchsortedlast(o, 1.5) == -1 end +function adaptive_sort_test(v; trusted=InsertionSort, kw...) + sm = sum(hash.(v)) + truth = sort!(deepcopy(v); alg=trusted, kw...) + return ( + v === sort!(v; kw...) && + issorted(v; kw...) && + sum(hash.(v)) == sm && + all(v .=== truth)) +end +@testset "AdaptiveSort" begin + len = 70 + + @testset "Bool" begin + @test sort([false, true, false]) == [false, false, true] + @test sort([false, true, false], by=x->0) == [false, true, false] + @test sort([false, true, false], rev=true) == [true, false, false] + end + + @testset "fallback" begin + @test adaptive_sort_test(rand(1:typemax(Int32), len), by=x->x^2)# fallback + @test adaptive_sort_test(rand(Int, len), by=x->0, trusted=QuickSort) + end + + @test adaptive_sort_test(rand(Int, 20)) # InsertionSort + + @testset "large eltype" begin + for rev in [true, false] + @test adaptive_sort_test(rand(Int128, len), rev=rev) # direct ordered int + @test adaptive_sort_test(fill(rand(UInt128), len), rev=rev) # all same + @test adaptive_sort_test(rand(Int128.(1:len), len), rev=rev) # short int range + end + end + + @test adaptive_sort_test(fill(rand(), len)) # All same + + @testset "count sort" begin + @test adaptive_sort_test(rand(1:20, len)) + @test adaptive_sort_test(rand(1:20, len), rev=true) + end + + @testset "post-serialization count sort" begin + v = reinterpret(Float64, rand(1:20, len)) + @test adaptive_sort_test(copy(v)) + @test adaptive_sort_test(copy(v), rev=true) + end + + @testset "presorted" begin + @test adaptive_sort_test(sort!(rand(len))) + @test adaptive_sort_test(sort!(rand(Float32, len), rev=true)) + @test adaptive_sort_test(vcat(sort!(rand(Int16, len)), Int16(0))) + @test adaptive_sort_test(vcat(sort!(rand(UInt64, len), rev=true), 0)) + end + + @testset "lenm1 < 3bits fallback" begin + @test adaptive_sort_test(rand(len)) # InsertionSort + @test adaptive_sort_test(rand(130)) # QuickSort + end + + @test adaptive_sort_test(rand(1000)) # RadixSort +end + +@testset "uint mappings" begin + + #Construct value lists + floats = [T[-π, -1.0, -1/π, 1/π, 1.0, π, -0.0, 0.0, Inf, -Inf, NaN, -NaN, + prevfloat(T(0)), nextfloat(T(0)), prevfloat(T(Inf)), nextfloat(T(-Inf))] + for T in [Float16, Float32, Float64]] + + ints = [T[17, -T(17), 0, -one(T), 1, typemax(T), typemin(T), typemax(T)-1, typemin(T)+1] + for T in Base.BitInteger_types] + + char = Char['\n', ' ', Char(0), Char(8), Char(17), typemax(Char)] + + vals = vcat(floats, ints, [char]) + + #Add random values + 
UIntN(::Val{1}) = UInt8 + UIntN(::Val{2}) = UInt16 + UIntN(::Val{4}) = UInt32 + UIntN(::Val{8}) = UInt64 + UIntN(::Val{16}) = UInt128 + map(vals) do x + T = eltype(x) + U = UIntN(Val(sizeof(T))) + append!(x, rand(T, 4)) + append!(x, reinterpret.(T, rand(U, 4))) + if T <: AbstractFloat + mask = reinterpret(U, T(NaN)) + append!(x, reinterpret.(T, mask .| rand(U, 4))) + end + end + + for x in vals + T = eltype(x) + U = UIntN(Val(sizeof(T))) + for order in [Forward, Reverse, Base.Sort.Float.Left(), Base.Sort.Float.Right(), By(Forward, identity)] + if order isa Base.Order.By || T === Float16 || + ((T <: AbstractFloat) == (order isa DirectOrdering)) + @test Base.Sort.UIntMappable(T, order) === nothing + continue + end + + @test Base.Sort.UIntMappable(T, order) === U + x2 = deepcopy(x) + u = Base.Sort.uint_map!(x2, 1, length(x), order) + @test eltype(u) === U + @test all(Base.Sort.uint_map.(x, (order,)) .=== u) + mn = rand(U) + u .-= mn + @test x2 === Base.Sort.uint_unmap!(x2, u, 1, length(x), order, mn) + @test all(x2 .=== x) + + for a in x + for b in x + if order === Base.Sort.Float.Left() || order === Base.Sort.Float.Right() + # Left and Right orderings guarantee homogeneous sign and no NaNs + (isnan(a) || isnan(b) || signbit(a) != signbit(b)) && continue + end + @test Base.Order.lt(order, a, b) === Base.Order.lt(Forward, Base.Sort.uint_map(a, order), Base.Sort.uint_map(b, order)) + end + end + end + end +end + end
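A minimal usage sketch of the uint-mapping round trip that the tests above exercise, assuming a Julia build containing this patch. `uint_map`, `uint_unmap`, and `UIntMappable` are internal, unexported helpers, so importing them from `Base.Sort` is an assumption about internals rather than public API; the example values match the `Int32[3, -1, 2]` example in the comments of `sort!` above.

import Base.Sort: uint_map, uint_unmap, UIntMappable
using Base.Order: Forward

x = Int32[3, -1, 2]
@assert UIntMappable(Int32, Forward) === UInt32

# the mapping preserves sort order ...
u = uint_map.(x, (Forward,))     # UInt32[0x80000003, 0x7fffffff, 0x80000002]
@assert sortperm(u) == sortperm(x)

# ... and round-trips back to the original values
@assert uint_unmap.(Int32, u, (Forward,)) == x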