src/quicksort.singeli

include './partition'
include './median'
include './xorshift'
include './arith'  # Logs and square roots

def base_cases{src:*T, dst:*T, aux:*T, n:U, aux_bytes:usize, min:T} = {
  # Short array
  if (n < 32) {
    sort_lt32{dst, src, n}
    return{}
  }

  # Distribution base cases
  if (isint{T}) {
    range := dist{min, dst->n}
    if (U^~(range/4) < n and U^~range < aux_bytes/bytes{U}) {
      # Always sort in place on dst
      # Counting sort could have a different src/dst, but if src isn't
      # dst then it's equal to aux, and we need that space
      if (src != dst) set{dst, src, n}
      count_sort{dst, u32<~n, *u32~~aux, min, u32<~range + 1}
      return{}
    }
    if (width{T} == 32 and n <= (1<<16) and range < (1<<16)) {
      radpack32(*u32~~src, *u32~~dst, u32<~n, *void~~aux, u32~~min)
      return{}
    }
  }
}

# Robin Hood approval state
def RH_UNTRIED = 0
def RH_APPROVED = 2

def get_pivot{array:pT, n:U, getaux, sort, proc_pivots, rh_state} = {
  # log_2 of minimum size for sampling
  sl0:U = 8
  # Output array and index
  arr:=array; ind:U = 0
  if (n <= 192) {
    ind = locate_3_median{array, n}
  } else if (rh_state!=RH_UNTRIED and n <= 1024) {
    ind = locate_3of3_pseudomedian{array, n}
  } else if (rh_state!=RH_UNTRIED and n <= 1 << (sl0 = 14)) {
    ind = locate_5of3_pseudomedian{array, n}
  } else {
    aux := getaux{}
    # gap is the expected distance between adjacent samples
    # We'll get about n/gap samples
    log2:U = floor_log2{n, sl0}
    gap_min := 1 << (log2 / 2)
    gap := sqrt_approx{n, gap_min}

    # Collect samples with split xorshift and add to aux
    aux1 := aux
    def add{ind} = { aux1 <- array->ind; ++aux1 }
    mask := gap_min - 1
    def add3 = make_split_xorshift{tup{13,17,5}, n, mask, add}

    i:U = 0; while (i < n - (mask + 2 * gap)) add3{i, gap}
    ns := aux1 - aux
    sort{aux, ns, *void~~aux1, ns*bytes{eltype{pT}}}
    proc_pivots{aux, ns}
    arr = aux
    ind = ns / 2
  }
  arr -> ind
}

# Fluxsort recurrence: partition, handle right then left side
# src may be equal to dst or aux
def flux_recur{recur, tailcall, piv:T, src:*T, dst:*T, aux:*T, n:U, aux_bytes:usize, min:T, rh_state:u8} = {

  # Partition: left side directly in dst with length l, includes pivots
  l:U = 0
  rsrc := aux
  if (LIKELY{n < 1<<14}) {
    l = flux_partition{src, <=, piv, dst, aux, n}
  } else {
    # Previous partitions must have used fulcrum too, so we haven't yet
    # touched aux, and src==dst
    l = fulcrum_partition{dst, <=, piv, aux, n}  # (unstable)
    rsrc = dst + l
  }
  r := n - l  # Values on the right
  m := l      # Values not on the right

  # Never recurse on a partition with more than this many values
  most := n - n/16

  # If most values end up on the left, they're probably mostly pivots
  if (l > most) {
    # Partition again with pivots on the right
    # This bounds performance by O(k*n) for only k unique values
    l = flux_partition{dst, <, piv, dst, aux+r, m}
    set{dst+l, aux+r, m-l} # Should probably write a reverse partition to avoid this
  }

  # Sort the right-hand side, moving it from rsrc to dst
  rdst := dst + m
  if (r > most) { # Unbalanced
    set{rdst, rsrc, r}
    piposort{T}(rdst, r, rsrc)
  } else {
    recur{rsrc, rdst, aux, r, aux_bytes, piv, rh_state}
  }

  if (l > most) { # Unbalanced
    piposort{T}(dst, l, aux)
    return{}
  }

  # Left-hand side by tail call
  src = dst
  n = l
}

flux_loop{U, T}(src:*T, dst:*T, aux:*T, n:U, aux_bytes:usize, min:T, rh_state:u8) : void = {
  while (u1~~1) {
    base_cases{src, dst, aux, n, aux_bytes, min}

    # Find pivot and check for RH sorting
    def getaux{} = { a:=aux; if (a==src) a=dst; a }
    def proc_pivots{pivots, num} = {
      if (isint{T} and n<=1<<17 and rh_state!=RH_APPROVED) {
        # 1 if tried and not approved, 2 for RH_APPROVED
        rh_state = 1 + u8^~checkdist{pivots, num, min, n, dist{U, min, dst->n}}
      }
    }
    def fn{gen} = bind{call, gen{U,T}}
    piv := get_pivot{src, n, getaux, fn{flux_sort}, proc_pivots, rh_state}
    if (isint{T} and rh_state == RH_APPROVED) {
      try_rh{src, dst, n, aux, aux_bytes, min, dst->n, merge_chk, merge_from{merge_chk}}
    }

    # Then finish sorting; loop around for tail call
    flux_recur{fn{flux_loop},1, piv, src,dst,aux,n,aux_bytes, min,rh_state}
  }
}

flux_sort{U, T}(x:*T, n:U, aux:*void, aux_bytes:usize) : void = {
  if (n <= 192) {
    piposort{T}(x, n, *T~~aux)
    return{}
  }

  # Find the minimum value and index of last maximum
  def block = 1024
  min := x->0; max := min
  i:U = 0; imax:=i
  do {
    i0 := i
    i += block; if (i > n) i = n
    blm := x->i0
    @for (x over j from i0 to i) {
      if (x < min) min=x
      if (x > blm) blm=x
    }
    if (blm >= max) { max=blm; imax=i; } # Save block index; refine later
  } while (i < n)

  if (min == max) return{}
  do { --imax } while (x->imax < max)
  x <-{imax} x->(n-1)  # (unstable)
  x <-{n-1} max

  # Now sort
  flux_loop{U,T}(x, x, *T~~aux, n-1, aux_bytes, min, 0)
}