-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge.singeli
121 lines (105 loc) · 3.1 KB
/
merge.singeli
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Merge sorting
# Parity merge: branchless
# Main data movement
def parity_pos{left, right, dst, i} = {
l := left->0; r := right->i
c := l <= r
if (c) r=l; dst <-{i} r
right -= c
left += c
}
def parity_neg{left, right, dst, i} = {
l := left->(-i); r := right->0
c := l <= r
if (c) l=r; dst <-{-i} l
right -= c
left += c
}
# Merge halves of length-n array with constant n (4 and 8 used)
def parity_merge_const{n, dst, src} = {
def h = n / 2
left := clone{src}; right := src + h; dstc := clone{dst}
@for_const (i to h) parity_pos{left, right, dstc, i}
left = src + (h-1); right = src + (n-1); dstc = dst + (n-1)
@for_const (i to h) parity_neg{left, right, dstc, i}
}
# Merge halves of array
parity_merge{T, handle_odd}(dst:*T, src:*T, half:usize, n:usize) : void = {
lpos := src ; rpos := src + half ; dpos := dst
lneg := src + half - 1; rneg := src + n - 1; dneg := dst + n - 1
if (handle_odd and n%2) {
parity_pos{lpos, rpos, dpos, 0}
++rpos; ++dpos
}
@for_unroll{2} (i to n/2) {
parity_pos{lpos, rpos, dpos, i}
parity_neg{lneg, rneg, dneg, i}
}
}
# Merge arrays of length l and n-l starting at a, using buffer aux
# Can be done without moving both sides, but this way's easy
def merge_pair{x:pT, left:U, n:U, aux:pT} = {
set{aux, x, n}
parity_merge{eltype{pT}, 1}(x, aux, left, n)
}
# Merge array x of size n, if units of length block are pre-sorted
def merge_from{x:pT, n:U, aux:pT, block} = {
def T = eltype{pT}
src:=x; dst:=aux
w:U = block; while (w < n) {
ww:=2*w
i:U=0; while (i < n-w) {
l := n-i; if (l>ww) l=ww
parity_merge{T, 1}(dst+i, src+i, w, l)
i += ww
}
w = ww
src <-> dst
}
if (dst != x) set{x, dst, n}
}
# A simple bottom-down and not-very-adaptive merge sort using
# branchless merges in sets of 4
piposort{T}(x:*T, n:usize, aux:*T) : void = {
if (n < 32) {
sort_lt32{x, n}
return{}
}
# Approximate halves n = h1+h2 and quarters h1=q1+q2, h2=q3+q4
h1 := n / 2; q1 := h1 / 2; q2 := h1 - q1
h2 := n - h1; q3 := h2 / 2; q4 := h2 - q3
# Sort each quarter
def recur = piposort{T}
recur(x , q1, aux)
recur(x + q1 , q2, aux)
recur(x + h1 , q3, aux)
recur(x + h1 + q3, q4, aux)
# If each boundary between quarters is ordered, we're done
def ord{i} = x->(i-1) <= x->i
if (ord{q1} and ord{h1} and ord{h1 + q3}) return{}
# Also check for reversed quarters
if (x->0 > x->(h1-1) and x->q1 > x->(h1+q3-1) and x->h1 > x->(n-1)) {
set{aux, x, h1}
set{x, x + h1 + q3, q4}
set{x + q4, x + h1, q3}
set{x + h2, aux + q1, q2}
set{x + h2 + q2, aux, q1}
return{}
}
def merge = parity_merge{T, 1}
merge(aux , x , q1, h1)
merge(aux + h1, x + h1, q3, h2)
merge(x , aux , h1, n)
}
# Requires dst != aux; src may be dst or aux
half_piposort{T}(dst:*T, src:*T, n:usize, aux:*T) : void = {
if (n < 32) {
sort_lt32{dst, src, n}
return{}
}
h1 := n / 2; h2 := n - h1
def recur = half_piposort{T}
recur(aux, src, h1, dst)
recur(aux + h1, src + h1, h2, dst + h1)
parity_merge{T, 1}(dst, aux, h1, n)
}