-
Notifications
You must be signed in to change notification settings - Fork 63
/
misc-benches.cpp
314 lines (255 loc) · 15.2 KB
/
misc-benches.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
/*
* Miscellaneous benchmarks.
*
* A dumping ground, really.
*/
#include "benchmark.hpp"
#include "fmt/format.h"
#include "util.hpp"
extern "C" {
/* misc benches */
/*
 * Every name below is declared with type bench2_f and C linkage. None of them is
 * defined in this file; they are only referenced (mostly as template arguments)
 * by register_misc(). Declarations are grouped by name prefix, original order kept.
 */

// misc_* kernels, including the flag-merge series
bench2_f
    misc_add_loop32,
    misc_add_loop64,
    misc_port7,
    misc_fusion_add,
    misc_flag_merge_1,
    misc_flag_merge_2,
    misc_flag_merge_3,
    misc_flag_merge_4,
    misc_flag_merge_5,
    misc_flag_merge_6,
    misc_flag_merge_7,
    misc_flag_merge_8,
    misc_flag_merge_9;

// schor / macro-fusion / DSB alignment
bench2_f
    david_schor1,
    double_macro_fusion256,
    double_macro_fusion4000,
    dsb_alignment_cross64,
    dsb_alignment_nocross64;

// bmi_* and kreg_*
bench2_f
    bmi_tzcnt,
    bmi_lzcnt,
    bmi_popcnt,
    kreg_lat,
    kreg_lat_nz,
    kreg_lat_z,
    kreg_lat_mov;

// dendibakh_* / fusion_*
bench2_f
    dendibakh_fused,
    dendibakh_fused_simple,
    dendibakh_fused_add,
    dendibakh_fused_add_simple,
    dendibakh_unfused,
    fusion_better_fused,
    fusion_better_unfused,
    misc_macro_fusion_addjo;

// adc_* latency / throughput
bench2_f
    adc_0_lat,
    adc_1_lat,
    adc_rcx_lat,
    adc_0_tput,
    adc_1_tput,
    adc_rcx_tput;

// retpoline_* / indirect_*
bench2_f
    retpoline_dense_call_lfence,
    retpoline_dense_call_pause,
    retpoline_sparse_call_base,
    retpoline_sparse_indep_call_lfence,
    retpoline_sparse_indep_call_pause,
    retpoline_sparse_dep_call_lfence,
    retpoline_sparse_dep_call_pause,
    indirect_dense_call_pred,
    indirect_dense_call_unpred;

// loops
// (dep_add_noloop_128 is declared but not referenced by register_misc in this file)
bench2_f
    loop_weirdness_fast,
    tight_loop1,
    tight_loop2,
    tight_loop3,
    dep_add_noloop_128;

// vz* (used by the "studies/vzeroall" group)
bench2_f
    vz_samereg,
    vz_diffreg,
    vz_diffreg16,
    vz_diffreg16xor,
    vz256_samereg,
    vz256_diffreg,
    vz128_samereg,
    vz128_diffreg,
    vzsse_samereg,
    vzsse_diffreg;

// movd_* / rep_*
// (movd_rep is declared but not referenced by register_misc in this file)
bench2_f
    movd_xmm,
    movd_ymm,
    rep_movsb,
    movd_rep;

// adc chains, store weirdness, mov elimination, nested loop
bench2_f
    adc_chain32,
    adc_chain64,
    weird_store_mov,
    weird_store_xor,
    mov_elim,
    mov_elim_inc,
    nested_loop;
}
/**
 * Registers the miscellaneous benchmark groups with `list`.
 *
 * Groups are created and appended to `list` in this order: "misc", "dendibakh",
 * "bmi", "misc/retpoline", "avx512", "studies/vzeroall", "studies/movd",
 * "studies/repm" and "studies/nested".
 *
 * The entire body is compiled out when UARCH_BENCH_PORTABLE is non-zero; in that
 * configuration the function leaves `list` untouched.
 *
 * @tparam TIMER timer type forwarded to StaticMaker / DeltaMaker
 * @param list   receives the newly created benchmark groups
 */
template <typename TIMER>
void register_misc(GroupList& list) {
#if !UARCH_BENCH_PORTABLE
    std::shared_ptr<BenchmarkGroup> misc_group = std::make_shared<BenchmarkGroup>("misc", "Miscellaneous tests");

    using default_maker = StaticMaker<TIMER>;

    const uint32_t iters = 10*1000;
    auto maker = DeltaMaker<TIMER>(misc_group.get(), iters);
    // NOTE(review): the two add loops below are registered through a maker that requires
    // the BMI1 feature and overrides the loop count to 3,000,000 - confirm that gating
    // plain add loops on BMI1 is intended.
    auto makerbmi1 = maker.setFeatures({BMI1}).setLoopCount(3 * 1000 * 1000);

    makerbmi1.template make<misc_add_loop32>("add-32", "32-bit add-loop", 1);
    makerbmi1.template make<misc_add_loop64>("add-64", "64-bit add-loop", 1);

    auto benches = std::vector<Benchmark> {
        default_maker::template make_bench<misc_port7>(misc_group.get(), "port7", "Can port7 be used by loads", 1,
                null_provider, iters),
        default_maker::template make_bench<misc_fusion_add>(misc_group.get(), "fusion-add", "Test micro-fused add", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_macro_fusion_addjo>(misc_group.get(), "add-jo-fusion", "Add-JO fusion", 128,
                null_provider, iters),

        default_maker::template make_bench<adc_0_lat>(misc_group.get(), "adc-0-lat", "adc reg, 0 latency", 128,
                null_provider, iters),
        default_maker::template make_bench<adc_1_lat>(misc_group.get(), "adc-1-lat", "adc reg, 1 latency", 128,
                null_provider, iters),
        default_maker::template make_bench<adc_rcx_lat>(misc_group.get(), "adc-reg-lat", "adc reg,zero-reg latency", 128,
                null_provider, iters),
        default_maker::template make_bench<adc_0_tput>(misc_group.get(), "adc-0-tput", "adc reg, 0 throughput", 128,
                null_provider, iters),
        default_maker::template make_bench<adc_1_tput>(misc_group.get(), "adc-1-tput", "adc reg, 1 throughput", 128,
                null_provider, iters),
        default_maker::template make_bench<adc_rcx_tput>(misc_group.get(), "adc-rcx-tput", "adc reg,zero-reg throughput", 128,
                null_provider, iters),

        default_maker::template make_bench<misc_flag_merge_1>(misc_group.get(), "flag-merge-1", "Flag merge 1", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_flag_merge_2>(misc_group.get(), "flag-merge-2", "Flag merge 2", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_flag_merge_3>(misc_group.get(), "flag-merge-3", "Flag merge 3", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_flag_merge_4>(misc_group.get(), "flag-merge-4", "Flag merge 4", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_flag_merge_5>(misc_group.get(), "flag-merge-5", "Flag merge 5", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_flag_merge_6>(misc_group.get(), "flag-merge-6", "Flag merge cmovbe", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_flag_merge_7>(misc_group.get(), "flag-merge-7", "Flag merge cmovc", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_flag_merge_8>(misc_group.get(), "flag-merge-8", "Flag merge cmovbe (no merge)", 128,
                null_provider, iters),
        default_maker::template make_bench<misc_flag_merge_9>(misc_group.get(), "flag-merge-9", "Flag merge macro-fuse and", 128,
                null_provider, iters),

        default_maker::template make_bench<david_schor1>(misc_group.get(), "schor1", "Suggested by David Schor", 1,
                null_provider, iters),
        default_maker::template make_bench<double_macro_fusion256>(misc_group.get(), "double-macro-fuse", "Double not-taken macro fusion", 256,
                null_provider, iters),
        default_maker::template make_bench<double_macro_fusion4000>(misc_group.get(), "double-macro-fuse4000", "Double macro fusion (MITE)", 4000,
                null_provider, iters),

        // the tight loops run with ten times the default iteration count
        default_maker::template make_bench<tight_loop1>(misc_group.get(), "tight-loop1", "Tight dec loop", 1,
                null_provider, iters * 10),
        default_maker::template make_bench<tight_loop2>(misc_group.get(), "tight-loop2", "Tight dec loop taken jmp", 1,
                null_provider, iters * 10),
        default_maker::template make_bench<tight_loop3>(misc_group.get(), "tight-loop3", "Tight dec loop untaken jmp", 1,
                null_provider, iters * 10),

        // https://news.ycombinator.com/item?id=15935283
        default_maker::template make_bench<loop_weirdness_fast>(misc_group.get(), "loop-weirdness-fast", "Loop weirdness fast", 1,
                []{ return aligned_ptr(1024, 1024); }, 10000),

        // private email
        default_maker::template make_bench<adc_chain32>(misc_group.get(), "adc-chain32", "adc add chain 32-bit", 1000,
                []{ return nullptr; }, 10000),
        default_maker::template make_bench<adc_chain64>(misc_group.get(), "adc-chain64", "adc add chain 64-bit", 1000,
                []{ return nullptr; }, 10000),

        // case where when using the LSD, a loop with 2 stores apparently takes an extra cycle
        // Reported by Alexander Monakov in https://github.com/travisdowns/bimodal-performance/issues/4
        default_maker::template make_bench<weird_store_mov>(misc_group.get(), "weird-store-mov", "Store LSD weirdness, mov 0", 1000,
                []{ return nullptr; }, 10000),
        default_maker::template make_bench<weird_store_xor>(misc_group.get(), "weird-store-xor", "Store LSD weirdness, xor zero", 1000,
                []{ return nullptr; }, 10000),

        default_maker::template make_bench<mov_elim>(misc_group.get(), "mov-elim", "8 chained moves", 1),
        default_maker::template make_bench<mov_elim_inc>(misc_group.get(), "mov-elim-inc", "8 chained movs and inc", 1),
    };

    misc_group->add(benches);
    list.push_back(misc_group);

    // Tests from https://dendibakh.github.io/blog/2018/02/04/Micro-ops-fusion
    std::shared_ptr<BenchmarkGroup> dendibakh = std::make_shared<BenchmarkGroup>("dendibakh", "Fusion tests from dendibakh blog");

    dendibakh->add(std::vector<Benchmark> {
        // https://dendibakh.github.io/blog/2018/01/18/Code_alignment_issues
        default_maker::template make_bench<dsb_alignment_cross64>(dendibakh.get(), "dsb-align64-cross", "Crosses 64-byte i-boundary", 1,
                []{ return aligned_ptr(1024, 1024); }, 1024),
        default_maker::template make_bench<dsb_alignment_nocross64>(dendibakh.get(), "dsb-align64-nocross", "No cross 64-byte i-boundary", 1,
                []{ return aligned_ptr(1024, 1024); }, 1024),

        default_maker::template make_bench<dendibakh_fused>(dendibakh.get(), "fused-original", "Fused (original)", 1, null_provider, 1024),
        default_maker::template make_bench<dendibakh_fused_simple>(dendibakh.get(), "fused-simple", "Fused (simple addr)", 1, null_provider, 1024),
        default_maker::template make_bench<dendibakh_fused_add>(dendibakh.get(), "fused-add", "Fused (add [reg + reg * 4], 1)", 1, null_provider, 1024),
        default_maker::template make_bench<dendibakh_fused_add_simple>(dendibakh.get(), "fused-add-simple", "Fused (add [reg], 1)", 1, null_provider, 1024),
        default_maker::template make_bench<dendibakh_unfused>(dendibakh.get(), "unfused-original", "Unfused (original)", 1, null_provider, 1024),

        default_maker::template make_bench<fusion_better_fused>(dendibakh.get(), "fusion-better-fused", "Fused summation", 1, []{ return aligned_ptr(64, 8000); }, 1024),
        default_maker::template make_bench<fusion_better_unfused>(dendibakh.get(), "fusion-better-unfused", "Unfused summation", 1, []{ return aligned_ptr(64, 8000); }, 1024)
    });
    list.push_back(dendibakh);

    {
        std::shared_ptr<BenchmarkGroup> bmi_group = std::make_shared<BenchmarkGroup>("bmi", "BMI false-dependency tests");
        list.push_back(bmi_group);
        // no feature requirement is set on this maker, only the "default" tag
        auto bmi_maker = DeltaMaker<TIMER>(bmi_group.get()).setTags({"default"});

        bmi_maker.template make<bmi_tzcnt>("dep-tzcnt", "dest-dependent tzcnt", 128);
        bmi_maker.template make<bmi_lzcnt>("dep-lzcnt", "dest-dependent lzcnt", 128);
        bmi_maker.template make<bmi_popcnt>("dep-popcnt", "dest-dependent popcnt", 128);
    }

    std::shared_ptr<BenchmarkGroup> retpoline_group = std::make_shared<BenchmarkGroup>("misc/retpoline", "retpoline tests");

    // the "sparse" variants pass retpoline_sparse_call_base as a second template argument
    retpoline_group->add(std::vector<Benchmark> {
        default_maker::template make_bench<retpoline_dense_call_pause>(retpoline_group.get(), "retp-call-pause", "Dense retpoline call pause", 32),
        default_maker::template make_bench<retpoline_dense_call_lfence>(retpoline_group.get(), "retp-call-lfence", "Dense retpoline call lfence", 32),
        default_maker::template make_bench<indirect_dense_call_pred>(retpoline_group.get(), "ibra-call-pred", "Dense indirect pred calls", 32),
        default_maker::template make_bench<indirect_dense_call_unpred>(retpoline_group.get(), "ibra-call-unpred", "Dense indirect unpred calls", 32),
        default_maker::template make_bench<retpoline_sparse_indep_call_pause,retpoline_sparse_call_base>(retpoline_group.get(), "retp-sparse-indep-call-pause", "Sparse retpo indep call pause", 8),
        default_maker::template make_bench<retpoline_sparse_indep_call_lfence,retpoline_sparse_call_base>(retpoline_group.get(), "retp-sparse-indep-call-lfence", "Sparse retpo indep call lfence", 8),
        default_maker::template make_bench<retpoline_sparse_dep_call_pause,retpoline_sparse_call_base>(retpoline_group.get(), "retp-sparse-dep-call-pause", "Sparse retpo dep call pause", 8),
        default_maker::template make_bench<retpoline_sparse_dep_call_lfence,retpoline_sparse_call_base>(retpoline_group.get(), "retp-sparse-dep-call-lfence", "Sparse retpo dep call lfence", 8)
    });
    list.push_back(retpoline_group);

    {
        std::shared_ptr<BenchmarkGroup> group = std::make_shared<BenchmarkGroup>("avx512", "AVX512 stuff");
        list.push_back(group);
        // locally named so it does not shadow the function-level `maker`
        auto avx512_maker = DeltaMaker<TIMER>(group.get()).setTags({"default"}).setFeatures({AVX512F});

        avx512_maker.template make<kreg_lat>("kreg_lat", "kreg-GP roundtrip latency", 128);
        avx512_maker.template make<kreg_lat_nz>("kreg_lat_nz", "kreg-GP roundtrip + nonzeroing kxorb", 128);
        avx512_maker.template make<kreg_lat_z>("kreg_lat_z", "kreg-GP roundtrip + zeroing kxorb", 128);
        avx512_maker.template make<kreg_lat_mov>("kreg_lat_mov", "kreg-GP roundtrip + mov from GP", 128);
    }

    {
        std::shared_ptr<BenchmarkGroup> group = std::make_shared<BenchmarkGroup>("studies/vzeroall", "VZEROALL weirdness");
        list.push_back(group);
        auto vz_maker = DeltaMaker<TIMER>(group.get()).setTags({"default"});
        auto maker256 = vz_maker.setFeatures({AVX2});
        auto maker512 = vz_maker.setFeatures({AVX512F});

        maker512.template make<vz_samereg>("vz512-samereg", "vpaddq zmm0, zmm0, zmm0", 100);
        maker512.template make<vz_diffreg>("vz512-diffreg", "vpaddq zmm0, zmm1, zmm0", 100);
        maker512.template make<vz_diffreg16>("vz512-diff16", "vpaddq zmm0, zmm16, zmm0", 100);
        maker512.template make<vz_diffreg16xor>("vz512-diff16xor", "vpxor zmm16; vpaddq zmm0, zmm16, zmm0", 100);
        // the 256-bit, 128-bit and SSE variants are all registered with the AVX2 requirement
        maker256.template make<vz256_samereg>("vz256-samereg", "vpaddq ymm0, ymm0, ymm0", 100);
        maker256.template make<vz256_diffreg>("vz256-diffreg", "vpaddq ymm0, ymm1, ymm0", 100);
        maker256.template make<vz128_samereg>("vz128-samereg", "vpaddq xmm0, xmm0, xmm0", 100);
        maker256.template make<vz128_diffreg>("vz128-diffreg", "vpaddq xmm0, xmm1, xmm0", 100);
        maker256.template make<vzsse_samereg>("vzsse-samereg", "paddq xmm0, xmm0", 100);
        maker256.template make<vzsse_diffreg>("vzsse-diffreg", "paddq xmm0, xmm1", 100);
    }

    {
        std::shared_ptr<BenchmarkGroup> group = std::make_shared<BenchmarkGroup>("studies/movd", "movd weirdness");
        list.push_back(group);
        auto movd_maker = DeltaMaker<TIMER>(group.get()).setFeatures({AVX});

        movd_maker.template make<movd_xmm>("movd-xmm", "roundtrip mov + vpor xmm", 100);
        movd_maker.template make<movd_ymm>("movd-ymm", "roundtrip mov + vpor ymm", 100);
    }

    {
        std::shared_ptr<BenchmarkGroup> group = std::make_shared<BenchmarkGroup>("studies/repm", "repm");
        list.push_back(group);
        auto rep_maker = DeltaMaker<TIMER>(group.get(), 100);
        rep_maker = rep_maker.useLoopDelta();

        // NOTE(review): the kernel symbol is rep_movsb but the id/description say "stosb" -
        // confirm which instruction the kernel actually executes before renaming either.
        rep_maker.template make<rep_movsb>("stosb", "stosb to 1024 byte region", 100);
    }

    {
        std::shared_ptr<BenchmarkGroup> group = std::make_shared<BenchmarkGroup>("studies/nested", "Nested loop mispredicts");
        list.push_back(group);
        auto nested_maker = DeltaMaker<TIMER>(group.get());
        // maker = maker.useLoopDelta();

        // One benchmark per count in [1, 100]; the count is handed to the kernel via arg_object.
        // The loop variable is deliberately not called `iters` so that it does not shadow the
        // function-level constant of that name.
        for (uint32_t count = 1; count <= 100; count++) {
            auto id = fmt::format("nested-loop-{}", count);
            auto desc = fmt::format("nested loop with {:3} iterations", count);
            nested_maker.template make<nested_loop>(id, desc, 1, arg_object(count));
        }
    }

#endif // #if !UARCH_BENCH_PORTABLE
}
// Explicit instantiation of register_misc for one timer type. ALL_TIMERS_X (defined
// elsewhere) is handed this macro; presumably it applies it once per supported timer -
// confirm against its definition. The trailing ';' is part of the macro because the
// invocation below supplies none.
#define REG_DEFAULT(TIMER_TYPE) \
    template void register_misc<TIMER_TYPE>(GroupList&);

ALL_TIMERS_X(REG_DEFAULT)