-
Notifications
You must be signed in to change notification settings - Fork 26
/
Copy pathtest-spinlock.c
226 lines (199 loc) · 6.24 KB
/
test-spinlock.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/time.h>
#include <errno.h>
#ifdef XCHG
#include "spinlock-xchg.h"
#elif defined(XCHGBACKOFF)
#include "spinlock-xchg-backoff.h"
#elif defined(K42)
#include "spinlock-k42.h"
#elif defined(MCS)
#include "spinlock-mcs.h"
#elif defined(TICKET)
#include "spinlock-ticket.h"
#elif defined(PTHREAD)
#include "spinlock-pthread.h"
#elif defined(CMPXCHG)
#include "spinlock-cmpxchg.h"
#elif defined(RTM)
#include "spinlock-xchg.h"
#include "rtm.h"
#elif defined(HLE)
#include "spinlock-xchg-hle.h"
#else
#error "must define a spinlock implementation"
#endif
#ifndef cpu_relax
#define cpu_relax() asm volatile("pause\n": : :"memory")
#endif
/* It's hard to say which spinlock implementation performs best. I guess the
* performance depends on CPU topology which will affect the cache coherence
* messages, and maybe other factors.
*
* The result of binding threads to the same physical core shows that when
* threads are on a single physical CPU, contention will not cause severe
 * performance degradation. But there's one exception, cmpxchg. Its performance
 * will degrade no matter which physical CPU the threads reside on.
*
* Here's the result on Dell R910 server (4 CPUs, 10 cores each), with Intel(R)
* Xeon(R) CPU E7- 4850 @ 2.00GHz
*
* - pthread_mutex BEATS ALL when there's more than 2 cores running this
* benchmark! Not sure what's the trick used by pthread_mutex.
*
* - For spinlock with cmpxchg, the performance degrades very fast with the
* increase of threads. Seems that it does not have good scalability.
*
* - For spinlock with xchg, it has much better scalability than cmpxchg, but it's
* slower when there's 2 and 4 cores.
*
* - For K42, The case for 2 threads is extremely bad, for other number of
* threads, the performance is stable and shows good scalability.
*
* - MCS spinlock has similar performance with K42, and does not have the
* extremely bad 2 thread case.
*
* - Ticket spinlock actually performs very badly.
*/
/* Number of total lock/unlock pair.
 * Note we need to ensure the total number of lock/unlock operations is the
* same no matter how many threads are used. */
#define N_PAIR 16000000
/* Bind threads to specific cores. The goal is to make threads locate on the
* same physical CPU. Modify bind_core before using this. */
//#define BIND_CORE
static int nthr = 0;
static volatile uint32_t wflag;
/* Wait on a flag to make all threads start almost at the same time. */
void wait_flag(volatile uint32_t *flag, uint32_t expect) {
__sync_fetch_and_add((uint32_t *)flag, 1);
while (*flag != expect) {
cpu_relax();
}
}
static struct timeval start_time;
static struct timeval end_time;
static void calc_time(struct timeval *start, struct timeval *end) {
if (end->tv_usec < start->tv_usec) {
end->tv_sec -= 1;
end->tv_usec += 1000000;
}
assert(end->tv_sec >= start->tv_sec);
assert(end->tv_usec >= start->tv_usec);
struct timeval interval = {
end->tv_sec - start->tv_sec,
end->tv_usec - start->tv_usec
};
printf("%ld.%06ld\t", (long)interval.tv_sec, (long)interval.tv_usec);
}
// Use an array of counter to see effect on RTM if touches more cache line.
#define NCOUNTER 1
#define CACHE_LINE 64
// Use thread local counter to avoid cache contention between cores.
// For TSX, this avoids TX conflicts so the performance overhead/improvement is
// due to TSX mechanism.
static __thread int8_t counter[CACHE_LINE*NCOUNTER];
#ifdef MCS
mcs_lock cnt_lock = NULL;
#else
spinlock sl;
#endif
#ifdef BIND_CORE
void bind_core(int threadid) {
/* cores with logical id 4x is on CPU physical id 0 */
/* cores with logical id 4x+1 is on CPU physical id 1 */
int phys_id = threadid / 10;
int core = threadid % 10;
int logical_id = 4 * core + phys_id;
/*printf("thread %d bind to logical core %d on physical id %d\n", threadid, logical_id, phys_id);*/
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(logical_id, &set);
if (sched_setaffinity(0, sizeof(set), &set) != 0) {
perror("Set affinity failed");
exit(EXIT_FAILURE);
}
}
#endif
void *inc_thread(void *id) {
int n = N_PAIR / nthr;
assert(n * nthr == N_PAIR);
#ifdef MCS
mcs_lock_t local_lock;
#endif
#ifdef BIND_CORE
bind_core((int)(long)(id));
#endif
wait_flag(&wflag, nthr);
if (((long) id == 0)) {
/*printf("get start time\n");*/
gettimeofday(&start_time, NULL);
}
/* Start lock unlock test. */
for (int i = 0; i < n; i++) {
#ifdef MCS
lock_mcs(&cnt_lock, &local_lock);
for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++;
unlock_mcs(&cnt_lock, &local_lock);
#elif RTM
int status;
if ((status = _xbegin()) == _XBEGIN_STARTED) {
for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++;
if (sl == BUSY)
_xabort(1);
_xend();
} else {
spin_lock(&sl);
for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++;
spin_unlock(&sl);
}
#else
spin_lock(&sl);
for (int j = 0; j < NCOUNTER; j++) counter[j*CACHE_LINE]++;
spin_unlock(&sl);
#endif
}
if (__sync_fetch_and_add((uint32_t *)&wflag, -1) == 1) {
/*printf("get end time\n");*/
gettimeofday(&end_time, NULL);
}
return NULL;
}
int main(int argc, const char *argv[])
{
pthread_t *thr;
int ret = 0;
if (argc != 2) {
printf("Usage: %s <num of threads>\n", argv[0]);
exit(1);
}
nthr = atoi(argv[1]);
/*printf("using %d threads\n", nthr);*/
thr = calloc(sizeof(*thr), nthr);
// Start thread
for (long i = 0; i < nthr; i++) {
if (pthread_create(&thr[i], NULL, inc_thread, (void *)i) != 0) {
perror("thread creating failed");
}
}
// join thread
for (long i = 0; i < nthr; i++)
pthread_join(thr[i], NULL);
calc_time(&start_time, &end_time);
/*
*for (int i = 0; i < NCOUNTER; i++) {
* if (counter[i] == N_PAIR) {
* } else {
* printf("counter %d error\n", i);
* ret = 1;
* }
*}
*/
return ret;
}