1 | /* Measure various lock acquisition times for empty critical sections. |
2 | Copyright (C) 2020-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #define TEST_MAIN |
20 | #define TEST_NAME "pthread-locks" |
21 | |
22 | #include <stdio.h> |
23 | #include <string.h> |
24 | #include <limits.h> |
25 | #include <stdlib.h> |
26 | #include <pthread.h> |
27 | #include <semaphore.h> |
28 | #include <stdatomic.h> |
29 | #include <sys/time.h> |
30 | #include <math.h> |
31 | #include "bench-timing.h" |
32 | #include "json-lib.h" |
33 | |
34 | /* The point of this benchmark is to measure the overhead of an empty |
35 | critical section or a small critical section. This is never going |
36 | to be indicative of real application performance. Instead we are |
37 | trying to benchmark the effects of the compiler and the runtime |
38 | coupled with a particular set of hardware atomic operations. |
   The numbers from this benchmark should be taken with a massive grain
   of salt and viewed through the eyes of expert reviewers. */
41 | |
/* Synchronization objects shared by all the tests below.  Each test
   (re)initializes the objects it uses before entering its timing
   loop.  */
static pthread_mutex_t m;
static pthread_rwlock_t rw;
static pthread_cond_t cv;
/* Condition variables for the producer/consumer handoff test.  */
static pthread_cond_t consumer_c, producer_c;
/* Flag telling the condvar helper thread to stop signalling.  */
static int cv_done;
static pthread_spinlock_t sp;
static sem_t sem;
49 | |
50 | typedef timing_t (*test_t)(long, int); |
51 | |
52 | #define START_ITERS 1000 |
53 | |
54 | #define FILLER_GOES_HERE \ |
55 | if (filler) \ |
56 | do_filler (); |
57 | |
58 | /* Everyone loves a good fibonacci series. This isn't quite one of |
59 | them because we need larger values in fewer steps, in a way that |
60 | won't be optimized away. We're looking to approximately double the |
61 | total time each test iteration takes, so as to not swamp the useful |
62 | timings. */ |
63 | |
64 | #pragma GCC push_options |
65 | #pragma GCC optimize(1) |
66 | |
/* Returns a fibonacci-like value.  Not the true series: the base
   case returns 10+i so values grow large in fewer recursion steps,
   approximately doubling the filler's cost as I increases.  The
   empty __asm__ plus the noinline attribute keep the compiler from
   collapsing the recursion into a constant.  (__asm__ rather than
   asm, so the function also compiles under strict ISO modes such as
   -std=c11 where the plain `asm' keyword is unavailable.)  */
static int __attribute__((noinline))
fibonacci (int i)
{
  __asm__ ("");
  if (i > 2)
    return fibonacci (i - 1) + fibonacci (i - 2);
  return 10 + i;
}
75 | |
/* A small amount of "work" to place inside a critical section: a
   short recursive computation plus a memcpy whose length depends on
   its result, compiled at -O1 so it is not optimized away.  */
static void
do_filler (void)
{
  static char dst[512], src[512];
  int len = fibonacci (5);
  memcpy (dst, src, len);
}
83 | |
84 | #pragma GCC pop_options |
85 | |
86 | static timing_t |
87 | test_mutex (long iters, int filler) |
88 | { |
89 | timing_t start, stop, cur; |
90 | |
91 | pthread_mutex_init (mutex: &m, NULL); |
92 | |
93 | TIMING_NOW (start); |
94 | for (long j = iters; j >= 0; --j) |
95 | { |
96 | pthread_mutex_lock (mutex: &m); |
97 | FILLER_GOES_HERE; |
98 | pthread_mutex_unlock (mutex: &m); |
99 | } |
100 | TIMING_NOW (stop); |
101 | TIMING_DIFF (cur, start, stop); |
102 | |
103 | return cur; |
104 | } |
105 | |
106 | static timing_t |
107 | test_mutex_trylock (long iters, int filler) |
108 | { |
109 | timing_t start, stop, cur; |
110 | |
111 | pthread_mutex_init (mutex: &m, NULL); |
112 | pthread_mutex_lock (mutex: &m); |
113 | |
114 | TIMING_NOW (start); |
115 | for (long j = iters; j >= 0; --j) |
116 | { |
117 | pthread_mutex_trylock (mutex: &m); |
118 | FILLER_GOES_HERE; |
119 | } |
120 | TIMING_NOW (stop); |
121 | TIMING_DIFF (cur, start, stop); |
122 | |
123 | pthread_mutex_unlock (mutex: &m); |
124 | return cur; |
125 | } |
126 | |
127 | static timing_t |
128 | test_rwlock_read (long iters, int filler) |
129 | { |
130 | timing_t start, stop, cur; |
131 | |
132 | pthread_rwlock_init (rwlock: &rw, NULL); |
133 | |
134 | TIMING_NOW (start); |
135 | for (long j = iters; j >= 0; --j) |
136 | { |
137 | pthread_rwlock_rdlock (rwlock: &rw); |
138 | FILLER_GOES_HERE; |
139 | pthread_rwlock_unlock (rwlock: &rw); |
140 | } |
141 | TIMING_NOW (stop); |
142 | TIMING_DIFF (cur, start, stop); |
143 | |
144 | return cur; |
145 | } |
146 | |
147 | static timing_t |
148 | test_rwlock_tryread (long iters, int filler) |
149 | { |
150 | timing_t start, stop, cur; |
151 | |
152 | pthread_rwlock_init (rwlock: &rw, NULL); |
153 | pthread_rwlock_wrlock (rwlock: &rw); |
154 | |
155 | TIMING_NOW (start); |
156 | for (long j = iters; j >= 0; --j) |
157 | { |
158 | pthread_rwlock_tryrdlock (rwlock: &rw); |
159 | FILLER_GOES_HERE; |
160 | } |
161 | TIMING_NOW (stop); |
162 | TIMING_DIFF (cur, start, stop); |
163 | |
164 | pthread_rwlock_unlock (rwlock: &rw); |
165 | return cur; |
166 | } |
167 | |
168 | static timing_t |
169 | test_rwlock_write (long iters, int filler) |
170 | { |
171 | timing_t start, stop, cur; |
172 | |
173 | pthread_rwlock_init (rwlock: &rw, NULL); |
174 | |
175 | TIMING_NOW (start); |
176 | for (long j = iters; j >= 0; --j) |
177 | { |
178 | pthread_rwlock_wrlock (rwlock: &rw); |
179 | FILLER_GOES_HERE; |
180 | pthread_rwlock_unlock (rwlock: &rw); |
181 | } |
182 | TIMING_NOW (stop); |
183 | TIMING_DIFF (cur, start, stop); |
184 | |
185 | return cur; |
186 | } |
187 | |
188 | static timing_t |
189 | test_rwlock_trywrite (long iters, int filler) |
190 | { |
191 | timing_t start, stop, cur; |
192 | |
193 | pthread_rwlock_init (rwlock: &rw, NULL); |
194 | pthread_rwlock_rdlock (rwlock: &rw); |
195 | |
196 | TIMING_NOW (start); |
197 | for (long j = iters; j >= 0; --j) |
198 | { |
199 | pthread_rwlock_trywrlock (rwlock: &rw); |
200 | FILLER_GOES_HERE; |
201 | } |
202 | TIMING_NOW (stop); |
203 | TIMING_DIFF (cur, start, stop); |
204 | |
205 | pthread_rwlock_unlock (rwlock: &rw); |
206 | return cur; |
207 | } |
208 | |
209 | static timing_t |
210 | test_spin_lock (long iters, int filler) |
211 | { |
212 | timing_t start, stop, cur; |
213 | |
214 | pthread_spin_init (lock: &sp, PTHREAD_PROCESS_PRIVATE); |
215 | |
216 | TIMING_NOW (start); |
217 | for (long j = iters; j >= 0; --j) |
218 | { |
219 | pthread_spin_lock (lock: &sp); |
220 | FILLER_GOES_HERE; |
221 | pthread_spin_unlock (lock: &sp); |
222 | } |
223 | TIMING_NOW (stop); |
224 | TIMING_DIFF (cur, start, stop); |
225 | |
226 | return cur; |
227 | } |
228 | |
229 | static timing_t |
230 | test_spin_trylock (long iters, int filler) |
231 | { |
232 | timing_t start, stop, cur; |
233 | |
234 | pthread_spin_init (lock: &sp, PTHREAD_PROCESS_PRIVATE); |
235 | pthread_spin_lock (lock: &sp); |
236 | |
237 | TIMING_NOW (start); |
238 | for (long j = iters; j >= 0; --j) |
239 | { |
240 | pthread_spin_trylock (lock: &sp); |
241 | FILLER_GOES_HERE; |
242 | } |
243 | TIMING_NOW (stop); |
244 | TIMING_DIFF (cur, start, stop); |
245 | |
246 | pthread_spin_unlock (lock: &sp); |
247 | return cur; |
248 | } |
249 | |
250 | static timing_t |
251 | test_sem_wait (long iters, int filler) |
252 | { |
253 | timing_t start, stop, cur; |
254 | |
255 | sem_init (sem: &sem, pshared: 0, value: 1); |
256 | |
257 | TIMING_NOW (start); |
258 | for (long j = iters; j >= 0; --j) |
259 | { |
260 | sem_post (sem: &sem); |
261 | FILLER_GOES_HERE; |
262 | sem_wait (sem: &sem); |
263 | } |
264 | TIMING_NOW (stop); |
265 | TIMING_DIFF (cur, start, stop); |
266 | |
267 | return cur; |
268 | } |
269 | |
270 | static timing_t |
271 | test_sem_trywait (long iters, int filler) |
272 | { |
273 | timing_t start, stop, cur; |
274 | |
275 | sem_init (sem: &sem, pshared: 0, value: 0); |
276 | |
277 | TIMING_NOW (start); |
278 | for (long j = iters; j >= 0; --j) |
279 | { |
280 | sem_trywait (sem: &sem); |
281 | FILLER_GOES_HERE; |
282 | } |
283 | TIMING_NOW (stop); |
284 | TIMING_DIFF (cur, start, stop); |
285 | |
286 | return cur; |
287 | } |
288 | |
289 | static void * |
290 | test_condvar_helper (void *v) |
291 | { |
292 | /* This is wasteful, but the alternative is to add the overhead of a |
293 | mutex lock/unlock to the overall iteration (both threads) and we |
294 | don't want that. Ideally, this thread would run on an |
295 | independent processing core anyway. The ONLY goal here is to |
296 | minimize the time the other thread spends waiting for us. */ |
297 | while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0) |
298 | pthread_cond_signal (cond: &cv); |
299 | |
300 | return NULL; |
301 | } |
302 | |
/* Measure pthread_cond_wait wake-up cost.  A helper thread spins
   calling pthread_cond_signal while this thread repeatedly waits on
   the condition variable; see test_condvar_helper for why the helper
   does not take the mutex.  */
static timing_t
test_condvar (long iters, int filler)
{
  timing_t start, stop, cur;
  pthread_t helper_id;

  pthread_mutex_init (&m, NULL);
  pthread_cond_init (&cv, NULL);
  pthread_mutex_lock (&m);

  /* Clear the stop flag before starting the helper so it keeps
     signalling for the whole timed loop.  */
  __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
  pthread_create (&helper_id, NULL, test_condvar_helper, &iters);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_cond_wait (&cv, &m);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  /* Stop the helper, then reap it.  */
  pthread_mutex_unlock (&m);
  __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);

  pthread_join (helper_id, NULL);
  return cur;
}
331 | |
/* How many items are "queued" in our pretend queue.  Shared between
   the producer and consumer threads, always accessed under M.  */
static int queued = 0;

/* Arguments handed to the producer thread, since pthread_create only
   passes a single pointer.  */
typedef struct Producer_Params {
  long iters;   /* Number of items to produce (actually iters+1).  */
  int filler;   /* Whether to run do_filler inside the critical section.  */
} Producer_Params;
339 | |
340 | /* We only benchmark the consumer thread, but both threads are doing |
341 | essentially the same thing, and never run in parallel due to the |
342 | locks. Thus, even if they run on separate processing cores, we |
343 | count the time for both threads. */ |
344 | static void * |
345 | test_producer_thread (void *v) |
346 | { |
347 | Producer_Params *p = (Producer_Params *) v; |
348 | long iters = p->iters; |
349 | int filler = p->filler; |
350 | long j; |
351 | |
352 | for (j = iters; j >= 0; --j) |
353 | { |
354 | /* Aquire lock on the queue. */ |
355 | pthread_mutex_lock (mutex: &m); |
356 | /* if something's already there, wait. */ |
357 | while (queued > 0) |
358 | pthread_cond_wait (cond: &consumer_c, mutex: &m); |
359 | |
360 | /* Put something on the queue */ |
361 | FILLER_GOES_HERE; |
362 | ++ queued; |
363 | pthread_cond_signal (cond: &producer_c); |
364 | |
365 | /* Give the other thread a chance to run. */ |
366 | pthread_mutex_unlock (mutex: &m); |
367 | } |
368 | |
369 | return NULL; |
370 | } |
371 | |
372 | static timing_t |
373 | test_consumer_producer (long iters, int filler) |
374 | { |
375 | timing_t start, stop, cur; |
376 | pthread_t helper_id; |
377 | Producer_Params p; |
378 | |
379 | p.iters = iters; |
380 | p.filler = filler; |
381 | |
382 | pthread_mutex_init (mutex: &m, NULL); |
383 | pthread_cond_init (cond: &cv, NULL); |
384 | |
385 | pthread_create (newthread: &helper_id, NULL, start_routine: test_producer_thread, arg: &p); |
386 | |
387 | TIMING_NOW (start); |
388 | |
389 | for (long j = iters; j >= 0; --j) |
390 | { |
391 | /* Aquire lock on the queue. */ |
392 | pthread_mutex_lock (mutex: &m); |
393 | /* Wait for something to be on the queue. */ |
394 | while (queued == 0) |
395 | pthread_cond_wait (cond: &producer_c, mutex: &m); |
396 | |
397 | /* Take if off. */ |
398 | FILLER_GOES_HERE; |
399 | -- queued; |
400 | pthread_cond_signal (cond: &consumer_c); |
401 | |
402 | /* Give the other thread a chance to run. */ |
403 | pthread_mutex_unlock (mutex: &m); |
404 | } |
405 | |
406 | TIMING_NOW (stop); |
407 | TIMING_DIFF (cur, start, stop); |
408 | |
409 | |
410 | pthread_join (th: helper_id, NULL); |
411 | return cur; |
412 | } |
413 | |
/* Number of runs we use for computing mean and standard deviation.
   We actually do two additional runs and discard the outliers.  */
#define RUN_COUNT 10

/* Run FUNC (named NAME, with FILLER passed through) RUN_COUNT+2
   times at a calibrated iteration count, then emit per-iteration
   mean/stdev/min/max statistics as one JSON object into JS.
   Always returns 0.  */
static int
do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
{
  timing_t cur;
  struct timeval ts, te;
  double tsd, ted, td;
  long iters, iters_limit;
  timing_t curs[RUN_COUNT + 2];   /* All runs, including the two outliers.  */
  int i, j;
  double mean, stdev;

  iters = START_ITERS;
  /* Cap that keeps the iters *= 10 growth below from overflowing.  */
  iters_limit = LONG_MAX / 100;

  /* Calibration loop: grow the iteration count until one run is long
     enough to time meaningfully.  */
  while (1) {
    gettimeofday (&ts, NULL);
    cur = func(iters, filler);
    gettimeofday (&te, NULL);

    /* We want a test to take at least 0.01 seconds, and try
       increasingly larger iteration counts until it does.  This
       allows for approximately constant-time tests regardless of
       hardware speed, without the overhead of checking the time
       inside the test loop itself.  We stop at a million iterations
       as that should be precise enough.  Once we determine a suitable
       iteration count, we run the test multiple times to calculate
       mean and standard deviation.  */

    /* Note that this also primes the CPU cache and triggers faster
       MHz, we hope.  */
    tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
    ted = te.tv_sec + te.tv_usec / 1000000.0;
    td = ted - tsd;
    if (td >= 0.01
	|| iters >= iters_limit
	|| iters >= 1000000)
      break;

    iters *= 10;
  }

  /* The final calibration run counts as the first sample.  */
  curs[0] = cur;
  for (i = 1; i < RUN_COUNT + 2; i ++)
    curs[i] = func(iters, filler);

  /* We sort the results so we can discard the fastest and slowest
     times as outliers.  In theory we should keep the fastest time,
     but IMHO this is more fair.  A simple bubble sort suffices.  */

  for (i = 0; i < RUN_COUNT + 1; i ++)
    for (j = i + 1; j < RUN_COUNT + 2; j ++)
      if (curs[i] > curs[j])
	{
	  timing_t temp = curs[i];
	  curs[i] = curs[j];
	  curs[j] = temp;
	}

  /* Now calculate mean and standard deviation, skipping the outliers
     (curs[0] and curs[RUN_COUNT+1]).  All statistics are normalized
     to per-iteration cost.  */
  mean = 0.0;
  for (i = 1; i<RUN_COUNT + 1; i ++)
    mean += (double) curs[i] / (double) iters;
  mean /= RUN_COUNT;

  stdev = 0.0;
  for (i = 1; i < RUN_COUNT + 1; i ++)
    {
      double s = (double) curs[i] / (double) iters - mean;
      stdev += s * s;
    }
  /* Sample standard deviation (divide by N-1).  */
  stdev = sqrt (stdev / (RUN_COUNT - 1));

  /* JSON key is "<name>-filler" or "<name>-empty".  */
  char buf[128];
  snprintf (buf, sizeof buf, "%s-%s", name, filler ? "filler" : "empty");

  json_attr_object_begin (js, buf);

  /* "duration" is the raw time of the last calibration run.  */
  json_attr_double (js, "duration", (double) cur);
  json_attr_double (js, "iterations", (double) iters);
  json_attr_double (js, "wall-sec", (double) td);
  json_attr_double (js, "mean", mean);
  json_attr_double (js, "stdev", stdev);
  json_attr_double (js, "min-outlier", (double) curs[0] / (double) iters);
  json_attr_double (js, "min", (double) curs[1] / (double) iters);
  json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
  json_attr_double (js, "max-outlier", (double) curs[RUN_COUNT + 1] / (double) iters);

  json_attr_object_end (js);

  return 0;
}
509 | |
510 | static int |
511 | do_bench_1 (const char *name, test_t func, json_ctx_t *js) |
512 | { |
513 | int rv = 0; |
514 | |
515 | rv += do_bench_2 (name, func, filler: 0, js); |
516 | rv += do_bench_2 (name, func, filler: 1, js); |
517 | |
518 | return rv; |
519 | } |
520 | |
/* Benchmark entry point: runs every lock benchmark and wraps the
   results in a single "pthread_locks" JSON object on stdout.
   Returns the accumulated result codes (currently always 0).  */
int
do_bench (void)
{
  int rv = 0;
  json_ctx_t json_ctx;

  json_init (&json_ctx, 2, stdout);
  json_attr_object_begin (&json_ctx, "pthread_locks");

/* Expands to a do_bench_1 call for test_<n>, using the stringified
   name as the JSON key prefix.  */
#define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)

  BENCH (mutex);
  BENCH (mutex_trylock);
  BENCH (rwlock_read);
  BENCH (rwlock_tryread);
  BENCH (rwlock_write);
  BENCH (rwlock_trywrite);
  BENCH (spin_lock);
  BENCH (spin_trylock);
  BENCH (sem_wait);
  BENCH (sem_trywait);
  BENCH (condvar);
  BENCH (consumer_producer);

  json_attr_object_end (&json_ctx);

  return rv;
}
549 | |
550 | |
551 | #define TEST_FUNCTION do_bench () |
552 | |
553 | #include "../test-skeleton.c" |
554 | |