1 | /* Measure various lock acquisition times for empty critical sections. |
2 | Copyright (C) 2020-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | #define TEST_MAIN |
20 | #define TEST_NAME "pthread-locks" |
21 | |
22 | #include <stdio.h> |
23 | #include <string.h> |
24 | #include <limits.h> |
25 | #include <stdlib.h> |
26 | #include <pthread.h> |
27 | #include <semaphore.h> |
28 | #include <stdatomic.h> |
29 | #include <sys/time.h> |
30 | #include <math.h> |
31 | #include "bench-timing.h" |
32 | #include "json-lib.h" |
33 | |
34 | /* The point of this benchmark is to measure the overhead of an empty |
35 | critical section or a small critical section. This is never going |
36 | to be indicative of real application performance. Instead we are |
37 | trying to benchmark the effects of the compiler and the runtime |
38 | coupled with a particular set of hardware atomic operations. |
   The numbers from this benchmark should be taken with a massive grain
   of salt and viewed through the eyes of expert reviewers. */
41 | |
/* Synchronization objects shared by all the tests below.  Each test
   (re)initializes the objects it uses before entering its timing
   loop.  */
static pthread_mutex_t m;
static pthread_rwlock_t rw;
static pthread_cond_t cv;
/* Condition variables for the producer/consumer handoff test.  */
static pthread_cond_t consumer_c, producer_c;
/* Flag telling the condvar helper thread to stop signalling.  */
static int cv_done;
static pthread_spinlock_t sp;
static sem_t sem;
49 | |
50 | typedef timing_t (*test_t)(long, int); |
51 | |
52 | #define START_ITERS 1000 |
53 | |
54 | #define FILLER_GOES_HERE \ |
55 | if (filler) \ |
56 | do_filler (); |
57 | |
58 | /* Everyone loves a good fibonacci series. This isn't quite one of |
59 | them because we need larger values in fewer steps, in a way that |
60 | won't be optimized away. We're looking to approximately double the |
61 | total time each test iteration takes, so as to not swamp the useful |
62 | timings. */ |
63 | |
64 | #pragma GCC push_options |
65 | #pragma GCC optimize(1) |
66 | |
/* Returns a fibonacci-like value.  Not the true series: the base
   case returns 10+i so values grow large in fewer recursion steps,
   approximately doubling the filler's cost as I increases.  The
   empty __asm__ plus the noinline attribute keep the compiler from
   collapsing the recursion into a constant.  (__asm__ rather than
   asm, so the function also compiles under strict ISO modes such as
   -std=c11 where the plain `asm' keyword is unavailable.)  */
static int __attribute__((noinline))
fibonacci (int i)
{
  __asm__ ("");
  if (i > 2)
    return fibonacci (i - 1) + fibonacci (i - 2);
  return 10 + i;
}
75 | |
/* A small amount of "work" to place inside a critical section: a
   short recursive computation plus a memcpy whose length depends on
   its result, compiled at -O1 so it is not optimized away.  */
static void
do_filler (void)
{
  static char dst[512], src[512];
  int len = fibonacci (5);
  memcpy (dst, src, len);
}
83 | |
84 | #pragma GCC pop_options |
85 | |
86 | static timing_t |
87 | test_mutex (long iters, int filler) |
88 | { |
89 | timing_t start, stop, cur; |
90 | |
91 | pthread_mutex_init (mutex: &m, NULL); |
92 | |
93 | TIMING_NOW (start); |
94 | for (long j = iters; j >= 0; --j) |
95 | { |
96 | pthread_mutex_lock (mutex: &m); |
97 | FILLER_GOES_HERE; |
98 | pthread_mutex_unlock (mutex: &m); |
99 | } |
100 | TIMING_NOW (stop); |
101 | TIMING_DIFF (cur, start, stop); |
102 | |
103 | return cur; |
104 | } |
105 | |
106 | static timing_t |
107 | test_mutex_trylock (long iters, int filler) |
108 | { |
109 | timing_t start, stop, cur; |
110 | |
111 | pthread_mutex_init (mutex: &m, NULL); |
112 | pthread_mutex_lock (mutex: &m); |
113 | |
114 | TIMING_NOW (start); |
115 | for (long j = iters; j >= 0; --j) |
116 | { |
117 | pthread_mutex_trylock (mutex: &m); |
118 | FILLER_GOES_HERE; |
119 | } |
120 | TIMING_NOW (stop); |
121 | TIMING_DIFF (cur, start, stop); |
122 | |
123 | pthread_mutex_unlock (mutex: &m); |
124 | return cur; |
125 | } |
126 | |
127 | static timing_t |
128 | test_rwlock_read (long iters, int filler) |
129 | { |
130 | timing_t start, stop, cur; |
131 | |
132 | pthread_rwlock_init (rwlock: &rw, NULL); |
133 | |
134 | TIMING_NOW (start); |
135 | for (long j = iters; j >= 0; --j) |
136 | { |
137 | pthread_rwlock_rdlock (rwlock: &rw); |
138 | FILLER_GOES_HERE; |
139 | pthread_rwlock_unlock (rwlock: &rw); |
140 | } |
141 | TIMING_NOW (stop); |
142 | TIMING_DIFF (cur, start, stop); |
143 | |
144 | return cur; |
145 | } |
146 | |
147 | static timing_t |
148 | test_rwlock_tryread (long iters, int filler) |
149 | { |
150 | timing_t start, stop, cur; |
151 | |
152 | pthread_rwlock_init (rwlock: &rw, NULL); |
153 | pthread_rwlock_wrlock (rwlock: &rw); |
154 | |
155 | TIMING_NOW (start); |
156 | for (long j = iters; j >= 0; --j) |
157 | { |
158 | pthread_rwlock_tryrdlock (rwlock: &rw); |
159 | FILLER_GOES_HERE; |
160 | } |
161 | TIMING_NOW (stop); |
162 | TIMING_DIFF (cur, start, stop); |
163 | |
164 | pthread_rwlock_unlock (rwlock: &rw); |
165 | return cur; |
166 | } |
167 | |
168 | static timing_t |
169 | test_rwlock_write (long iters, int filler) |
170 | { |
171 | timing_t start, stop, cur; |
172 | |
173 | pthread_rwlock_init (rwlock: &rw, NULL); |
174 | |
175 | TIMING_NOW (start); |
176 | for (long j = iters; j >= 0; --j) |
177 | { |
178 | pthread_rwlock_wrlock (rwlock: &rw); |
179 | FILLER_GOES_HERE; |
180 | pthread_rwlock_unlock (rwlock: &rw); |
181 | } |
182 | TIMING_NOW (stop); |
183 | TIMING_DIFF (cur, start, stop); |
184 | |
185 | return cur; |
186 | } |
187 | |
188 | static timing_t |
189 | test_rwlock_trywrite (long iters, int filler) |
190 | { |
191 | timing_t start, stop, cur; |
192 | |
193 | pthread_rwlock_init (rwlock: &rw, NULL); |
194 | pthread_rwlock_rdlock (rwlock: &rw); |
195 | |
196 | TIMING_NOW (start); |
197 | for (long j = iters; j >= 0; --j) |
198 | { |
199 | pthread_rwlock_trywrlock (rwlock: &rw); |
200 | FILLER_GOES_HERE; |
201 | } |
202 | TIMING_NOW (stop); |
203 | TIMING_DIFF (cur, start, stop); |
204 | |
205 | pthread_rwlock_unlock (rwlock: &rw); |
206 | return cur; |
207 | } |
208 | |
209 | static timing_t |
210 | test_spin_lock (long iters, int filler) |
211 | { |
212 | timing_t start, stop, cur; |
213 | |
214 | pthread_spin_init (lock: &sp, PTHREAD_PROCESS_PRIVATE); |
215 | |
216 | TIMING_NOW (start); |
217 | for (long j = iters; j >= 0; --j) |
218 | { |
219 | pthread_spin_lock (lock: &sp); |
220 | FILLER_GOES_HERE; |
221 | pthread_spin_unlock (lock: &sp); |
222 | } |
223 | TIMING_NOW (stop); |
224 | TIMING_DIFF (cur, start, stop); |
225 | |
226 | return cur; |
227 | } |
228 | |
229 | static timing_t |
230 | test_spin_trylock (long iters, int filler) |
231 | { |
232 | timing_t start, stop, cur; |
233 | |
234 | pthread_spin_init (lock: &sp, PTHREAD_PROCESS_PRIVATE); |
235 | pthread_spin_lock (lock: &sp); |
236 | |
237 | TIMING_NOW (start); |
238 | for (long j = iters; j >= 0; --j) |
239 | { |
240 | pthread_spin_trylock (lock: &sp); |
241 | FILLER_GOES_HERE; |
242 | } |
243 | TIMING_NOW (stop); |
244 | TIMING_DIFF (cur, start, stop); |
245 | |
246 | pthread_spin_unlock (lock: &sp); |
247 | return cur; |
248 | } |
249 | |
250 | static timing_t |
251 | test_sem_wait (long iters, int filler) |
252 | { |
253 | timing_t start, stop, cur; |
254 | |
255 | sem_init (sem: &sem, pshared: 0, value: 1); |
256 | |
257 | TIMING_NOW (start); |
258 | for (long j = iters; j >= 0; --j) |
259 | { |
260 | sem_post (sem: &sem); |
261 | FILLER_GOES_HERE; |
262 | sem_wait (sem: &sem); |
263 | } |
264 | TIMING_NOW (stop); |
265 | TIMING_DIFF (cur, start, stop); |
266 | |
267 | return cur; |
268 | } |
269 | |
270 | static timing_t |
271 | test_sem_trywait (long iters, int filler) |
272 | { |
273 | timing_t start, stop, cur; |
274 | |
275 | sem_init (sem: &sem, pshared: 0, value: 0); |
276 | |
277 | TIMING_NOW (start); |
278 | for (long j = iters; j >= 0; --j) |
279 | { |
280 | sem_trywait (sem: &sem); |
281 | FILLER_GOES_HERE; |
282 | } |
283 | TIMING_NOW (stop); |
284 | TIMING_DIFF (cur, start, stop); |
285 | |
286 | return cur; |
287 | } |
288 | |
289 | static void * |
290 | test_condvar_helper (void *v) |
291 | { |
292 | /* This is wasteful, but the alternative is to add the overhead of a |
293 | mutex lock/unlock to the overall iteration (both threads) and we |
294 | don't want that. Ideally, this thread would run on an |
295 | independent processing core anyway. The ONLY goal here is to |
296 | minimize the time the other thread spends waiting for us. */ |
297 | while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0) |
298 | pthread_cond_signal (cond: &cv); |
299 | |
300 | return NULL; |
301 | } |
302 | |
/* Measure pthread_cond_wait wake-up cost.  A helper thread spins
   calling pthread_cond_signal while this thread repeatedly waits on
   the condition variable; see test_condvar_helper for why the helper
   does not take the mutex.  */
static timing_t
test_condvar (long iters, int filler)
{
  timing_t start, stop, cur;
  pthread_t helper_id;

  pthread_mutex_init (&m, NULL);
  pthread_cond_init (&cv, NULL);
  pthread_mutex_lock (&m);

  /* Clear the stop flag before starting the helper so it keeps
     signalling for the whole timed loop.  */
  __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
  pthread_create (&helper_id, NULL, test_condvar_helper, &iters);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_cond_wait (&cv, &m);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  /* Stop the helper, then reap it.  */
  pthread_mutex_unlock (&m);
  __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);

  pthread_join (helper_id, NULL);
  return cur;
}
331 | |
/* How many items are "queued" in our pretend queue.  Shared between
   the producer and consumer threads, always accessed under M.  */
static int queued = 0;

/* Arguments handed to the producer thread, since pthread_create only
   passes a single pointer.  */
typedef struct Producer_Params {
  long iters;   /* Number of items to produce (actually iters+1).  */
  int filler;   /* Whether to run do_filler inside the critical section.  */
} Producer_Params;
339 | |
340 | /* We only benchmark the consumer thread, but both threads are doing |
341 | essentially the same thing, and never run in parallel due to the |
342 | locks. Thus, even if they run on separate processing cores, we |
343 | count the time for both threads. */ |
344 | static void * |
345 | test_producer_thread (void *v) |
346 | { |
347 | Producer_Params *p = (Producer_Params *) v; |
348 | long iters = p->iters; |
349 | int filler = p->filler; |
350 | long j; |
351 | |
352 | for (j = iters; j >= 0; --j) |
353 | { |
354 | /* Aquire lock on the queue. */ |
355 | pthread_mutex_lock (mutex: &m); |
356 | /* if something's already there, wait. */ |
357 | while (queued > 0) |
358 | pthread_cond_wait (cond: &consumer_c, mutex: &m); |
359 | |
360 | /* Put something on the queue */ |
361 | FILLER_GOES_HERE; |
362 | ++ queued; |
363 | pthread_cond_signal (cond: &producer_c); |
364 | |
365 | /* Give the other thread a chance to run. */ |
366 | pthread_mutex_unlock (mutex: &m); |
367 | } |
368 | |
369 | return NULL; |
370 | } |
371 | |
372 | static timing_t |
373 | test_consumer_producer (long iters, int filler) |
374 | { |
375 | timing_t start, stop, cur; |
376 | pthread_t helper_id; |
377 | Producer_Params p; |
378 | |
379 | p.iters = iters; |
380 | p.filler = filler; |
381 | |
382 | pthread_mutex_init (mutex: &m, NULL); |
383 | pthread_cond_init (cond: &cv, NULL); |
384 | |
385 | pthread_create (newthread: &helper_id, NULL, start_routine: test_producer_thread, arg: &p); |
386 | |
387 | TIMING_NOW (start); |
388 | |
389 | for (long j = iters; j >= 0; --j) |
390 | { |
391 | /* Aquire lock on the queue. */ |
392 | pthread_mutex_lock (mutex: &m); |
393 | /* Wait for something to be on the queue. */ |
394 | while (queued == 0) |
395 | pthread_cond_wait (cond: &producer_c, mutex: &m); |
396 | |
397 | /* Take if off. */ |
398 | FILLER_GOES_HERE; |
399 | -- queued; |
400 | pthread_cond_signal (cond: &consumer_c); |
401 | |
402 | /* Give the other thread a chance to run. */ |
403 | pthread_mutex_unlock (mutex: &m); |
404 | } |
405 | |
406 | TIMING_NOW (stop); |
407 | TIMING_DIFF (cur, start, stop); |
408 | |
409 | |
410 | pthread_join (th: helper_id, NULL); |
411 | return cur; |
412 | } |
413 | |
/* Number of runs we use for computing mean and standard deviation.
   We actually do two additional runs and discard the outliers.  */
#define RUN_COUNT 10

/* Run FUNC (named NAME, with FILLER passed through) RUN_COUNT+2
   times at a calibrated iteration count, then emit per-iteration
   mean/stdev/min/max statistics as one JSON object into JS.
   Always returns 0.  */
static int
do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
{
  timing_t cur;
  struct timeval ts, te;
  double tsd, ted, td;
  long iters, iters_limit;
  timing_t curs[RUN_COUNT + 2];   /* All runs, including the two outliers.  */
  int i, j;
  double mean, stdev;

  iters = START_ITERS;
  /* Cap that keeps the iters *= 10 growth below from overflowing.  */
  iters_limit = LONG_MAX / 100;

  /* Calibration loop: grow the iteration count until one run is long
     enough to time meaningfully.  */
  while (1) {
    gettimeofday (&ts, NULL);
    cur = func(iters, filler);
    gettimeofday (&te, NULL);

    /* We want a test to take at least 0.01 seconds, and try
       increasingly larger iteration counts until it does.  This
       allows for approximately constant-time tests regardless of
       hardware speed, without the overhead of checking the time
       inside the test loop itself.  We stop at a million iterations
       as that should be precise enough.  Once we determine a suitable
       iteration count, we run the test multiple times to calculate
       mean and standard deviation.  */

    /* Note that this also primes the CPU cache and triggers faster
       MHz, we hope.  */
    tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
    ted = te.tv_sec + te.tv_usec / 1000000.0;
    td = ted - tsd;
    if (td >= 0.01
	|| iters >= iters_limit
	|| iters >= 1000000)
      break;

    iters *= 10;
  }

  /* The final calibration run counts as the first sample.  */
  curs[0] = cur;
  for (i = 1; i < RUN_COUNT + 2; i ++)
    curs[i] = func(iters, filler);

  /* We sort the results so we can discard the fastest and slowest
     times as outliers.  In theory we should keep the fastest time,
     but IMHO this is more fair.  A simple bubble sort suffices.  */

  for (i = 0; i < RUN_COUNT + 1; i ++)
    for (j = i + 1; j < RUN_COUNT + 2; j ++)
      if (curs[i] > curs[j])
	{
	  timing_t temp = curs[i];
	  curs[i] = curs[j];
	  curs[j] = temp;
	}

  /* Now calculate mean and standard deviation, skipping the outliers
     (curs[0] and curs[RUN_COUNT+1]).  All statistics are normalized
     to per-iteration cost.  */
  mean = 0.0;
  for (i = 1; i<RUN_COUNT + 1; i ++)
    mean += (double) curs[i] / (double) iters;
  mean /= RUN_COUNT;

  stdev = 0.0;
  for (i = 1; i < RUN_COUNT + 1; i ++)
    {
      double s = (double) curs[i] / (double) iters - mean;
      stdev += s * s;
    }
  /* Sample standard deviation (divide by N-1).  */
  stdev = sqrt (stdev / (RUN_COUNT - 1));

  /* JSON key is "<name>-filler" or "<name>-empty".  */
  char buf[128];
  snprintf (buf, sizeof buf, "%s-%s", name, filler ? "filler" : "empty");

  json_attr_object_begin (js, buf);

  /* "duration" is the raw time of the last calibration run.  */
  json_attr_double (js, "duration", (double) cur);
  json_attr_double (js, "iterations", (double) iters);
  json_attr_double (js, "wall-sec", (double) td);
  json_attr_double (js, "mean", mean);
  json_attr_double (js, "stdev", stdev);
  json_attr_double (js, "min-outlier", (double) curs[0] / (double) iters);
  json_attr_double (js, "min", (double) curs[1] / (double) iters);
  json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
  json_attr_double (js, "max-outlier", (double) curs[RUN_COUNT + 1] / (double) iters);

  json_attr_object_end (js);

  return 0;
}
509 | |
510 | static int |
511 | do_bench_1 (const char *name, test_t func, json_ctx_t *js) |
512 | { |
513 | int rv = 0; |
514 | |
515 | rv += do_bench_2 (name, func, filler: 0, js); |
516 | rv += do_bench_2 (name, func, filler: 1, js); |
517 | |
518 | return rv; |
519 | } |
520 | |
/* Benchmark entry point: runs every lock benchmark and wraps the
   results in a single "pthread_locks" JSON object on stdout.
   Returns the accumulated result codes (currently always 0).  */
int
do_bench (void)
{
  int rv = 0;
  json_ctx_t json_ctx;

  json_init (&json_ctx, 2, stdout);
  json_attr_object_begin (&json_ctx, "pthread_locks");

/* Expands to a do_bench_1 call for test_<n>, using the stringified
   name as the JSON key prefix.  */
#define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)

  BENCH (mutex);
  BENCH (mutex_trylock);
  BENCH (rwlock_read);
  BENCH (rwlock_tryread);
  BENCH (rwlock_write);
  BENCH (rwlock_trywrite);
  BENCH (spin_lock);
  BENCH (spin_trylock);
  BENCH (sem_wait);
  BENCH (sem_trywait);
  BENCH (condvar);
  BENCH (consumer_producer);

  json_attr_object_end (&json_ctx);

  return rv;
}
549 | |
550 | |
551 | #define TEST_FUNCTION do_bench () |
552 | |
553 | #include "../test-skeleton.c" |
554 | |