1 /* Measure various lock acquisition times for empty critical sections.
2 Copyright (C) 2020-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #define TEST_MAIN
20 #define TEST_NAME "pthread-locks"
21
22 #include <stdio.h>
23 #include <string.h>
24 #include <limits.h>
25 #include <stdlib.h>
26 #include <pthread.h>
27 #include <semaphore.h>
28 #include <stdatomic.h>
29 #include <sys/time.h>
30 #include <math.h>
31 #include "bench-timing.h"
32 #include "json-lib.h"
33
34 /* The point of this benchmark is to measure the overhead of an empty
35 critical section or a small critical section. This is never going
36 to be indicative of real application performance. Instead we are
37 trying to benchmark the effects of the compiler and the runtime
38 coupled with a particular set of hardware atomic operations.
   The numbers from this benchmark should be taken with a massive grain
   of salt and viewed through the eyes of expert reviewers.  */
41
42 static pthread_mutex_t m;
43 static pthread_rwlock_t rw;
44 static pthread_cond_t cv;
45 static pthread_cond_t consumer_c, producer_c;
46 static int cv_done;
47 static pthread_spinlock_t sp;
48 static sem_t sem;
49
50 typedef timing_t (*test_t)(long, int);
51
52 #define START_ITERS 1000
53
/* Optionally add a small amount of work inside the timed region.
   FILLER is a parameter of every function that expands this macro.
   The do/while (0) wrapper makes the expansion behave as a single
   statement, so it is safe even in unbraced if/else bodies.  */
#define FILLER_GOES_HERE \
  do \
    { \
      if (filler) \
        do_filler (); \
    } \
  while (0)
57
58 /* Everyone loves a good fibonacci series. This isn't quite one of
59 them because we need larger values in fewer steps, in a way that
60 won't be optimized away. We're looking to approximately double the
61 total time each test iteration takes, so as to not swamp the useful
62 timings. */
63
64 #pragma GCC push_options
65 #pragma GCC optimize(1)
66
67 static int __attribute__((noinline))
fibonacci(int i)68 fibonacci (int i)
69 {
70 asm("");
71 if (i > 2)
72 return fibonacci (i-1) + fibonacci (i-2);
73 return 10+i;
74 }
75
static void
do_filler (void)
{
  /* Copy a run-time-computed number of bytes between two static
     buffers.  The byte count comes from a noinline call so the
     compiler cannot optimize the work away.  */
  static char dst[512], src[512];
  memcpy (dst, src, fibonacci (5));
}
83
84 #pragma GCC pop_options
85
86 static timing_t
test_mutex(long iters,int filler)87 test_mutex (long iters, int filler)
88 {
89 timing_t start, stop, cur;
90
91 pthread_mutex_init (&m, NULL);
92
93 TIMING_NOW (start);
94 for (long j = iters; j >= 0; --j)
95 {
96 pthread_mutex_lock (&m);
97 FILLER_GOES_HERE;
98 pthread_mutex_unlock (&m);
99 }
100 TIMING_NOW (stop);
101 TIMING_DIFF (cur, start, stop);
102
103 return cur;
104 }
105
/* Time pthread_mutex_trylock on a mutex that is already held, so
   every trylock in the loop fails; this measures the failure path
   only (no matching unlock is needed inside the loop).  The loop
   executes ITERS+1 times.  */
static timing_t
test_mutex_trylock (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_mutex_init (&m, NULL);
  /* Hold the lock for the whole timed region.  */
  pthread_mutex_lock (&m);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_mutex_trylock (&m);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_mutex_unlock (&m);
  return cur;
}
126
127 static timing_t
test_rwlock_read(long iters,int filler)128 test_rwlock_read (long iters, int filler)
129 {
130 timing_t start, stop, cur;
131
132 pthread_rwlock_init (&rw, NULL);
133
134 TIMING_NOW (start);
135 for (long j = iters; j >= 0; --j)
136 {
137 pthread_rwlock_rdlock (&rw);
138 FILLER_GOES_HERE;
139 pthread_rwlock_unlock (&rw);
140 }
141 TIMING_NOW (stop);
142 TIMING_DIFF (cur, start, stop);
143
144 return cur;
145 }
146
/* Time pthread_rwlock_tryrdlock while a write lock is held, so every
   tryrdlock in the loop fails; this measures the failure path only.
   The loop executes ITERS+1 times.  */
static timing_t
test_rwlock_tryread (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_rwlock_init (&rw, NULL);
  /* Hold the write lock for the whole timed region so read attempts
     never succeed.  */
  pthread_rwlock_wrlock (&rw);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_rwlock_tryrdlock (&rw);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_rwlock_unlock (&rw);
  return cur;
}
167
168 static timing_t
test_rwlock_write(long iters,int filler)169 test_rwlock_write (long iters, int filler)
170 {
171 timing_t start, stop, cur;
172
173 pthread_rwlock_init (&rw, NULL);
174
175 TIMING_NOW (start);
176 for (long j = iters; j >= 0; --j)
177 {
178 pthread_rwlock_wrlock (&rw);
179 FILLER_GOES_HERE;
180 pthread_rwlock_unlock (&rw);
181 }
182 TIMING_NOW (stop);
183 TIMING_DIFF (cur, start, stop);
184
185 return cur;
186 }
187
/* Time pthread_rwlock_trywrlock while a read lock is held, so every
   trywrlock in the loop fails; this measures the failure path only.
   The loop executes ITERS+1 times.  */
static timing_t
test_rwlock_trywrite (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_rwlock_init (&rw, NULL);
  /* Hold a read lock for the whole timed region so write attempts
     never succeed.  */
  pthread_rwlock_rdlock (&rw);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_rwlock_trywrlock (&rw);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_rwlock_unlock (&rw);
  return cur;
}
208
209 static timing_t
test_spin_lock(long iters,int filler)210 test_spin_lock (long iters, int filler)
211 {
212 timing_t start, stop, cur;
213
214 pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
215
216 TIMING_NOW (start);
217 for (long j = iters; j >= 0; --j)
218 {
219 pthread_spin_lock (&sp);
220 FILLER_GOES_HERE;
221 pthread_spin_unlock (&sp);
222 }
223 TIMING_NOW (stop);
224 TIMING_DIFF (cur, start, stop);
225
226 return cur;
227 }
228
/* Time pthread_spin_trylock on a spinlock that is already held, so
   every trylock in the loop fails; this measures the failure path
   only.  The loop executes ITERS+1 times.  */
static timing_t
test_spin_trylock (long iters, int filler)
{
  timing_t start, stop, cur;

  pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
  /* Hold the lock for the whole timed region.  */
  pthread_spin_lock (&sp);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_spin_trylock (&sp);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_spin_unlock (&sp);
  return cur;
}
249
250 static timing_t
test_sem_wait(long iters,int filler)251 test_sem_wait (long iters, int filler)
252 {
253 timing_t start, stop, cur;
254
255 sem_init (&sem, 0, 1);
256
257 TIMING_NOW (start);
258 for (long j = iters; j >= 0; --j)
259 {
260 sem_post (&sem);
261 FILLER_GOES_HERE;
262 sem_wait (&sem);
263 }
264 TIMING_NOW (stop);
265 TIMING_DIFF (cur, start, stop);
266
267 return cur;
268 }
269
/* Time sem_trywait on a semaphore whose count is zero, so every
   trywait in the loop fails; this measures the failure path only.
   The loop executes ITERS+1 times.  */
static timing_t
test_sem_trywait (long iters, int filler)
{
  timing_t start, stop, cur;

  /* Initial count of zero guarantees trywait never succeeds.  */
  sem_init (&sem, 0, 0);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      sem_trywait (&sem);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  return cur;
}
288
/* Helper thread for test_condvar: signal CV in a tight loop until
   the main thread sets CV_DONE.  V points to the main thread's iters
   but is not used.  */
static void *
test_condvar_helper (void *v)
{
  /* This is wasteful, but the alternative is to add the overhead of a
     mutex lock/unlock to the overall iteration (both threads) and we
     don't want that.  Ideally, this thread would run on an
     independent processing core anyway.  The ONLY goal here is to
     minimize the time the other thread spends waiting for us.  */
  while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0)
    pthread_cond_signal (&cv);

  return NULL;
}
302
/* Time ITERS+1 pthread_cond_wait wake-ups.  A helper thread signals
   CV continuously (without holding the mutex) so the waits return as
   quickly as possible; we measure only this thread's wait loop.  */
static timing_t
test_condvar (long iters, int filler)
{
  timing_t start, stop, cur;
  pthread_t helper_id;

  pthread_mutex_init (&m, NULL);
  pthread_cond_init (&cv, NULL);
  /* The mutex is held across the whole timed region; cond_wait
     releases and re-acquires it on each iteration.  */
  pthread_mutex_lock (&m);

  /* Clear the stop flag before the helper starts signalling.  */
  __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
  pthread_create (&helper_id, NULL, test_condvar_helper, &iters);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_cond_wait (&cv, &m);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  /* Release the mutex before telling the helper to stop; it signals
     without holding the mutex, so ordering here appears not to
     matter for correctness — NOTE(review): confirm.  */
  pthread_mutex_unlock (&m);
  __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);

  pthread_join (helper_id, NULL);
  return cur;
}
331
/* How many items are "queued" in our pretend queue.  Shared between
   the producer and consumer threads; only accessed while holding M.  */
static int queued = 0;

/* Arguments handed to the producer thread, mirroring the
   (iters, filler) parameters of the test functions.  */
typedef struct Producer_Params {
  long iters;
  int filler;
} Producer_Params;
339
340 /* We only benchmark the consumer thread, but both threads are doing
341 essentially the same thing, and never run in parallel due to the
342 locks. Thus, even if they run on separate processing cores, we
343 count the time for both threads. */
/* Producer side of the consumer/producer benchmark: put one item on
   the one-slot queue ITERS+1 times, waiting for the consumer to
   drain it between puts.  V points to a Producer_Params.  */
static void *
test_producer_thread (void *v)
{
  Producer_Params *p = (Producer_Params *) v;
  long iters = p->iters;
  int filler = p->filler;
  long j;

  for (j = iters; j >= 0; --j)
    {
      /* Acquire lock on the queue. */
      pthread_mutex_lock (&m);
      /* If something's already there, wait for the consumer. */
      while (queued > 0)
	pthread_cond_wait (&consumer_c, &m);

      /* Put something on the queue. */
      FILLER_GOES_HERE;
      ++ queued;
      pthread_cond_signal (&producer_c);

      /* Give the other thread a chance to run. */
      pthread_mutex_unlock (&m);
    }

  return NULL;
}
371
372 static timing_t
test_consumer_producer(long iters,int filler)373 test_consumer_producer (long iters, int filler)
374 {
375 timing_t start, stop, cur;
376 pthread_t helper_id;
377 Producer_Params p;
378
379 p.iters = iters;
380 p.filler = filler;
381
382 pthread_mutex_init (&m, NULL);
383 pthread_cond_init (&cv, NULL);
384
385 pthread_create (&helper_id, NULL, test_producer_thread, &p);
386
387 TIMING_NOW (start);
388
389 for (long j = iters; j >= 0; --j)
390 {
391 /* Aquire lock on the queue. */
392 pthread_mutex_lock (&m);
393 /* Wait for something to be on the queue. */
394 while (queued == 0)
395 pthread_cond_wait (&producer_c, &m);
396
397 /* Take if off. */
398 FILLER_GOES_HERE;
399 -- queued;
400 pthread_cond_signal (&consumer_c);
401
402 /* Give the other thread a chance to run. */
403 pthread_mutex_unlock (&m);
404 }
405
406 TIMING_NOW (stop);
407 TIMING_DIFF (cur, start, stop);
408
409
410 pthread_join (helper_id, NULL);
411 return cur;
412 }
413
414 /* Number of runs we use for computing mean and standard deviation.
415 We actually do two additional runs and discard the outliers. */
416 #define RUN_COUNT 10
417
/* Run one benchmark variant (FUNC with the given FILLER flag),
   auto-scale the iteration count, collect RUN_COUNT+2 timings,
   discard the two outliers, and emit mean/stdev/min/max as JSON
   attributes under "NAME-empty" or "NAME-filler".  Always returns
   0.  */
static int
do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
{
  timing_t cur;
  struct timeval ts, te;
  double tsd, ted, td;
  long iters, iters_limit;
  timing_t curs[RUN_COUNT + 2];
  int i, j;
  double mean, stdev;

  iters = START_ITERS;
  /* Cap so that "iters *= 10" below can never overflow.  */
  iters_limit = LONG_MAX / 100;

  while (1) {
    gettimeofday (&ts, NULL);
    cur = func(iters, filler);
    gettimeofday (&te, NULL);

    /* We want a test to take at least 0.01 seconds, and try
       increasingly larger iteration counts until it does.  This
       allows for approximately constant-time tests regardless of
       hardware speed, without the overhead of checking the time
       inside the test loop itself.  We stop at a million iterations
       as that should be precise enough.  Once we determine a suitable
       iteration count, we run the test multiple times to calculate
       mean and standard deviation.  */

    /* Note that this also primes the CPU cache and triggers faster
       MHz, we hope.  */
    tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
    ted = te.tv_sec + te.tv_usec / 1000000.0;
    td = ted - tsd;
    if (td >= 0.01
	|| iters >= iters_limit
	|| iters >= 1000000)
      break;

    iters *= 10;
  }

  /* Keep the warm-up timing as sample 0; it participates in the
     sort and may end up discarded as an outlier.  */
  curs[0] = cur;
  for (i = 1; i < RUN_COUNT + 2; i ++)
    curs[i] = func(iters, filler);

  /* We sort the results so we can discard the fastest and slowest
     times as outliers.  In theory we should keep the fastest time,
     but IMHO this is more fair.  A simple bubble sort suffices.  */

  for (i = 0; i < RUN_COUNT + 1; i ++)
    for (j = i + 1; j < RUN_COUNT + 2; j ++)
      if (curs[i] > curs[j])
	{
	  timing_t temp = curs[i];
	  curs[i] = curs[j];
	  curs[j] = temp;
	}

  /* Now calculate mean and standard deviation, skipping the outliers.
     NOTE(review): the per-iteration cost divides by ITERS although
     the test loops run ITERS+1 times — a ~0.1% bias at 1000
     iterations; presumably considered negligible.  */
  mean = 0.0;
  for (i = 1; i<RUN_COUNT + 1; i ++)
    mean += (double) curs[i] / (double) iters;
  mean /= RUN_COUNT;

  /* Sample standard deviation (Bessel's correction, N-1).  */
  stdev = 0.0;
  for (i = 1; i < RUN_COUNT + 1; i ++)
    {
      double s = (double) curs[i] / (double) iters - mean;
      stdev += s * s;
    }
  stdev = sqrt (stdev / (RUN_COUNT - 1));

  char buf[128];
  snprintf (buf, sizeof buf, "%s-%s", name, filler ? "filler" : "empty");

  json_attr_object_begin (js, buf);

  /* "duration" is the last warm-up run's raw timing, not a sum over
     the measured runs.  */
  json_attr_double (js, "duration", (double) cur);
  json_attr_double (js, "iterations", (double) iters);
  json_attr_double (js, "wall-sec", (double) td);
  json_attr_double (js, "mean", mean);
  json_attr_double (js, "stdev", stdev);
  json_attr_double (js, "min-outlier", (double) curs[0] / (double) iters);
  json_attr_double (js, "min", (double) curs[1] / (double) iters);
  json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
  json_attr_double (js, "max-outlier", (double) curs[RUN_COUNT + 1] / (double) iters);

  json_attr_object_end (js);

  return 0;
}
509
510 static int
do_bench_1(const char * name,test_t func,json_ctx_t * js)511 do_bench_1 (const char *name, test_t func, json_ctx_t *js)
512 {
513 int rv = 0;
514
515 rv += do_bench_2 (name, func, 0, js);
516 rv += do_bench_2 (name, func, 1, js);
517
518 return rv;
519 }
520
/* Benchmark entry point (invoked via TEST_FUNCTION from the test
   skeleton).  Emits one JSON object per lock-type/filler combination
   under the top-level "pthread_locks" attribute and returns the
   accumulated status (0 on success).  */
int
do_bench (void)
{
  int rv = 0;
  json_ctx_t json_ctx;

  json_init (&json_ctx, 2, stdout);
  json_attr_object_begin (&json_ctx, "pthread_locks");

  /* BENCH(n) runs test_<n> and labels its results "<n>-empty" and
     "<n>-filler".  */
#define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)

  BENCH (mutex);
  BENCH (mutex_trylock);
  BENCH (rwlock_read);
  BENCH (rwlock_tryread);
  BENCH (rwlock_write);
  BENCH (rwlock_trywrite);
  BENCH (spin_lock);
  BENCH (spin_trylock);
  BENCH (sem_wait);
  BENCH (sem_trywait);
  BENCH (condvar);
  BENCH (consumer_producer);

  json_attr_object_end (&json_ctx);

  return rv;
}
549
550
551 #define TEST_FUNCTION do_bench ()
552
553 #include "../test-skeleton.c"
554