1 /* Measure various lock acquisition times for empty critical sections.
2    Copyright (C) 2020-2022 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4 
5    The GNU C Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Lesser General Public
7    License as published by the Free Software Foundation; either
8    version 2.1 of the License, or (at your option) any later version.
9 
10    The GNU C Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Lesser General Public License for more details.
14 
15    You should have received a copy of the GNU Lesser General Public
16    License along with the GNU C Library; if not, see
17    <https://www.gnu.org/licenses/>.  */
18 
19 #define TEST_MAIN
20 #define TEST_NAME "pthread-locks"
21 
22 #include <stdio.h>
23 #include <string.h>
24 #include <limits.h>
25 #include <stdlib.h>
26 #include <pthread.h>
27 #include <semaphore.h>
28 #include <stdatomic.h>
29 #include <sys/time.h>
30 #include <math.h>
31 #include "bench-timing.h"
32 #include "json-lib.h"
33 
34 /* The point of this benchmark is to measure the overhead of an empty
35    critical section or a small critical section.  This is never going
36    to be indicative of real application performance.  Instead we are
37    trying to benchmark the effects of the compiler and the runtime
38    coupled with a particular set of hardware atomic operations.
   The numbers from this benchmark should be taken with a massive grain
   of salt and viewed through the eyes of expert reviewers.  */
41 
/* Synchronization objects shared by all tests.  Each test initializes
   the object(s) it uses before starting its timing loop.  */
static pthread_mutex_t m;
static pthread_rwlock_t rw;
static pthread_cond_t cv;
/* Condition variables for the producer/consumer test.  */
static pthread_cond_t consumer_c, producer_c;
/* Flag telling the condvar helper thread to stop signalling.  */
static int cv_done;
static pthread_spinlock_t sp;
static sem_t sem;

/* A test function runs (iters+1) iterations, optionally with filler
   work in the critical section, and returns the elapsed time.  */
typedef timing_t (*test_t)(long, int);

/* Initial iteration count; do_bench_2 scales it up until one run takes
   long enough to measure reliably.  */
#define START_ITERS 1000

/* Optionally insert a small amount of non-trivial work inside the
   timed region, so locks are measured around a tiny critical section
   as well as an empty one.  */
#define FILLER_GOES_HERE \
  if (filler) \
    do_filler ();
57 
/* Everyone loves a good fibonacci series.  This isn't quite one of
   them because we need larger values in fewer steps, in a way that
   won't be optimized away.  We're looking to approximately double the
   total time each test iteration takes, so as to not swamp the useful
   timings.  */

/* Compile the filler helpers at -O1 so the deliberately redundant
   work is neither fully optimized away nor unrealistically slow.  */
#pragma GCC push_options
#pragma GCC optimize(1)

/* Recursive pseudo-fibonacci; the asm("") barrier and the noinline
   attribute keep the call from being folded to a compile-time
   constant.  For i <= 2 it returns 10+i, so results grow quickly.  */
static int __attribute__((noinline))
fibonacci (int i)
{
  asm("");
  if (i > 2)
    return fibonacci (i-1) + fibonacci (i-2);
  return 10+i;
}

/* The "filler" critical-section body: a short recursion plus a
   memcpy of a few dozen bytes between static buffers.  */
static void
do_filler (void)
{
  static char buf1[512], buf2[512];
  int f = fibonacci (5);
  memcpy (buf1, buf2, f);
}

#pragma GCC pop_options
85 
86 static timing_t
test_mutex(long iters,int filler)87 test_mutex (long iters, int filler)
88 {
89   timing_t start, stop, cur;
90 
91   pthread_mutex_init (&m, NULL);
92 
93   TIMING_NOW (start);
94   for (long j = iters; j >= 0; --j)
95     {
96       pthread_mutex_lock (&m);
97       FILLER_GOES_HERE;
98       pthread_mutex_unlock (&m);
99     }
100   TIMING_NOW (stop);
101   TIMING_DIFF (cur, start, stop);
102 
103   return cur;
104 }
105 
106 static timing_t
test_mutex_trylock(long iters,int filler)107 test_mutex_trylock (long iters, int filler)
108 {
109   timing_t start, stop, cur;
110 
111   pthread_mutex_init (&m, NULL);
112   pthread_mutex_lock (&m);
113 
114   TIMING_NOW (start);
115   for (long j = iters; j >= 0; --j)
116     {
117       pthread_mutex_trylock (&m);
118       FILLER_GOES_HERE;
119     }
120   TIMING_NOW (stop);
121   TIMING_DIFF (cur, start, stop);
122 
123   pthread_mutex_unlock (&m);
124   return cur;
125 }
126 
127 static timing_t
test_rwlock_read(long iters,int filler)128 test_rwlock_read (long iters, int filler)
129 {
130   timing_t start, stop, cur;
131 
132   pthread_rwlock_init (&rw, NULL);
133 
134   TIMING_NOW (start);
135   for (long j = iters; j >= 0; --j)
136     {
137       pthread_rwlock_rdlock (&rw);
138       FILLER_GOES_HERE;
139       pthread_rwlock_unlock (&rw);
140     }
141   TIMING_NOW (stop);
142   TIMING_DIFF (cur, start, stop);
143 
144   return cur;
145 }
146 
147 static timing_t
test_rwlock_tryread(long iters,int filler)148 test_rwlock_tryread (long iters, int filler)
149 {
150   timing_t start, stop, cur;
151 
152   pthread_rwlock_init (&rw, NULL);
153   pthread_rwlock_wrlock (&rw);
154 
155   TIMING_NOW (start);
156   for (long j = iters; j >= 0; --j)
157     {
158       pthread_rwlock_tryrdlock (&rw);
159       FILLER_GOES_HERE;
160     }
161   TIMING_NOW (stop);
162   TIMING_DIFF (cur, start, stop);
163 
164   pthread_rwlock_unlock (&rw);
165   return cur;
166 }
167 
168 static timing_t
test_rwlock_write(long iters,int filler)169 test_rwlock_write (long iters, int filler)
170 {
171   timing_t start, stop, cur;
172 
173   pthread_rwlock_init (&rw, NULL);
174 
175   TIMING_NOW (start);
176   for (long j = iters; j >= 0; --j)
177     {
178       pthread_rwlock_wrlock (&rw);
179       FILLER_GOES_HERE;
180       pthread_rwlock_unlock (&rw);
181     }
182   TIMING_NOW (stop);
183   TIMING_DIFF (cur, start, stop);
184 
185   return cur;
186 }
187 
188 static timing_t
test_rwlock_trywrite(long iters,int filler)189 test_rwlock_trywrite (long iters, int filler)
190 {
191   timing_t start, stop, cur;
192 
193   pthread_rwlock_init (&rw, NULL);
194   pthread_rwlock_rdlock (&rw);
195 
196   TIMING_NOW (start);
197   for (long j = iters; j >= 0; --j)
198     {
199       pthread_rwlock_trywrlock (&rw);
200       FILLER_GOES_HERE;
201     }
202   TIMING_NOW (stop);
203   TIMING_DIFF (cur, start, stop);
204 
205   pthread_rwlock_unlock (&rw);
206   return cur;
207 }
208 
209 static timing_t
test_spin_lock(long iters,int filler)210 test_spin_lock (long iters, int filler)
211 {
212   timing_t start, stop, cur;
213 
214   pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
215 
216   TIMING_NOW (start);
217   for (long j = iters; j >= 0; --j)
218     {
219       pthread_spin_lock (&sp);
220       FILLER_GOES_HERE;
221       pthread_spin_unlock (&sp);
222     }
223   TIMING_NOW (stop);
224   TIMING_DIFF (cur, start, stop);
225 
226   return cur;
227 }
228 
229 static timing_t
test_spin_trylock(long iters,int filler)230 test_spin_trylock (long iters, int filler)
231 {
232   timing_t start, stop, cur;
233 
234   pthread_spin_init (&sp, PTHREAD_PROCESS_PRIVATE);
235   pthread_spin_lock (&sp);
236 
237   TIMING_NOW (start);
238   for (long j = iters; j >= 0; --j)
239     {
240       pthread_spin_trylock (&sp);
241       FILLER_GOES_HERE;
242     }
243   TIMING_NOW (stop);
244   TIMING_DIFF (cur, start, stop);
245 
246   pthread_spin_unlock (&sp);
247   return cur;
248 }
249 
250 static timing_t
test_sem_wait(long iters,int filler)251 test_sem_wait (long iters, int filler)
252 {
253   timing_t start, stop, cur;
254 
255   sem_init (&sem, 0, 1);
256 
257   TIMING_NOW (start);
258   for (long j = iters; j >= 0; --j)
259     {
260       sem_post (&sem);
261       FILLER_GOES_HERE;
262       sem_wait (&sem);
263     }
264   TIMING_NOW (stop);
265   TIMING_DIFF (cur, start, stop);
266 
267   return cur;
268 }
269 
270 static timing_t
test_sem_trywait(long iters,int filler)271 test_sem_trywait (long iters, int filler)
272 {
273   timing_t start, stop, cur;
274 
275   sem_init (&sem, 0, 0);
276 
277   TIMING_NOW (start);
278   for (long j = iters; j >= 0; --j)
279     {
280       sem_trywait (&sem);
281       FILLER_GOES_HERE;
282     }
283   TIMING_NOW (stop);
284   TIMING_DIFF (cur, start, stop);
285 
286   return cur;
287 }
288 
289 static void *
test_condvar_helper(void * v)290 test_condvar_helper (void *v)
291 {
292   /* This is wasteful, but the alternative is to add the overhead of a
293      mutex lock/unlock to the overall iteration (both threads) and we
294      don't want that.  Ideally, this thread would run on an
295      independent processing core anyway.  The ONLY goal here is to
296      minimize the time the other thread spends waiting for us.  */
297   while (__atomic_load_n (&cv_done, __ATOMIC_RELAXED) == 0)
298     pthread_cond_signal (&cv);
299 
300   return NULL;
301 }
302 
/* Time (ITERS+1) wakeups from pthread_cond_wait while a helper thread
   continuously signals the condition variable.  There is no predicate
   being re-checked here, so any wakeup -- including a "spurious" one
   -- counts as an iteration; this measures wakeup latency, not a real
   producer/consumer protocol.  */
static timing_t
test_condvar (long iters, int filler)
{
  timing_t start, stop, cur;
  pthread_t helper_id;

  pthread_mutex_init (&m, NULL);
  pthread_cond_init (&cv, NULL);
  pthread_mutex_lock (&m);

  /* Clear the stop flag before starting the signalling helper.  */
  __atomic_store_n (&cv_done, 0, __ATOMIC_RELAXED);
  /* NOTE(review): &iters is passed as the thread argument but
     test_condvar_helper ignores it.  */
  pthread_create (&helper_id, NULL, test_condvar_helper, &iters);

  TIMING_NOW (start);
  for (long j = iters; j >= 0; --j)
    {
      pthread_cond_wait (&cv, &m);
      FILLER_GOES_HERE;
    }
  TIMING_NOW (stop);
  TIMING_DIFF (cur, start, stop);

  pthread_mutex_unlock (&m);
  /* Tell the helper to stop signalling, then reap it.  */
  __atomic_store_n (&cv_done, 1, __ATOMIC_RELAXED);

  pthread_join (helper_id, NULL);
  return cur;
}
331 
/* How many items are "queued" in our pretend queue.  Protected by the
   mutex M in the producer/consumer test.  */
static int queued = 0;

/* Arguments handed to the producer thread; mirrors the (iters,
   filler) pair every test function receives.  */
typedef struct Producer_Params {
  long iters;
  int filler;
} Producer_Params;
339 
/* We only benchmark the consumer thread, but both threads are doing
   essentially the same thing, and never run in parallel due to the
   locks.  Thus, even if they run on separate processing cores, we
   count the time for both threads.  */
static void *
test_producer_thread (void *v)
{
  Producer_Params *p = (Producer_Params *) v;
  long iters = p->iters;
  int filler = p->filler;
  long j;

  for (j = iters; j >= 0; --j)
    {
      /* Acquire lock on the queue.  */
      pthread_mutex_lock (&m);
      /* If something's already there, wait until the consumer has
	 taken it off.  */
      while (queued > 0)
	pthread_cond_wait (&consumer_c, &m);

      /* Put something on the queue and wake the consumer.  */
      FILLER_GOES_HERE;
      ++ queued;
      pthread_cond_signal (&producer_c);

      /* Give the other thread a chance to run.  */
      pthread_mutex_unlock (&m);
    }

  return NULL;
}
371 
372 static timing_t
test_consumer_producer(long iters,int filler)373 test_consumer_producer (long iters, int filler)
374 {
375   timing_t start, stop, cur;
376   pthread_t helper_id;
377   Producer_Params p;
378 
379   p.iters = iters;
380   p.filler = filler;
381 
382   pthread_mutex_init (&m, NULL);
383   pthread_cond_init (&cv, NULL);
384 
385   pthread_create (&helper_id, NULL, test_producer_thread, &p);
386 
387   TIMING_NOW (start);
388 
389   for (long j = iters; j >= 0; --j)
390     {
391       /* Aquire lock on the queue.  */
392       pthread_mutex_lock (&m);
393       /* Wait for something to be on the queue.  */
394       while (queued == 0)
395 	pthread_cond_wait (&producer_c, &m);
396 
397       /* Take if off. */
398       FILLER_GOES_HERE;
399       -- queued;
400       pthread_cond_signal (&consumer_c);
401 
402       /* Give the other thread a chance to run.  */
403       pthread_mutex_unlock (&m);
404     }
405 
406   TIMING_NOW (stop);
407   TIMING_DIFF (cur, start, stop);
408 
409 
410   pthread_join (helper_id, NULL);
411   return cur;
412 }
413 
414 /* Number of runs we use for computing mean and standard deviation.
415    We actually do two additional runs and discard the outliers.  */
416 #define RUN_COUNT 10
417 
418 static int
do_bench_2(const char * name,test_t func,int filler,json_ctx_t * js)419 do_bench_2 (const char *name, test_t func, int filler, json_ctx_t *js)
420 {
421   timing_t cur;
422   struct timeval ts, te;
423   double tsd, ted, td;
424   long iters, iters_limit;
425   timing_t curs[RUN_COUNT + 2];
426   int i, j;
427   double mean, stdev;
428 
429   iters = START_ITERS;
430   iters_limit = LONG_MAX / 100;
431 
432   while (1) {
433     gettimeofday (&ts, NULL);
434     cur = func(iters, filler);
435     gettimeofday (&te, NULL);
436 
437     /* We want a test to take at least 0.01 seconds, and try
438        increasingly larger iteration counts until it does.  This
439        allows for approximately constant-time tests regardless of
440        hardware speed, without the overhead of checking the time
441        inside the test loop itself.  We stop at a million iterations
442        as that should be precise enough.  Once we determine a suitable
443        iteration count, we run the test multiple times to calculate
444        mean and standard deviation.  */
445 
446     /* Note that this also primes the CPU cache and triggers faster
447        MHz, we hope.  */
448     tsd = ts.tv_sec + ts.tv_usec / 1000000.0;
449     ted = te.tv_sec + te.tv_usec / 1000000.0;
450     td = ted - tsd;
451     if (td >= 0.01
452 	|| iters >= iters_limit
453 	|| iters >= 1000000)
454       break;
455 
456     iters *= 10;
457   }
458 
459   curs[0] = cur;
460   for (i = 1; i < RUN_COUNT + 2; i ++)
461     curs[i] = func(iters, filler);
462 
463   /* We sort the results so we can discard the fastest and slowest
464      times as outliers.  In theory we should keep the fastest time,
465      but IMHO this is more fair.  A simple bubble sort suffices.  */
466 
467   for (i = 0; i < RUN_COUNT + 1; i ++)
468     for (j = i + 1; j < RUN_COUNT + 2; j ++)
469       if (curs[i] > curs[j])
470 	{
471 	  timing_t temp = curs[i];
472 	  curs[i] = curs[j];
473 	  curs[j] = temp;
474 	}
475 
476   /* Now calculate mean and standard deviation, skipping the outliers.  */
477   mean = 0.0;
478   for (i = 1; i<RUN_COUNT + 1; i ++)
479     mean += (double) curs[i] / (double) iters;
480   mean /= RUN_COUNT;
481 
482   stdev = 0.0;
483   for (i = 1; i < RUN_COUNT + 1; i ++)
484     {
485       double s = (double) curs[i] / (double) iters - mean;
486       stdev += s * s;
487     }
488   stdev = sqrt (stdev / (RUN_COUNT - 1));
489 
490   char buf[128];
491   snprintf (buf, sizeof buf, "%s-%s", name, filler ? "filler" : "empty");
492 
493   json_attr_object_begin (js, buf);
494 
495   json_attr_double (js, "duration", (double) cur);
496   json_attr_double (js, "iterations", (double) iters);
497   json_attr_double (js, "wall-sec", (double) td);
498   json_attr_double (js, "mean", mean);
499   json_attr_double (js, "stdev", stdev);
500   json_attr_double (js, "min-outlier", (double) curs[0] / (double) iters);
501   json_attr_double (js, "min", (double) curs[1] / (double) iters);
502   json_attr_double (js, "max", (double) curs[RUN_COUNT] / (double) iters);
503   json_attr_double (js, "max-outlier", (double) curs[RUN_COUNT + 1] / (double) iters);
504 
505   json_attr_object_end (js);
506 
507   return 0;
508 }
509 
510 static int
do_bench_1(const char * name,test_t func,json_ctx_t * js)511 do_bench_1 (const char *name, test_t func, json_ctx_t *js)
512 {
513   int rv = 0;
514 
515   rv += do_bench_2 (name, func, 0, js);
516   rv += do_bench_2 (name, func, 1, js);
517 
518   return rv;
519 }
520 
521 int
do_bench(void)522 do_bench (void)
523 {
524   int rv = 0;
525   json_ctx_t json_ctx;
526 
527   json_init (&json_ctx, 2, stdout);
528   json_attr_object_begin (&json_ctx, "pthread_locks");
529 
530 #define BENCH(n) rv += do_bench_1 (#n, test_##n, &json_ctx)
531 
532   BENCH (mutex);
533   BENCH (mutex_trylock);
534   BENCH (rwlock_read);
535   BENCH (rwlock_tryread);
536   BENCH (rwlock_write);
537   BENCH (rwlock_trywrite);
538   BENCH (spin_lock);
539   BENCH (spin_trylock);
540   BENCH (sem_wait);
541   BENCH (sem_trywait);
542   BENCH (condvar);
543   BENCH (consumer_producer);
544 
545   json_attr_object_end (&json_ctx);
546 
547   return rv;
548 }
549 
550 
551 #define TEST_FUNCTION do_bench ()
552 
553 #include "../test-skeleton.c"
554