/* Copyright (C) 2003-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <endian.h>
#include <errno.h>
#include <sysdep.h>
#include <futex-internal.h>
#include <pthread.h>
#include <pthreadP.h>
#include <sys/time.h>
#include <atomic.h>
#include <stdint.h>
#include <stdbool.h>

#include <shlib-compat.h>
#include <stap-probe.h>
#include <time.h>

#include "pthread_cond_common.c"


struct _condvar_cleanup_buffer
{
  uint64_t wseq;
  pthread_cond_t *cond;
  pthread_mutex_t *mutex;
  int private;
};


/* Decrease the waiter reference count.  */
static void
__condvar_confirm_wakeup (pthread_cond_t *cond, int private)
{
  /* If destruction is pending (i.e., the wake-request flag is nonzero) and we
     are the last waiter (prior value of __wrefs was 1 << 3), then wake any
     threads waiting in pthread_cond_destroy.  Release MO to synchronize with
     these threads.  Don't bother clearing the wake-up request flag.  */
  if ((atomic_fetch_add_release (&cond->__data.__wrefs, -8) >> 2) == 3)
    futex_wake (&cond->__data.__wrefs, INT_MAX, private);
}
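
/* Illustrative arithmetic for the check above (not part of the original
   comments): __wrefs counts waiters in units of 8, with bit 2 holding the
   wake-request flag and bits 1 and 0 holding the clock and process-shared
   flags (see the __wrefs description below).  If exactly one waiter remains
   and destruction is pending, __wrefs is (1 << 3) | (1 << 2) plus the low
   flag bits, so the prior value returned by the fetch-add satisfies
   (prior >> 2) == 3, which is the condition that triggers the futex_wake.  */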


/* Cancel waiting after having registered as a waiter previously.  SEQ is our
   position and G is our group index.
   The goal of cancellation is to make our group smaller if that is still
   possible.  If we are in a closed group, this is not possible anymore; in
   this case, we need to send a replacement signal for the one we effectively
   consumed because the signal should have gotten consumed by another waiter
   instead; we must not both cancel waiting and consume a signal.

   Must not be called while still holding a reference on the group.

   Returns true iff we consumed a signal.

   On some kinds of timeouts, we may be able to pretend that a signal we
   effectively consumed happened before the timeout (i.e., similarly to first
   spinning on signals before actually checking whether the timeout has
   passed already).  Doing this would allow us to skip sending a replacement
   signal, but this case might happen rarely because the end of the timeout
   must race with someone else sending a signal.  Therefore, we don't bother
   trying to optimize this.  */
static void
__condvar_cancel_waiting (pthread_cond_t *cond, uint64_t seq, unsigned int g,
			  int private)
{
  bool consumed_signal = false;

  /* No deadlock with group switching is possible here because we do
     not hold a reference on the group.  */
  __condvar_acquire_lock (cond, private);

  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond) >> 1;
  if (g1_start > seq)
    {
      /* Our group is closed, so someone provided enough signals for it.
	 Thus, we effectively consumed a signal.  */
      consumed_signal = true;
    }
  else
    {
      if (g1_start + __condvar_get_orig_size (cond) <= seq)
	{
	  /* We are in the current G2 and thus cannot have consumed a signal.
	     Reduce its effective size or handle overflow.  Remember that in
	     G2, unsigned int size is zero or a negative value.  */
	  if (cond->__data.__g_size[g] + __PTHREAD_COND_MAX_GROUP_SIZE > 0)
	    {
	      cond->__data.__g_size[g]--;
	    }
	  else
	    {
	      /* Cancellations would overflow the maximum group size.  Just
		 wake up everyone spuriously to create a clean state.  This
		 also means we do not consume a signal someone else sent.  */
	      __condvar_release_lock (cond, private);
	      __pthread_cond_broadcast (cond);
	      return;
	    }
	}
      else
	{
	  /* We are in the current G1.  If the group's size is zero, someone
	     put a signal in the group that nobody else but us can consume.  */
	  if (cond->__data.__g_size[g] == 0)
	    consumed_signal = true;
	  else
	    {
	      /* Otherwise, we decrease the size of the group.  This is
		 equivalent to atomically putting in a signal just for us and
		 consuming it right away.  We do not consume a signal sent
		 by someone else.  We also cannot have consumed a futex
		 wake-up because if we were cancelled or timed out in a futex
		 call, the futex will wake another waiter.  */
	      cond->__data.__g_size[g]--;
	    }
	}
    }

  __condvar_release_lock (cond, private);

  if (consumed_signal)
    {
      /* We effectively consumed a signal even though we didn't want to.
	 Therefore, we need to send a replacement signal.
	 If we would want to optimize this, we could do what
	 pthread_cond_signal does right in the critical section above.  */
      __pthread_cond_signal (cond);
    }
}

/* Wake up any signalers that might be waiting.  */
static void
__condvar_dec_grefs (pthread_cond_t *cond, unsigned int g, int private)
{
  /* Release MO to synchronize-with the acquire load in
     __condvar_quiesce_and_switch_g1.  */
  if (atomic_fetch_add_release (cond->__data.__g_refs + g, -2) == 3)
    {
      /* Clear the wake-up request flag before waking up.  We do not need more
	 than relaxed MO and it doesn't matter if we apply this for an aliased
	 group because we wake all futex waiters right after clearing the
	 flag.  */
      atomic_fetch_and_relaxed (cond->__data.__g_refs + g, ~(unsigned int) 1);
      futex_wake (cond->__data.__g_refs + g, INT_MAX, private);
    }
}
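
/* Illustrative arithmetic for the check above (not part of the original
   comments): __g_refs counts futex users in units of 2, with the LSB holding
   the wake-request flag (see the __g_refs description below).  A prior value
   of 3 therefore means we held the last reference (count of 1, stored as 2)
   and a signaler waiting for quiescence in __condvar_quiesce_and_switch_g1
   has requested a wake-up, so we clear the flag and wake it.  */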

/* Clean-up for cancellation of waiters waiting for normal signals.  We cancel
   our registration as a waiter, confirm we have woken up, and re-acquire the
   mutex.  */
static void
__condvar_cleanup_waiting (void *arg)
{
  struct _condvar_cleanup_buffer *cbuffer =
    (struct _condvar_cleanup_buffer *) arg;
  pthread_cond_t *cond = cbuffer->cond;
  unsigned g = cbuffer->wseq & 1;

  __condvar_dec_grefs (cond, g, cbuffer->private);

  __condvar_cancel_waiting (cond, cbuffer->wseq >> 1, g, cbuffer->private);
  /* FIXME With the current cancellation implementation, it is possible that
     a thread is cancelled after it has returned from a syscall.  This could
     result in a cancelled waiter consuming a futex wake-up that is then
     causing another waiter in the same group to not wake up.  To work around
     this issue until we have fixed cancellation, just add a futex wake-up
     conservatively.  */
  futex_wake (cond->__data.__g_signals + g, 1, cbuffer->private);

  __condvar_confirm_wakeup (cond, cbuffer->private);

  /* XXX If locking the mutex fails, should we just stop execution?  This
     might be better than silently ignoring the error.  */
  __pthread_mutex_cond_lock (cbuffer->mutex);
}

/* This condvar implementation guarantees that all calls to signal and
   broadcast and all of the three virtually atomic parts of each call to wait
   (i.e., (1) releasing the mutex and blocking, (2) unblocking, and (3) re-
   acquiring the mutex) happen in some total order that is consistent with the
   happens-before relations in the calling program.  However, this order does
   not necessarily result in additional happens-before relations being
   established (which aligns well with spurious wake-ups being allowed).

   All waiters acquire a certain position in a 64b waiter sequence (__wseq).
   This sequence determines which waiters are allowed to consume signals.
   A broadcast is equal to sending as many signals as there are unblocked
   waiters.  When a signal arrives, it samples the current value of __wseq
   with a relaxed-MO load (i.e., the position the next waiter would get).
   (This is sufficient because it is consistent with happens-before; the
   caller can enforce stronger ordering constraints by calling signal while
   holding the mutex.)  Only waiters with a position less than the __wseq
   value observed by the signal are eligible to consume this signal.
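
   (Worked example, not part of the original description: if a signal samples
   __wseq and observes position 7, then waiters that obtained positions 0..6
   are eligible to consume that signal, whereas a waiter that acquires
   position 7 or later afterwards is not.)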

   This would be straightforward to implement if waiters would just spin, but
   we need to let them block using futexes.  Futexes give no guarantee of
   waking in FIFO order, so we cannot reliably wake eligible waiters if we
   just use a single futex.  Also, futex words are 32b in size, but we need
   to distinguish more than 1<<32 states because we need to represent the
   order of wake-up (and thus which waiters are eligible to consume signals);
   blocking in a futex is not atomic with a waiter determining its position in
   the waiter sequence, so we need the futex word to reliably notify waiters
   that they should not attempt to block anymore because they have been
   already signaled in the meantime.  While an ABA issue on a 32b value will
   be rare, ignoring it when we are aware of it is not the right thing to do
   either.

   Therefore, we use a 64b counter to represent the waiter sequence (on
   architectures which only support 32b atomics, we use a few bits less).
   To deal with the blocking using futexes, we maintain two groups of waiters:
   * Group G1 consists of waiters that are all eligible to consume signals;
     incoming signals will always signal waiters in this group until all
     waiters in G1 have been signaled.
   * Group G2 consists of waiters that arrive when a G1 is present and still
     contains waiters that have not been signaled.  When all waiters in G1
     are signaled and a new signal arrives, the new signal will convert G2
     into the new G1 and create a new G2 for future waiters.
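
   (Illustrative scenario, not part of the original description: with a
   freshly initialized condvar, three waiters W1-W3 all enter the current G2.
   The first pthread_cond_signal call finds no unsignaled G1 waiters, so it
   turns that group into the new G1 with size 3, opens a fresh G2 for later
   arrivals, and provides one signal that one of W1-W3 consumes; two further
   signals complete G1, and only then can a subsequent signal switch the
   groups again.)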

   We cannot allocate new memory because of process-shared condvars, so we
   have just two slots of groups that change their role between G1 and G2.
   Each has a separate futex word, a number of signals available for
   consumption, a size (number of waiters in the group that have not been
   signaled), and a reference count.

   The group reference count is used to maintain the number of waiters that
   are using the group's futex.  Before a group can change its role, the
   reference count must show that no waiters are using the futex anymore; this
   prevents ABA issues on the futex word.

   To represent which intervals in the waiter sequence the groups cover (and
   thus also which group slot contains G1 or G2), we use a 64b counter to
   designate the start position of G1 (inclusive), and a single bit in the
   waiter sequence counter to represent which group slot currently contains
   G2.  This allows us to switch group roles atomically wrt. waiters obtaining
   a position in the waiter sequence.  The G1 start position allows waiters to
   figure out whether they are in a group that has already been completely
   signaled (i.e., if the current G1 starts at a later position than the
   waiter's position).  Waiters cannot determine whether they are currently
   in G2 or G1 -- but they do not have to because all they are interested in
   is whether there are available signals, and they always start in G2 (whose
   group slot they know because of the bit in the waiter sequence).  Signalers
   will simply fill the right group until it is completely signaled and can
   be closed (they do not switch group roles until they really have to, to
   decrease the likelihood of having to wait for waiters still holding a
   reference on the now-closed G1).

   Signalers maintain the initial size of G1 to be able to determine where
   G2 starts (G2 is always open-ended until it becomes G1).  They track the
   remaining size of a group; when waiters cancel waiting (due to PThreads
   cancellation or timeouts), they will decrease this remaining size as well.

   To implement condvar destruction requirements (i.e., that
   pthread_cond_destroy can be called as soon as all waiters have been
   signaled), waiters increment a reference count before starting to wait and
   decrement it after they stopped waiting but right before they acquire the
   mutex associated with the condvar.

   pthread_cond_t thus consists of the following (bits that are used for
   flags and are not part of the primary value of each field but necessary
   to make some things atomic or because there was no space for them
   elsewhere in the data structure):

   __wseq: Waiter sequence counter
     * LSB is index of current G2.
     * Waiters fetch-add while having acquired the mutex associated with the
       condvar.  Signalers load it and fetch-xor it concurrently.
   __g1_start: Starting position of G1 (inclusive)
     * LSB is index of current G2.
     * Modified by signalers while having acquired the condvar-internal lock
       and observed concurrently by waiters.
   __g1_orig_size: Initial size of G1
     * The two least-significant bits represent the condvar-internal lock.
     * Only accessed while having acquired the condvar-internal lock.
   __wrefs: Waiter reference counter.
     * Bit 2 is true if waiters should run futex_wake when they remove the
       last reference.  pthread_cond_destroy uses this as futex word.
     * Bit 1 is the clock ID (0 == CLOCK_REALTIME, 1 == CLOCK_MONOTONIC).
     * Bit 0 is true iff this is a process-shared condvar.
     * Simple reference count used by both waiters and pthread_cond_destroy.
     (If the format of __wrefs is changed, update nptl_lock_constants.pysym
      and the pretty printers.)
   For each of the two groups, we have:
   __g_refs: Futex waiter reference count.
     * LSB is true if waiters should run futex_wake when they remove the
       last reference.
     * Reference count used by waiters concurrently with signalers that have
       acquired the condvar-internal lock.
   __g_signals: The number of signals that can still be consumed.
     * Used as a futex word by waiters.  Used concurrently by waiters and
       signalers.
     * LSB is true iff this group has been completely signaled (i.e., it is
       closed).
   __g_size: Waiters remaining in this group (i.e., which have not been
     signaled yet).
     * Accessed by signalers and waiters that cancel waiting (both do so only
       when having acquired the condvar-internal lock).
     * The size of G2 is always zero because it cannot be determined until
       the group becomes G1.
     * Although this is of unsigned type, we rely on using unsigned overflow
       rules to make this hold effectively negative values too (in
       particular, when waiters in G2 cancel waiting).
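
   (Illustrative decoding, mirroring the code in __pthread_cond_wait_common
   below, not part of the original description:

     uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
     unsigned int g = wseq & 1;   // group slot that currently acts as G2
     uint64_t seq = wseq >> 1;    // our position in the waiter sequence

   Likewise, __g1_start >> 1 is the first waiter position belonging to the
   current G1, and __g1_start & 1 is the slot index of the current G2.)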

   A PTHREAD_COND_INITIALIZER condvar has all fields set to zero, which yields
   a condvar that has G2 starting at position 0 and a G1 that is closed.

   Because waiters do not claim ownership of a group right when obtaining a
   position in __wseq but only reference count the group when using futexes
   to block, it can happen that a group gets closed before a waiter can
   increment the reference count.  Therefore, waiters have to check whether
   their group is already closed using __g1_start.  They also have to perform
   this check when spinning while trying to grab a signal from __g_signals.
   Note that for these checks, using relaxed MO to load __g1_start is
   sufficient because if a waiter can see a sufficiently large value, it could
   have also consumed a signal in the waiter's group.

   Waiters try to grab a signal from __g_signals without holding a reference
   count, which can lead to stealing a signal from a more recent group after
   their own group was already closed.  They cannot always detect whether they
   in fact did so because they do not know when they stole, but they can
   conservatively add a signal back to the group they stole from; if they
   did so unnecessarily, all that happens is a spurious wake-up.  To make this
   even less likely, __g1_start contains the index of the current G2 too,
   which allows waiters to check if there is aliasing on the group slots; if
   there isn't, they didn't steal from the current G1, which means that the
   G1 they stole from must have been already closed and they do not need to
   fix anything.

   It is essential that the last field in pthread_cond_t is __g_signals[1]:
   The previous condvar used a pointer-sized field in pthread_cond_t, so a
   PTHREAD_COND_INITIALIZER from that condvar implementation might only
   initialize 4 bytes to zero instead of the 8 bytes we need (i.e., 44 bytes
   in total instead of the 48 we need).  __g_signals[1] is not accessed before
   the first group switch (G2 starts at index 0), which will set its value to
   zero after a harmless fetch-or whose return value is ignored.  This
   effectively completes initialization.


   Limitations:
   * This condvar isn't designed to allow for more than
     __PTHREAD_COND_MAX_GROUP_SIZE * (1 << 31) calls to __pthread_cond_wait.
   * More than __PTHREAD_COND_MAX_GROUP_SIZE concurrent waiters are not
     supported.
   * Beyond what is allowed as errors by POSIX or documented, we can also
     return the following errors:
     * EPERM if MUTEX is a recursive mutex and the caller doesn't own it.
     * EOWNERDEAD or ENOTRECOVERABLE when using robust mutexes.  Unlike
       for other errors, this can happen when we re-acquire the mutex; this
       isn't allowed by POSIX (which requires all errors to virtually happen
       before we release the mutex or change the condvar state), but there's
       nothing we can do really.
     * When using PTHREAD_MUTEX_PP_* mutexes, we can also return all errors
       returned by __pthread_tpp_change_priority.  We will already have
       released the mutex in such cases, so the caller cannot expect to own
       MUTEX.

   Other notes:
   * Instead of the normal mutex unlock / lock functions, we use
     __pthread_mutex_unlock_usercnt(m, 0) / __pthread_mutex_cond_lock(m)
     because those will not change the mutex-internal users count, so that it
     can be detected when a condvar is still associated with a particular
     mutex because there is a waiter blocked on this condvar using this mutex.
*/
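
/* For illustration only (not part of the original sources): the calling
   pattern this implementation is written for, and the reason spurious
   wake-ups are acceptable, is the usual predicate loop; m and predicate are
   placeholders for the caller's mutex and condition:

     pthread_mutex_lock (&m);
     while (!predicate)
       pthread_cond_wait (&cond, &m);
     ... use the protected state ...
     pthread_mutex_unlock (&m);
*/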
static __always_inline int
__pthread_cond_wait_common (pthread_cond_t *cond, pthread_mutex_t *mutex,
    clockid_t clockid, const struct __timespec64 *abstime)
{
  const int maxspin = 0;
  int err;
  int result = 0;

  LIBC_PROBE (cond_wait, 2, cond, mutex);

  /* clockid will already have been checked by
     __pthread_cond_clockwait or pthread_condattr_setclock, or we
     don't use it if abstime is NULL, so we don't need to check it
     here. */

  /* Acquire a position (SEQ) in the waiter sequence (WSEQ).  We use an
     atomic operation because signals and broadcasts may update the group
     switch without acquiring the mutex.  We do not need release MO here
     because we do not need to establish any happens-before relation with
     signalers (see __pthread_cond_signal); modification order alone
     establishes a total order of waiters/signals.  We do need acquire MO
     to synchronize with group reinitialization in
     __condvar_quiesce_and_switch_g1.  */
  uint64_t wseq = __condvar_fetch_add_wseq_acquire (cond, 2);
  /* Find our group's index.  We always go into what was G2 when we acquired
     our position.  */
  unsigned int g = wseq & 1;
  uint64_t seq = wseq >> 1;

  /* Increase the waiter reference count.  Relaxed MO is sufficient because
     we only need to synchronize when decrementing the reference count.  */
  unsigned int flags = atomic_fetch_add_relaxed (&cond->__data.__wrefs, 8);
  int private = __condvar_get_private (flags);

  /* Now that we are registered as a waiter, we can release the mutex.
     Waiting on the condvar must be atomic with releasing the mutex, so if
     the mutex is used to establish a happens-before relation with any
     signaler, the waiter must be visible to the latter; thus, we release the
     mutex after registering as waiter.
     If releasing the mutex fails, we just cancel our registration as a
     waiter and confirm that we have woken up.  */
  err = __pthread_mutex_unlock_usercnt (mutex, 0);
  if (__glibc_unlikely (err != 0))
    {
      __condvar_cancel_waiting (cond, seq, g, private);
      __condvar_confirm_wakeup (cond, private);
      return err;
    }

  /* Now wait until a signal is available in our group or it is closed.
     Acquire MO so that if we observe a value of zero written after group
     switching in __condvar_quiesce_and_switch_g1, we synchronize with that
     store and will see the prior update of __g1_start done while switching
     groups too.  */
  unsigned int signals = atomic_load_acquire (cond->__data.__g_signals + g);

  do
    {
      while (1)
	{
	  /* Spin-wait first.
	     Note that spinning first without checking whether a timeout
	     passed might lead to what looks like a spurious wake-up even
	     though we should return ETIMEDOUT (e.g., if the caller provides
	     an absolute timeout that is clearly in the past).  However,
	     (1) spurious wake-ups are allowed, (2) it seems unlikely that a
	     user will (ab)use pthread_cond_wait as a check for whether a
	     point in time is in the past, and (3) spinning first without
	     having to compare against the current time seems to be the right
	     choice from a performance perspective for most use cases.  */
	  unsigned int spin = maxspin;
	  while (signals == 0 && spin > 0)
	    {
	      /* Check that we are not spinning on a group that's already
		 closed.  */
	      if (seq < (__condvar_load_g1_start_relaxed (cond) >> 1))
		goto done;

	      /* TODO Back off.  */

	      /* Reload signals.  See above for MO.  */
	      signals = atomic_load_acquire (cond->__data.__g_signals + g);
	      spin--;
	    }

	  /* If our group will be closed as indicated by the flag on signals,
	     don't bother grabbing a signal.  */
	  if (signals & 1)
	    goto done;

	  /* If there is an available signal, don't block.  */
	  if (signals != 0)
	    break;

	  /* No signals available after spinning, so prepare to block.
	     We first acquire a group reference and use acquire MO for that so
	     that we synchronize with the dummy read-modify-write in
	     __condvar_quiesce_and_switch_g1 if we read from that.  In turn,
	     in this case this will make us see the closed flag on __g_signals
	     that designates a concurrent attempt to reuse the group's slot.
	     We use acquire MO for the __g_signals check to make the
	     __g1_start check work (see spinning above).
	     Note that the group reference acquisition will not mask the
	     release MO when decrementing the reference count because we use
	     an atomic read-modify-write operation and thus extend the release
	     sequence.  */
	  atomic_fetch_add_acquire (cond->__data.__g_refs + g, 2);
	  if (((atomic_load_acquire (cond->__data.__g_signals + g) & 1) != 0)
	      || (seq < (__condvar_load_g1_start_relaxed (cond) >> 1)))
	    {
	      /* Our group is closed.  Wake up any signalers that might be
		 waiting.  */
	      __condvar_dec_grefs (cond, g, private);
	      goto done;
	    }

	  // Now block.
	  struct _pthread_cleanup_buffer buffer;
	  struct _condvar_cleanup_buffer cbuffer;
	  cbuffer.wseq = wseq;
	  cbuffer.cond = cond;
	  cbuffer.mutex = mutex;
	  cbuffer.private = private;
	  __pthread_cleanup_push (&buffer, __condvar_cleanup_waiting, &cbuffer);

	  err = __futex_abstimed_wait_cancelable64 (
	    cond->__data.__g_signals + g, 0, clockid, abstime, private);

	  __pthread_cleanup_pop (&buffer, 0);

	  if (__glibc_unlikely (err == ETIMEDOUT || err == EOVERFLOW))
	    {
	      __condvar_dec_grefs (cond, g, private);
	      /* If we timed out, we effectively cancel waiting.  Note that
		 we have decremented __g_refs before cancellation, so that a
		 deadlock between waiting for quiescence of our group in
		 __condvar_quiesce_and_switch_g1 and us trying to acquire
		 the lock during cancellation is not possible.  */
	      __condvar_cancel_waiting (cond, seq, g, private);
	      result = err;
	      goto done;
	    }
	  else
	    __condvar_dec_grefs (cond, g, private);

	  /* Reload signals.  See above for MO.  */
	  signals = atomic_load_acquire (cond->__data.__g_signals + g);
	}

    }
  /* Try to grab a signal.  Use acquire MO so that we see an up-to-date value
     of __g1_start below (see spinning above for a similar case).  In
     particular, if we steal from a more recent group, we will also see a
     more recent __g1_start below.  */
  while (!atomic_compare_exchange_weak_acquire (cond->__data.__g_signals + g,
						&signals, signals - 2));

  /* We consumed a signal but we could have consumed from a more recent group
     that aliased with ours due to being in the same group slot.  If this
     might be the case, our group must be closed as visible through
     __g1_start.  */
  uint64_t g1_start = __condvar_load_g1_start_relaxed (cond);
  if (seq < (g1_start >> 1))
    {
      /* We potentially stole a signal from a more recent group but we do not
	 know which group we really consumed from.
	 We do not care about groups older than current G1 because they are
	 closed; we could have stolen from these, but then we just add a
	 spurious wake-up for the current groups.
	 We will never steal a signal from current G2 that was really intended
	 for G2 because G2 never receives signals (until it becomes G1).  We
	 could have stolen a signal from G2 that was conservatively added by a
	 previous waiter that also thought it stole a signal -- but given that
	 that signal was added unnecessarily, it's not a problem if we steal
	 it.
	 Thus, the remaining case is that we could have stolen from the current
	 G1, where "current" means the __g1_start value we observed.  However,
	 if the current G1 does not have the same slot index as we do, we did
	 not steal from it and do not need to undo that.  This is the reason
	 for putting a bit with G2's index into __g1_start as well.  */
      if (((g1_start & 1) ^ 1) == g)
	{
	  /* We have to conservatively undo our potential mistake of stealing
	     a signal.  We can stop trying to do that when the current G1
	     changes because other spinning waiters will notice this too and
	     __condvar_quiesce_and_switch_g1 has checked that there are no
	     futex waiters anymore before switching G1.
	     Relaxed MO is fine for the __g1_start load because we need to
	     merely be able to observe this fact and not have to observe
	     something else as well.
	     ??? Would it help to spin for a little while to see whether the
	     current G1 gets closed?  This might be worthwhile if the group is
	     small or close to being closed.  */
	  unsigned int s = atomic_load_relaxed (cond->__data.__g_signals + g);
	  while (__condvar_load_g1_start_relaxed (cond) == g1_start)
	    {
	      /* Try to add a signal.  We don't need to acquire the lock
		 because at worst we can cause a spurious wake-up.  If the
		 group is in the process of being closed (LSB is true), this
		 has an effect similar to us adding a signal.  */
	      if (((s & 1) != 0)
		  || atomic_compare_exchange_weak_relaxed
		       (cond->__data.__g_signals + g, &s, s + 2))
		{
		  /* If we added a signal, we also need to add a wake-up on
		     the futex.  We also need to do that if we skipped adding
		     a signal because the group is being closed, because
		     while __condvar_quiesce_and_switch_g1 could have closed
		     the group, it might still be waiting for futex waiters to
		     leave (and one of those waiters might be the one we stole
		     the signal from, which caused it to block using the
		     futex).  */
		  futex_wake (cond->__data.__g_signals + g, 1, private);
		  break;
		}
	      /* TODO Back off.  */
	    }
	}
    }

 done:

  /* Confirm that we have been woken.  We do that before acquiring the mutex
     to allow for execution of pthread_cond_destroy while having acquired the
     mutex.  */
  __condvar_confirm_wakeup (cond, private);

  /* Woken up; now re-acquire the mutex.  If this doesn't fail, return RESULT,
     which is set to ETIMEDOUT if a timeout occurred, or zero otherwise.  */
  err = __pthread_mutex_cond_lock (mutex);
  /* XXX Abort on errors that are disallowed by POSIX?  */
  return (err != 0) ? err : result;
}


/* See __pthread_cond_wait_common.  */
int
___pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex)
{
  /* clockid is unused when abstime is NULL. */
  return __pthread_cond_wait_common (cond, mutex, 0, NULL);
}

versioned_symbol (libc, ___pthread_cond_wait, pthread_cond_wait,
		  GLIBC_2_3_2);
libc_hidden_ver (___pthread_cond_wait, __pthread_cond_wait)
#ifndef SHARED
strong_alias (___pthread_cond_wait, __pthread_cond_wait)
#endif

/* See __pthread_cond_wait_common.  */
int
___pthread_cond_timedwait64 (pthread_cond_t *cond, pthread_mutex_t *mutex,
			     const struct __timespec64 *abstime)
{
  /* Check parameter validity.  This should also tell the compiler that
     it can assume that abstime is not NULL.  */
  if (! valid_nanoseconds (abstime->tv_nsec))
    return EINVAL;

  /* Relaxed MO is sufficient because the clock ID bit is only modified
     at condvar creation.  */
  unsigned int flags = atomic_load_relaxed (&cond->__data.__wrefs);
  clockid_t clockid = (flags & __PTHREAD_COND_CLOCK_MONOTONIC_MASK)
                    ? CLOCK_MONOTONIC : CLOCK_REALTIME;
  return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
}

#if __TIMESIZE == 64
strong_alias (___pthread_cond_timedwait64, ___pthread_cond_timedwait)
#else
strong_alias (___pthread_cond_timedwait64, __pthread_cond_timedwait64)
libc_hidden_def (__pthread_cond_timedwait64)

int
___pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
			    const struct timespec *abstime)
{
  struct __timespec64 ts64 = valid_timespec_to_timespec64 (*abstime);

  return __pthread_cond_timedwait64 (cond, mutex, &ts64);
}
#endif /* __TIMESIZE == 64 */
versioned_symbol (libc, ___pthread_cond_timedwait,
		  pthread_cond_timedwait, GLIBC_2_3_2);
libc_hidden_ver (___pthread_cond_timedwait, __pthread_cond_timedwait)
#ifndef SHARED
strong_alias (___pthread_cond_timedwait, __pthread_cond_timedwait)
#endif

/* See __pthread_cond_wait_common.  */
int
___pthread_cond_clockwait64 (pthread_cond_t *cond, pthread_mutex_t *mutex,
			      clockid_t clockid,
			      const struct __timespec64 *abstime)
{
  /* Check parameter validity.  This should also tell the compiler that
     it can assume that abstime is not NULL.  */
  if (! valid_nanoseconds (abstime->tv_nsec))
    return EINVAL;

  if (!futex_abstimed_supported_clockid (clockid))
    return EINVAL;

  return __pthread_cond_wait_common (cond, mutex, clockid, abstime);
}

#if __TIMESIZE == 64
strong_alias (___pthread_cond_clockwait64, ___pthread_cond_clockwait)
#else
strong_alias (___pthread_cond_clockwait64, __pthread_cond_clockwait64);
libc_hidden_def (__pthread_cond_clockwait64)

int
___pthread_cond_clockwait (pthread_cond_t *cond, pthread_mutex_t *mutex,
                          clockid_t clockid,
                          const struct timespec *abstime)
{
  struct __timespec64 ts64 = valid_timespec_to_timespec64 (*abstime);

  return __pthread_cond_clockwait64 (cond, mutex, clockid, &ts64);
}
#endif /* __TIMESIZE == 64 */
libc_hidden_ver (___pthread_cond_clockwait, __pthread_cond_clockwait)
#ifndef SHARED
strong_alias (___pthread_cond_clockwait, __pthread_cond_clockwait)
#endif
versioned_symbol (libc, ___pthread_cond_clockwait,
		  pthread_cond_clockwait, GLIBC_2_34);
#if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_30, GLIBC_2_34)
compat_symbol (libpthread, ___pthread_cond_clockwait,
	       pthread_cond_clockwait, GLIBC_2_30);
#endif