/* POSIX reader--writer lock: core parts.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <errno.h>
#include <sysdep.h>
#include <pthread.h>
#include <pthreadP.h>
#include <sys/time.h>
#include <stap-probe.h>
#include <atomic.h>
#include <futex-internal.h>
#include <time.h>


/* A reader--writer lock that fulfills the POSIX requirements (but operations
   on this lock are not necessarily full barriers, as one may interpret the
   POSIX requirement about "synchronizing memory").  All critical sections are
   in a total order, writers synchronize with prior writers and readers, and
   readers synchronize with prior writers.

   A thread is allowed to acquire a read lock recursively (i.e., have rdlock
   critical sections that overlap in sequenced-before) unless the kind of the
   rwlock is set to PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP.

   This lock is built so that workloads of mostly readers can be executed with
   low runtime overheads.  This matches the default kind of the lock,
   PTHREAD_RWLOCK_PREFER_READER_NP.  Acquiring a read lock requires a single
   atomic addition if the lock is or was previously acquired by other
   readers; releasing the lock is a single CAS if there are no concurrent
   writers.
   Workloads consisting of mostly writers are of secondary importance.
   An uncontended write lock acquisition is as fast as for a normal
   exclusive mutex, but writer contention is somewhat more costly due to
   keeping track of the exact number of writers.  If the rwlock kind requests
   writers to be preferred (i.e., PTHREAD_RWLOCK_PREFER_WRITER_NP or the
   non-recursive-readers variant of it), then writer-to-writer lock ownership
   hand-over is fairly fast and bypasses lock acquisition attempts by readers.
   The costs of lock ownership transfer between readers and writers vary.  If
   the program asserts that there are no recursive readers and writers are
   preferred, then write lock acquisition attempts will block subsequent read
   lock acquisition attempts, so that new incoming readers do not prolong a
   phase in which readers have acquired the lock.

   The main components of the rwlock are a writer-only lock that allows only
   one of the concurrent writers to be the primary writer, and a
   single-writer-multiple-readers lock that decides between read phases, in
   which readers have acquired the rwlock, and write phases in which a primary
   writer or a sequence of different primary writers have acquired the rwlock.

   The single-writer-multiple-readers lock is the central piece of state
   describing the rwlock and is encoded in the __readers field (see below for
   a detailed explanation):

   State WP  WL  R   RW  Notes
   ---------------------------
   #1    0   0   0   0   Lock is idle (and in a read phase).
   #2    0   0   >0  0   Readers have acquired the lock.
   #3    0   1   0   0   Lock is not acquired; a writer will try to start a
                         write phase.
   #4    0   1   >0  0   Readers have acquired the lock; a writer is waiting
                         and explicit hand-over to the writer is required.
   #4a   0   1   >0  1   Same as #4 except that there are further readers
                         waiting because the writer is to be preferred.
   #5    1   0   0   0   Lock is idle (and in a write phase).
   #6    1   0   >0  0   Write phase; readers will try to start a read phase
                         (requires explicit hand-over to all readers that
                         do not start the read phase).
   #7    1   1   0   0   Lock is acquired by a writer.
   #8    1   1   >0  0   Lock acquired by a writer and readers are waiting;
                         explicit hand-over to the readers is required.

   WP (PTHREAD_RWLOCK_WRPHASE) is true if the lock is in a write phase, so
   potentially acquired by a primary writer.
   WL (PTHREAD_RWLOCK_WRLOCKED) is true if there is a primary writer (i.e.,
   the thread that was able to set this bit from false to true).
   R (all bits in __readers except the number of least-significant bits
   denoted in PTHREAD_RWLOCK_READER_SHIFT) is the number of readers that have
   acquired or are trying to acquire the lock.  There may be more readers
   waiting if writers are preferred and there will be no recursive readers,
   in which case RW (PTHREAD_RWLOCK_RWAITING) is true in state #4a.
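
   For illustration only (a sketch, not code in this file), a snapshot of
   __readers decomposes into the fields of the table above as follows:

     unsigned int r = atomic_load_relaxed (&rwlock->__data.__readers);
     bool wp = (r & PTHREAD_RWLOCK_WRPHASE) != 0;              WP
     bool wl = (r & PTHREAD_RWLOCK_WRLOCKED) != 0;             WL
     bool rw = (r & PTHREAD_RWLOCK_RWAITING) != 0;             RW
     unsigned int nreaders = r >> PTHREAD_RWLOCK_READER_SHIFT; R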

   We want to block using futexes, but using __readers as a futex word
   directly is not a good solution.  First, we want to wait on different
   conditions such as waiting for a phase change vs. waiting for the primary
   writer to release the writer-only lock.  Second, the number of readers
   could change frequently, which would make it likely that a writer's
   futex_wait fails frequently too because the expected value does not match
   the value of __readers anymore.
   Therefore, we split out the futex words into the __wrphase_futex and
   __writers_futex fields.  The former tracks the value of the WP bit and is
   changed after changing WP by the thread that changes WP.  However, because
   of the POSIX requirements regarding mutex/rwlock destruction (i.e., that
   destroying a rwlock is allowed as soon as no thread has acquired or will
   acquire the lock), we have to be careful and hand over lock ownership (via
   a phase change) carefully to those threads waiting.  Specifically, we must
   prevent a situation in which we are not quite sure whether we still have
   to unblock another thread through a change to memory (executing a
   futex_wake on a former futex word that is now used for something else is
   fine).
   The scheme we use for __wrphase_futex is that waiting threads that may
   use the futex word to block now all have to use the futex word to block; it
   is not allowed to take the short-cut and spin-wait on __readers because
   then the waking thread cannot just make one final change to memory to
   unblock all potentially waiting threads.  If, for example, a reader
   increments R in states #7 or #8, it has to then block until __wrphase_futex
   is 0 and it can confirm that the value of 0 was stored by the primary
   writer; in turn, the primary writer has to change to a read phase too when
   releasing WL (i.e., to state #2), and it must change __wrphase_futex to 0
   as the next step.  This ensures that the waiting reader will not be able to
   acquire, release, and then destroy the lock concurrently with the pending
   futex unblock operations by the former primary writer.  This scheme is
   called explicit hand-over in what follows.
   Note that waiting threads can cancel waiting only if explicit hand-over has
   not yet started (e.g., if __readers is still in states #7 or #8 in the
   example above).
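
   As a simplified sketch of explicit hand-over (error handling and the
   PTHREAD_RWLOCK_FUTEX_USED flag omitted; see below), the reader and the
   former primary writer perform roughly these steps:

     Reader arriving in state #7 or #8:
       atomic_fetch_add_acquire (&__readers,
                                 1 << PTHREAD_RWLOCK_READER_SHIFT);
       while (atomic_load_relaxed (&__wrphase_futex) != 0)
         futex_wait (&__wrphase_futex, 1);

     Former primary writer releasing the lock:
       CAS __readers from state #8 to state #2 (clear WP and WL);
       atomic_exchange_relaxed (&__wrphase_futex, 0);
       futex_wake (&__wrphase_futex, INT_MAX);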

   Writers determine the primary writer through WL.  Blocking using futexes
   is performed using __writers_futex as a futex word; primary writers will
   enable waiting on this futex by setting it to 1 after they acquired the WL
   bit and will disable waiting by setting it to 0 before they release WL.
   This leaves small windows where blocking using futexes is not possible
   although a primary writer exists, but in turn decreases complexity of the
   writer--writer synchronization and does not affect correctness.
   If writers are preferred, writers can hand over WL directly to other
   waiting writers that registered by incrementing __writers:  If the primary
   writer can CAS __writers from a non-zero value to the same value with the
   PTHREAD_RWLOCK_WRHANDOVER bit set, it effectively transfers WL ownership
   to one of the registered waiting writers and does not reset WL; in turn,
   a registered writer that can clear PTHREAD_RWLOCK_WRHANDOVER using a CAS
   then takes over WL.  Note that registered waiting writers can cancel
   waiting by decrementing __writers, but the last writer to unregister must
   become the primary writer if PTHREAD_RWLOCK_WRHANDOVER is set.
   Also note that adding another state/bit to signal potential writer--writer
   contention (e.g., as done in the normal mutex algorithm) would not be
   helpful because we would have to conservatively assume that there is in
   fact no other writer, and wake up readers too.
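
   A sketch of this writer--writer hand-over (simplified; retry loops and
   memory-order details omitted), where w is a snapshot of __writers:

     Releasing primary writer (writers preferred, w != 0):
       if (atomic_compare_exchange_weak_release
           (&__writers, &w, w | PTHREAD_RWLOCK_WRHANDOVER))
         return;   WL stays set; a registered writer takes over.

     Registered waiting writer (observed WRHANDOVER set):
       if (atomic_compare_exchange_weak_acquire
           (&__writers, &w, w - PTHREAD_RWLOCK_WRHANDOVER - 1))
         we now own WL and have unregistered in the same step.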

   To avoid having to call futex_wake when no thread uses __wrphase_futex or
   __writers_futex, threads will set the PTHREAD_RWLOCK_FUTEX_USED bit in the
   respective futex words before waiting on it (using a CAS so it will only be
   set if in a state in which waiting would be possible).  In the case of
   __writers_futex, we wake only one thread but several threads may share
   PTHREAD_RWLOCK_FUTEX_USED, so we must assume that there are still others.
   This is similar to what we do in pthread_mutex_lock.  We do not need to
   do this for __wrphase_futex because there, we always wake all waiting
   threads.
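
   As a sketch of this pattern (simplified; f is a snapshot of the futex
   word, and 1 is the value on which waiting is possible):

     Waiter:
       if ((f & PTHREAD_RWLOCK_FUTEX_USED) == 0
           && !atomic_compare_exchange_weak_relaxed
                (&futex_word, &f, f | PTHREAD_RWLOCK_FUTEX_USED))
         retry;
       futex_wait (&futex_word, 1 | PTHREAD_RWLOCK_FUTEX_USED);

     Waker:
       if ((atomic_exchange_relaxed (&futex_word, 0)
            & PTHREAD_RWLOCK_FUTEX_USED) != 0)
         futex_wake (&futex_word, ...);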

   Blocking in state #4a simply uses __readers as the futex word.  This
   simplifies the algorithm but suffers from some of the drawbacks discussed
   before, though not to the same extent because R can only decrease in this
   state, so the number of potentially failing futex_wait attempts will be
   bounded.  All threads moving from state #4a to another state must wake
   up threads blocked on the __readers futex.

   The ordering invariants that we have to take care of in the implementation
   are primarily those necessary for a reader--writer lock; this is rather
   straightforward and happens during write/read phase switching (potentially
   through explicit hand-over), and between writers through synchronization
   involving the PTHREAD_RWLOCK_WRLOCKED or PTHREAD_RWLOCK_WRHANDOVER bits.
   Additionally, we need to take care that modifications of __writers_futex
   and __wrphase_futex (e.g., by otherwise unordered readers) take place in
   the writer critical sections or read/write phases, respectively, and that
   explicit hand-over observes stores from the previous phase.  How this is
   done is explained in more detail in comments in the code.

   Many of the accesses to the futex words just need relaxed MO.  This is
   possible because we essentially drive both the core rwlock synchronization
   and the futex synchronization in parallel.  For example, an unlock will
   unlock the rwlock and take part in the futex synchronization (using
   PTHREAD_RWLOCK_FUTEX_USED, see above); even if they are not tightly
   ordered in some way, the futex synchronization ensures that there are no
   lost wake-ups, and woken threads will then eventually see the most recent
   state of the rwlock.  IOW, waiting threads will always be woken up, while
   not being able to wait using futexes (which can happen) is harmless; in
   turn, this means that waiting threads don't need special ordering wrt.
   waking threads.

   The futex synchronization consists of the three-state futex word:
   (1) cannot block on it, (2) can block on it, and (3) there might be a
   thread blocked on it (i.e., with PTHREAD_RWLOCK_FUTEX_USED set).
   Relaxed-MO atomic read-modify-write operations are sufficient to maintain
   this (e.g., using a CAS to go from (2) to (3) but not from (1) to (3)),
   but we need ordering of the futex word modifications by the waking threads
   so that they collectively make correct state changes between (1)-(3).
   The futex-internal synchronization (i.e., the conceptual critical sections
   around futex operations in the kernel) then ensures that even an
   unconstrained load (i.e., relaxed MO) inside of futex_wait will not lead to
   lost wake-ups because either the waiting thread will see the change from
   (3) to (1) when a futex_wake came first, or this futex_wake will wake this
   waiting thread because the waiting thread came first.
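
   As one concrete instance of these three states (an illustrative summary,
   for a reader waiting on __wrphase_futex for a read phase):

     (1) cannot block:                __wrphase_futex == 0
     (2) can block:                   __wrphase_futex == 1
     (3) might have blocked waiters:  __wrphase_futex
                                        == (1 | PTHREAD_RWLOCK_FUTEX_USED)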


   POSIX allows but does not require rwlock acquisitions to be a cancellation
   point.  We do not support cancellation.

   TODO We do not try to elide any read or write lock acquisitions currently.
   While this would be possible, it is unclear whether HTM performance is
   currently predictable enough and our runtime tuning is good enough at
   deciding when to use elision so that enabling it would lead to consistently
   better performance.  */

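/* Illustrative usage of the POSIX interface built on these core parts (a
   hedged sketch, not part of this file; error checking abbreviated):

     #include <pthread.h>

     static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
     static int shared_value;

     static void *
     reader_thread (void *arg)
     {
       pthread_rwlock_rdlock (&lock);
       int v = shared_value;           observe state under the read lock
       pthread_rwlock_unlock (&lock);
       return (void *) (long) v;
     }

     static void *
     writer_thread (void *arg)
     {
       pthread_rwlock_wrlock (&lock);
       shared_value++;                 modify state under the write lock
       pthread_rwlock_unlock (&lock);
       return NULL;
     }  */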
217 
static int
__pthread_rwlock_get_private (pthread_rwlock_t *rwlock)
{
  return rwlock->__data.__shared != 0 ? FUTEX_SHARED : FUTEX_PRIVATE;
}

static __always_inline void
__pthread_rwlock_rdunlock (pthread_rwlock_t *rwlock)
{
  int private = __pthread_rwlock_get_private (rwlock);
  /* We decrease the number of readers, and if we are the last reader and
     there is a primary writer, we start a write phase.  We use a CAS to
     make this atomic so that it is clear whether we must hand over ownership
     explicitly.  */
  unsigned int r = atomic_load_relaxed (&rwlock->__data.__readers);
  unsigned int rnew;
  for (;;)
    {
      rnew = r - (1 << PTHREAD_RWLOCK_READER_SHIFT);
      /* If we are the last reader, we also need to unblock any readers
         that are waiting for a writer to go first (PTHREAD_RWLOCK_RWAITING)
         so that they can register while the writer is active.  */
      if ((rnew >> PTHREAD_RWLOCK_READER_SHIFT) == 0)
        {
          if ((rnew & PTHREAD_RWLOCK_WRLOCKED) != 0)
            rnew |= PTHREAD_RWLOCK_WRPHASE;
          rnew &= ~(unsigned int) PTHREAD_RWLOCK_RWAITING;
        }
      /* We need release MO here for three reasons.  First, so that we
         synchronize with subsequent writers.  Second, we might have been the
         first reader and set __wrphase_futex to 0, so we need to synchronize
         with the last reader that will set it to 1 (note that we will always
         change __readers before the last reader, or we are the last reader).
         Third, a writer that takes part in explicit hand-over needs to see
         the first reader's store to __wrphase_futex (or a later value) if
         the writer observes that a write phase has been started.  */
      if (atomic_compare_exchange_weak_release (&rwlock->__data.__readers,
						&r, rnew))
	break;
      /* TODO Back-off.  */
    }
  if ((rnew & PTHREAD_RWLOCK_WRPHASE) != 0)
    {
      /* We need to do explicit hand-over.  We need the acquire MO fence so
         that our modification of __wrphase_futex happens after a store by
         another reader that started a read phase.  Relaxed MO is sufficient
         for the modification of __wrphase_futex because it is just used
         to delay acquisition by a writer until all threads are unblocked
         irrespective of whether they are looking at __readers or
         __wrphase_futex; any other synchronizes-with relations that are
         necessary are established through __readers.  */
      atomic_thread_fence_acquire ();
      if ((atomic_exchange_relaxed (&rwlock->__data.__wrphase_futex, 1)
	   & PTHREAD_RWLOCK_FUTEX_USED) != 0)
	futex_wake (&rwlock->__data.__wrphase_futex, INT_MAX, private);
    }
  /* Also wake up waiting readers if we did reset the RWAITING flag.  */
  if ((r & PTHREAD_RWLOCK_RWAITING) != (rnew & PTHREAD_RWLOCK_RWAITING))
    futex_wake (&rwlock->__data.__readers, INT_MAX, private);
}


static __always_inline int
__pthread_rwlock_rdlock_full64 (pthread_rwlock_t *rwlock, clockid_t clockid,
                                const struct __timespec64 *abstime)
{
  unsigned int r;

  /* Make sure any passed in clockid and timeout value are valid.  Note that
     the previous implementation assumed that this check *must* not be
     performed if there would in fact be no blocking; however, POSIX only
     requires that "the validity of the abstime parameter need not be checked
     if the lock can be immediately acquired" (i.e., we need not but may check
     it).  */
  if (abstime && __glibc_unlikely (!futex_abstimed_supported_clockid (clockid)
      || ! valid_nanoseconds (abstime->tv_nsec)))
    return EINVAL;

  /* Make sure we are not holding the rwlock as a writer.  This is a deadlock
     situation we recognize and report.  */
  if (__glibc_unlikely (atomic_load_relaxed (&rwlock->__data.__cur_writer)
			== THREAD_GETMEM (THREAD_SELF, tid)))
    return EDEADLK;

  /* If we prefer writers, recursive rdlock is disallowed, we are in a read
     phase, and there are other readers present, we try to wait without
     extending the read phase.  We will be unblocked by either one of the
     other active readers, or if the writer gives up WRLOCKED (e.g., on
     timeout).
     If there are no other readers, we simply race with any existing primary
     writer; it would have been a race anyway, and changing the odds slightly
     will likely not make a big difference.  */
  if (rwlock->__data.__flags == PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP)
    {
      r = atomic_load_relaxed (&rwlock->__data.__readers);
      while ((r & PTHREAD_RWLOCK_WRPHASE) == 0
	     && (r & PTHREAD_RWLOCK_WRLOCKED) != 0
	     && (r >> PTHREAD_RWLOCK_READER_SHIFT) > 0)
	{
	  /* TODO Spin first.  */
	  /* Try setting the flag signaling that we are waiting without having
	     incremented the number of readers.  Relaxed MO is fine because
	     this is just about waiting for a state change in __readers.  */
	  if (atomic_compare_exchange_weak_relaxed
	      (&rwlock->__data.__readers, &r, r | PTHREAD_RWLOCK_RWAITING))
	    {
	      /* Wait for as long as the flag is set.  An ABA situation is
		 harmless because the flag is just about the state of
		 __readers, and all threads set the flag under the same
		 conditions.  */
	      while (((r = atomic_load_relaxed (&rwlock->__data.__readers))
		      & PTHREAD_RWLOCK_RWAITING) != 0)
		{
		  int private = __pthread_rwlock_get_private (rwlock);
		  int err = __futex_abstimed_wait64 (&rwlock->__data.__readers,
		                                     r, clockid, abstime,
		                                     private);
		  /* We ignore EAGAIN and EINTR.  On time-outs, we can just
		     return because we don't need to clean up anything.  */
		  if (err == ETIMEDOUT || err == EOVERFLOW)
		    return err;
		}
	      /* It makes sense to not break out of the outer loop here
		 because we might be in the same situation again.  */
	    }
	  else
	    {
	      /* TODO Back-off.  */
	    }
	}
    }
  /* Register as a reader, using an add-and-fetch so that R can be used as
     expected value for future operations.  Acquire MO so we synchronize with
     prior writers as well as the last reader of the previous read phase (see
     below).  */
  r = (atomic_fetch_add_acquire (&rwlock->__data.__readers,
				 (1 << PTHREAD_RWLOCK_READER_SHIFT))
       + (1 << PTHREAD_RWLOCK_READER_SHIFT));

  /* Check whether there is an overflow in the number of readers.  We assume
     that the total number of threads is less than half the maximum number
     of readers that we have bits for in __readers (i.e., with 32-bit int and
     PTHREAD_RWLOCK_READER_SHIFT of 3, we assume there are less than
     1 << (32-3-1) concurrent threads).
     If there is an overflow, we use a CAS to try to decrement the number of
     readers if there still is an overflow situation.  If so, we return
     EAGAIN; if not, we are not a thread causing an overflow situation, and so
     we just continue.  Using a fetch-add instead of the CAS isn't possible
     because other readers might release the lock concurrently, which could
     make us the last reader and thus responsible for handing ownership over
     to writers (which requires a CAS too to make the decrement and ownership
     transfer indivisible).  */
  while (__glibc_unlikely (r >= PTHREAD_RWLOCK_READER_OVERFLOW))
    {
      /* Relaxed MO is okay because we just want to undo our registration and
	 cannot have changed the rwlock state substantially if the CAS
	 succeeds.  */
      if (atomic_compare_exchange_weak_relaxed
	  (&rwlock->__data.__readers,
	   &r, r - (1 << PTHREAD_RWLOCK_READER_SHIFT)))
	return EAGAIN;
    }

  /* We have registered as a reader, so if we are in a read phase, we have
     acquired a read lock.  This is also the reader--reader fast-path.
     Even if there is a primary writer, we just return.  If writers are to
     be preferred and we are the only active reader, we could try to enter a
     write phase to let the writer proceed.  This would be okay because we
     cannot have acquired the lock previously as a reader (which could result
     in deadlock if we would wait for the primary writer to run).  However,
     this seems to be a corner case and handling it specially would not be
     worth the complexity.  */
  if (__glibc_likely ((r & PTHREAD_RWLOCK_WRPHASE) == 0))
    return 0;
  /* Otherwise, if we were in a write phase (states #6 or #8), we must wait
     for explicit hand-over of the read phase; the only exception is if we
     can start a read phase if there is no primary writer currently.  */
  while ((r & PTHREAD_RWLOCK_WRPHASE) != 0
	 && (r & PTHREAD_RWLOCK_WRLOCKED) == 0)
    {
      /* Try to enter a read phase: If the CAS below succeeds, we have
	 ownership; if it fails, we will simply retry and reassess the
	 situation.
	 Acquire MO so we synchronize with prior writers.  */
      if (atomic_compare_exchange_weak_acquire (&rwlock->__data.__readers, &r,
						r ^ PTHREAD_RWLOCK_WRPHASE))
	{
	  /* We started the read phase, so we are also responsible for
	     updating the write-phase futex.  Relaxed MO is sufficient.
	     We have to do the same steps as a writer would when handing
	     over the read phase to us because other readers cannot
	     distinguish between us and the writer; this includes
	     explicit hand-over and potentially having to wake other readers
	     (but we can pretend to do the setting and unsetting of WRLOCKED
	     atomically, and thus can skip this step).  */
	  if ((atomic_exchange_relaxed (&rwlock->__data.__wrphase_futex, 0)
	       & PTHREAD_RWLOCK_FUTEX_USED) != 0)
	    {
	      int private = __pthread_rwlock_get_private (rwlock);
	      futex_wake (&rwlock->__data.__wrphase_futex, INT_MAX, private);
	    }
	  return 0;
	}
      else
	{
	  /* TODO Back off before retrying.  Also see above.  */
	}
    }

  /* We were in a write phase but did not install the read phase.  We cannot
     distinguish between a writer and another reader starting the read phase,
     so we must wait for explicit hand-over via __wrphase_futex.
     However, __wrphase_futex might not have been set to 1 yet (either
     because explicit hand-over to the writer is still ongoing, or because
     the writer has started the write phase but has not yet updated
     __wrphase_futex).  The least recent value of __wrphase_futex we can
     read from here is the modification of the last read phase (because
     we synchronize with the last reader in this read phase through
     __readers; see the use of acquire MO on the fetch_add above).
     Therefore, if we observe a value of 0 for __wrphase_futex, we need
     to subsequently check that __readers now indicates a read phase; we
     need to use acquire MO for this so that if we observe a read phase,
     we will also see the modification of __wrphase_futex by the previous
     writer.  We then need to load __wrphase_futex again and continue to
     wait if it is not 0, so that we do not skip explicit hand-over.
     Relaxed MO is sufficient for the load from __wrphase_futex because
     we just use it as an indicator for when we can proceed; we use
     __readers and the acquire MO accesses to it to eventually read from
     the proper stores to __wrphase_futex.  */
  unsigned int wpf;
  bool ready = false;
  for (;;)
    {
      while (((wpf = atomic_load_relaxed (&rwlock->__data.__wrphase_futex))
	      | PTHREAD_RWLOCK_FUTEX_USED) == (1 | PTHREAD_RWLOCK_FUTEX_USED))
	{
	  int private = __pthread_rwlock_get_private (rwlock);
	  if (((wpf & PTHREAD_RWLOCK_FUTEX_USED) == 0)
	      && (!atomic_compare_exchange_weak_relaxed
		  (&rwlock->__data.__wrphase_futex,
		   &wpf, wpf | PTHREAD_RWLOCK_FUTEX_USED)))
	    continue;
	  int err = __futex_abstimed_wait64 (&rwlock->__data.__wrphase_futex,
					     1 | PTHREAD_RWLOCK_FUTEX_USED,
					     clockid, abstime, private);
	  if (err == ETIMEDOUT || err == EOVERFLOW)
	    {
	      /* If we timed out, we need to unregister.  If no read phase
		 has been installed while we waited, we can just decrement
		 the number of readers.  Otherwise, we just acquire the
		 lock, which is allowed because we give no precise timing
		 guarantees, and because the timeout is only required to
		 be in effect if we would have had to wait for other
		 threads (e.g., if futex_wait would time-out immediately
		 because the given absolute time is in the past).  */
	      r = atomic_load_relaxed (&rwlock->__data.__readers);
	      while ((r & PTHREAD_RWLOCK_WRPHASE) != 0)
		{
		  /* We don't need to make anything else visible to
		     others besides unregistering, so relaxed MO is
		     sufficient.  */
		  if (atomic_compare_exchange_weak_relaxed
		      (&rwlock->__data.__readers, &r,
		       r - (1 << PTHREAD_RWLOCK_READER_SHIFT)))
		    return err;
		  /* TODO Back-off.  */
		}
	      /* Use the acquire MO fence to mirror the steps taken in the
		 non-timeout case.  Note that the read can happen both
		 in the atomic_load above as well as in the failure case
		 of the CAS operation.  */
	      atomic_thread_fence_acquire ();
	      /* We still need to wait for explicit hand-over, but we must
		 not use futex_wait anymore because we would just time out
		 in this case and thus make the spin-waiting we need
		 unnecessarily expensive.  */
	      while ((atomic_load_relaxed (&rwlock->__data.__wrphase_futex)
		      | PTHREAD_RWLOCK_FUTEX_USED)
		     == (1 | PTHREAD_RWLOCK_FUTEX_USED))
		{
		  /* TODO Back-off?  */
		}
	      ready = true;
	      break;
	    }
	  /* If we got interrupted (EINTR) or the futex word does not have the
	     expected value (EAGAIN), retry.  */
	}
      if (ready)
	/* See below.  */
	break;
      /* We need acquire MO here so that we synchronize with the lock
	 release of the writer, and so that we observe a recent value of
	 __wrphase_futex (see below).  */
      if ((atomic_load_acquire (&rwlock->__data.__readers)
	   & PTHREAD_RWLOCK_WRPHASE) == 0)
	/* We are in a read phase now, so the least recent modification of
	   __wrphase_futex we can read from is the store by the writer
	   with value 1.  Thus, only now we can assume that if we observe
	   a value of 0, explicit hand-over is finished.  Retry the loop
	   above one more time.  */
	ready = true;
    }

  return 0;
}


static __always_inline void
__pthread_rwlock_wrunlock (pthread_rwlock_t *rwlock)
{
  int private = __pthread_rwlock_get_private (rwlock);

  atomic_store_relaxed (&rwlock->__data.__cur_writer, 0);
  /* Disable waiting by writers.  We will wake up after we decided how to
     proceed.  */
  bool wake_writers
    = ((atomic_exchange_relaxed (&rwlock->__data.__writers_futex, 0)
	& PTHREAD_RWLOCK_FUTEX_USED) != 0);

  if (rwlock->__data.__flags != PTHREAD_RWLOCK_PREFER_READER_NP)
    {
      /* First, try to hand over to another writer.  */
      unsigned int w = atomic_load_relaxed (&rwlock->__data.__writers);
      while (w != 0)
	{
	  /* Release MO so that another writer that gets WRLOCKED from us will
	     synchronize with us and thus can take over our view of
	     __readers (including, for example, whether we are in a write
	     phase or not).  */
	  if (atomic_compare_exchange_weak_release
	      (&rwlock->__data.__writers, &w, w | PTHREAD_RWLOCK_WRHANDOVER))
	    /* Another writer will take over.  */
	    goto done;
	  /* TODO Back-off.  */
	}
    }

  /* We have done everything we needed to do to prefer writers, so now we
     either hand over explicitly to readers if there are any, or we simply
     stay in a write phase.  See pthread_rwlock_rdunlock for more details.  */
  unsigned int r = atomic_load_relaxed (&rwlock->__data.__readers);
  /* Release MO so that subsequent readers or writers synchronize with us.  */
  while (!atomic_compare_exchange_weak_release
	 (&rwlock->__data.__readers, &r,
	  ((r ^ PTHREAD_RWLOCK_WRLOCKED)
	   ^ ((r >> PTHREAD_RWLOCK_READER_SHIFT) == 0 ? 0
	      : PTHREAD_RWLOCK_WRPHASE))))
    {
      /* TODO Back-off.  */
    }
  if ((r >> PTHREAD_RWLOCK_READER_SHIFT) != 0)
    {
      /* We must hand over explicitly through __wrphase_futex.  Relaxed MO is
	 sufficient because it is just used to delay acquisition by a writer;
	 any other synchronizes-with relations that are necessary are
	 established through __readers.  */
      if ((atomic_exchange_relaxed (&rwlock->__data.__wrphase_futex, 0)
	   & PTHREAD_RWLOCK_FUTEX_USED) != 0)
	futex_wake (&rwlock->__data.__wrphase_futex, INT_MAX, private);
    }

 done:
  /* We released WRLOCKED in some way, so wake a writer.  */
  if (wake_writers)
    futex_wake (&rwlock->__data.__writers_futex, 1, private);
}


static __always_inline int
__pthread_rwlock_wrlock_full64 (pthread_rwlock_t *rwlock, clockid_t clockid,
                                const struct __timespec64 *abstime)
{
  /* Make sure any passed in clockid and timeout value are valid.  Note that
     the previous implementation assumed that this check *must* not be
     performed if there would in fact be no blocking; however, POSIX only
     requires that "the validity of the abstime parameter need not be checked
     if the lock can be immediately acquired" (i.e., we need not but may check
     it).  */
  if (abstime && __glibc_unlikely (!futex_abstimed_supported_clockid (clockid)
      || ! valid_nanoseconds (abstime->tv_nsec)))
    return EINVAL;

  /* Make sure we are not holding the rwlock as a writer.  This is a deadlock
     situation we recognize and report.  */
  if (__glibc_unlikely (atomic_load_relaxed (&rwlock->__data.__cur_writer)
			== THREAD_GETMEM (THREAD_SELF, tid)))
    return EDEADLK;

  /* First we try to acquire the role of primary writer by setting WRLOCKED;
     if it was set before, there already is a primary writer.  Acquire MO so
     that we synchronize with previous primary writers.

     We do not try to change to a write phase right away using a fetch_or
     because we would have to reset it again and wake readers if there are
     readers present (some readers could try to acquire the lock more than
     once, so setting a write phase in the middle of this could cause
     deadlock).  Changing to a write phase eagerly would only speed up the
     transition from a read phase to a write phase in the uncontended case,
     but it would slow down the contended case if readers are preferred (which
     is the default).
     We could try to CAS from a state with no readers to a write phase, but
     this could be less scalable if readers arrive and leave frequently.  */
  bool may_share_futex_used_flag = false;
  unsigned int r = atomic_fetch_or_acquire (&rwlock->__data.__readers,
					    PTHREAD_RWLOCK_WRLOCKED);
  if (__glibc_unlikely ((r & PTHREAD_RWLOCK_WRLOCKED) != 0))
    {
      /* There is another primary writer.  */
      bool prefer_writer
	= (rwlock->__data.__flags != PTHREAD_RWLOCK_PREFER_READER_NP);
      if (prefer_writer)
	{
	  /* We register as a waiting writer, so that we can make use of
	     writer--writer hand-over.  Relaxed MO is fine because we just
	     want to register.  We assume that the maximum number of threads
	     is less than the capacity in __writers.  */
	  atomic_fetch_add_relaxed (&rwlock->__data.__writers, 1);
	}
      for (;;)
	{
	  /* TODO Spin until WRLOCKED is 0 before trying the CAS below.
	     But pay attention to not delay trying writer--writer hand-over
	     for too long (which we must try eventually anyway).  */
	  if ((r & PTHREAD_RWLOCK_WRLOCKED) == 0)
	    {
	      /* Try to become the primary writer or retry.  Acquire MO as in
		 the fetch_or above.  */
	      if (atomic_compare_exchange_weak_acquire
		  (&rwlock->__data.__readers, &r, r | PTHREAD_RWLOCK_WRLOCKED))
		{
		  if (prefer_writer)
		    {
		      /* Unregister as a waiting writer.  Note that because we
			 acquired WRLOCKED, WRHANDOVER will not be set.
			 Acquire MO on the CAS above ensures that
			 unregistering happens after the previous writer;
			 this sorts the accesses to __writers by all
			 primary writers in a useful way (e.g., any other
			 primary writer acquiring after us or getting it from
			 us through WRHANDOVER will see both our changes to
			 __writers).
			 ??? Perhaps this is not strictly necessary for
			 reasons we do not yet know of.  */
		      atomic_fetch_add_relaxed (&rwlock->__data.__writers, -1);
		    }
		  break;
		}
	      /* Retry if the CAS fails (r will have been updated).  */
	      continue;
	    }
	  /* If writer--writer hand-over is available, try to become the
	     primary writer this way by grabbing the WRHANDOVER token.  If we
	     succeed, we own WRLOCKED.  */
	  if (prefer_writer)
	    {
	      unsigned int w = atomic_load_relaxed (&rwlock->__data.__writers);
	      if ((w & PTHREAD_RWLOCK_WRHANDOVER) != 0)
		{
		  /* Acquire MO is required here so that we synchronize with
		     the writer that handed over WRLOCKED.  We also need this
		     for the reload of __readers below because our view of
		     __readers must be at least as recent as the view of the
		     writer that handed over WRLOCKED; we must avoid an ABA
		     through WRHANDOVER, which could, for example, lead to us
		     assuming we are still in a write phase when in fact we
		     are not.  */
		  if (atomic_compare_exchange_weak_acquire
		      (&rwlock->__data.__writers,
		       &w, (w - PTHREAD_RWLOCK_WRHANDOVER - 1)))
		    {
		      /* Reload so our view is consistent with the view of
			 the previous owner of WRLOCKED.  See above.  */
		      r = atomic_load_relaxed (&rwlock->__data.__readers);
		      break;
		    }
		  /* We do not need to reload __readers here.  We should try
		     to perform writer--writer hand-over if possible; if it
		     is not possible anymore, we will reload __readers
		     elsewhere in this loop.  */
		  continue;
		}
	    }
	  /* We did not acquire WRLOCKED nor were we able to use writer--writer
	     hand-over, so we block on __writers_futex.  */
	  int private = __pthread_rwlock_get_private (rwlock);
	  unsigned int wf
	    = atomic_load_relaxed (&rwlock->__data.__writers_futex);
	  if (((wf & ~(unsigned int) PTHREAD_RWLOCK_FUTEX_USED) != 1)
	      || ((wf != (1 | PTHREAD_RWLOCK_FUTEX_USED))
		  && (!atomic_compare_exchange_weak_relaxed
		      (&rwlock->__data.__writers_futex, &wf,
		       1 | PTHREAD_RWLOCK_FUTEX_USED))))
	    {
	      /* If we cannot block on __writers_futex because there is no
		 primary writer, or we cannot set PTHREAD_RWLOCK_FUTEX_USED,
		 we retry.  We must reload __readers here in case we cannot
		 block on __writers_futex so that we can become the primary
		 writer and are not stuck in a loop that just continuously
		 fails to block on __writers_futex.  */
	      r = atomic_load_relaxed (&rwlock->__data.__readers);
	      continue;
	    }
	  /* We set the flag that signals that the futex is used, or we could
	     have set it if we had been faster than other waiters.  As a
	     result, we may share the flag with an unknown number of other
	     writers.  Therefore, we must keep this flag set when we acquire
	     the lock.  We do not need to do this when we do not reach this
	     point here because then we are not part of the group that may
	     share the flag, and another writer will wake one of the writers
	     in this group.  */
	  may_share_futex_used_flag = true;
	  int err = __futex_abstimed_wait64 (&rwlock->__data.__writers_futex,
					     1 | PTHREAD_RWLOCK_FUTEX_USED,
					     clockid, abstime, private);
	  if (err == ETIMEDOUT || err == EOVERFLOW)
	    {
	      if (prefer_writer)
		{
		  /* We need to unregister as a waiting writer.  If we are the
		     last writer and writer--writer hand-over is available,
		     we must make use of it because nobody else will reset
		     WRLOCKED otherwise.  (If we use it, we simply pretend
		     that this happened before the timeout; see
		     pthread_rwlock_rdlock_full for the full reasoning.)
		     Also see the similar code above.  */
		  unsigned int w
		    = atomic_load_relaxed (&rwlock->__data.__writers);
		  while (!atomic_compare_exchange_weak_acquire
			 (&rwlock->__data.__writers, &w,
			  (w == PTHREAD_RWLOCK_WRHANDOVER + 1 ? 0 : w - 1)))
		    {
		      /* TODO Back-off.  */
		    }
		  if (w == PTHREAD_RWLOCK_WRHANDOVER + 1)
		    {
		      /* We must continue as primary writer.  See above.  */
		      r = atomic_load_relaxed (&rwlock->__data.__readers);
		      break;
		    }
		}
	      /* We cleaned up and cannot have stolen another waiting writer's
		 futex wake-up, so just return.  */
	      return err;
	    }
	  /* If we got interrupted (EINTR) or the futex word does not have the
	     expected value (EAGAIN), retry after reloading __readers.  */
	  r = atomic_load_relaxed (&rwlock->__data.__readers);
	}
      /* Our snapshot of __readers is up-to-date at this point because we
	 either set WRLOCKED using a CAS (and update r accordingly below,
	 which was used as expected value for the CAS) or got WRLOCKED from
	 another writer whose snapshot of __readers we inherit.  */
      r |= PTHREAD_RWLOCK_WRLOCKED;
    }

  /* We are the primary writer; enable blocking on __writers_futex.  Relaxed
     MO is sufficient for futex words; acquire MO on the previous
     modifications of __readers ensures that this store happens after the
     store of value 0 by the previous primary writer.  */
  atomic_store_relaxed (&rwlock->__data.__writers_futex,
			1 | (may_share_futex_used_flag
			     ? PTHREAD_RWLOCK_FUTEX_USED : 0));

  /* If we are in a write phase, we have acquired the lock.  */
  if ((r & PTHREAD_RWLOCK_WRPHASE) != 0)
    goto done;

  /* If we are in a read phase and there are no readers, try to start a write
     phase.  */
  while ((r & PTHREAD_RWLOCK_WRPHASE) == 0
	 && (r >> PTHREAD_RWLOCK_READER_SHIFT) == 0)
    {
      /* Acquire MO so that we synchronize with prior writers and do
	 not interfere with their updates to __writers_futex, as well
	 as regarding prior readers and their updates to __wrphase_futex,
	 respectively.  */
      if (atomic_compare_exchange_weak_acquire (&rwlock->__data.__readers,
						&r, r | PTHREAD_RWLOCK_WRPHASE))
	{
	  /* We have started a write phase, so need to enable readers to wait.
	     See the similar case in __pthread_rwlock_rdlock_full.  Unlike in
	     that similar case, we are the (only) primary writer and so do
	     not need to wake another writer.  */
	  atomic_store_relaxed (&rwlock->__data.__wrphase_futex, 1);

	  goto done;
	}
      /* TODO Back-off.  */
    }

  /* We became the primary writer in a read phase and there were readers when
     we did (because of the previous loop).  Thus, we have to wait for
     explicit hand-over from one of these readers.
     We basically do the same steps as for the similar case in
     __pthread_rwlock_rdlock_full, except that we additionally might try
     to directly hand over to another writer and need to wake up
     other writers or waiting readers (i.e., PTHREAD_RWLOCK_RWAITING).  */
  unsigned int wpf;
  bool ready = false;
  for (;;)
    {
      while (((wpf = atomic_load_relaxed (&rwlock->__data.__wrphase_futex))
	      | PTHREAD_RWLOCK_FUTEX_USED) == PTHREAD_RWLOCK_FUTEX_USED)
	{
	  int private = __pthread_rwlock_get_private (rwlock);
	  if ((wpf & PTHREAD_RWLOCK_FUTEX_USED) == 0
	      && (!atomic_compare_exchange_weak_relaxed
		  (&rwlock->__data.__wrphase_futex, &wpf,
		   PTHREAD_RWLOCK_FUTEX_USED)))
	    continue;
	  int err = __futex_abstimed_wait64 (&rwlock->__data.__wrphase_futex,
					     PTHREAD_RWLOCK_FUTEX_USED,
					     clockid, abstime, private);
	  if (err == ETIMEDOUT || err == EOVERFLOW)
	    {
	      if (rwlock->__data.__flags != PTHREAD_RWLOCK_PREFER_READER_NP)
		{
		  /* We try writer--writer hand-over.  */
		  unsigned int w
		    = atomic_load_relaxed (&rwlock->__data.__writers);
		  if (w != 0)
		    {
		      /* We are about to hand over WRLOCKED, so we must
			 release __writers_futex too; otherwise, we'd have
			 a pending store, which could at least prevent
			 other threads from waiting using the futex
			 because it could interleave with the stores
			 by subsequent writers.  In turn, this means that
			 we have to clean up when we do not hand over
			 WRLOCKED.
			 Release MO so that another writer that gets
			 WRLOCKED from us can take over our view of
			 __readers.  */
		      unsigned int wf
			= atomic_exchange_relaxed (&rwlock->__data.__writers_futex, 0);
		      while (w != 0)
			{
			  if (atomic_compare_exchange_weak_release
			      (&rwlock->__data.__writers, &w,
			       w | PTHREAD_RWLOCK_WRHANDOVER))
			    {
			      /* Wake other writers.  */
			      if ((wf & PTHREAD_RWLOCK_FUTEX_USED) != 0)
				futex_wake (&rwlock->__data.__writers_futex,
					    1, private);
			      return err;
			    }
			  /* TODO Back-off.  */
			}
		      /* We still own WRLOCKED and someone else might set
			 a write phase concurrently, so enable waiting
			 again.  Make sure we don't lose the flag that
			 signals whether there are threads waiting on
			 this futex.  */
		      atomic_store_relaxed (&rwlock->__data.__writers_futex, wf);
		    }
		}
	      /* If we timed out and we are not in a write phase, we can
		 just stop being a primary writer.  Otherwise, we just
		 acquire the lock.  */
	      r = atomic_load_relaxed (&rwlock->__data.__readers);
	      if ((r & PTHREAD_RWLOCK_WRPHASE) == 0)
		{
		  /* We are about to release WRLOCKED, so we must release
		     __writers_futex too; see the handling of
		     writer--writer hand-over above.  */
		  unsigned int wf
		    = atomic_exchange_relaxed (&rwlock->__data.__writers_futex, 0);
		  while ((r & PTHREAD_RWLOCK_WRPHASE) == 0)
		    {
		      /* While we don't need to make anything from a
			 caller's critical section visible to other
			 threads, we need to ensure that our changes to
			 __writers_futex are properly ordered.
			 Therefore, use release MO to synchronize with
			 subsequent primary writers.  Also wake up any
			 waiting readers as they are waiting because of
			 us.  */
		      if (atomic_compare_exchange_weak_release
			  (&rwlock->__data.__readers, &r,
			   (r ^ PTHREAD_RWLOCK_WRLOCKED)
			   & ~(unsigned int) PTHREAD_RWLOCK_RWAITING))
			{
			  /* Wake other writers.  */
			  if ((wf & PTHREAD_RWLOCK_FUTEX_USED) != 0)
			    futex_wake (&rwlock->__data.__writers_futex,
					1, private);
			  /* Wake waiting readers.  */
			  if ((r & PTHREAD_RWLOCK_RWAITING) != 0)
			    futex_wake (&rwlock->__data.__readers,
					INT_MAX, private);
			  return ETIMEDOUT;
			}
		    }
		  /* We still own WRLOCKED and someone else might set a
		     write phase concurrently, so enable waiting again.
		     Make sure we don't lose the flag that signals
		     whether there are threads waiting on this futex.  */
		  atomic_store_relaxed (&rwlock->__data.__writers_futex, wf);
		}
	      /* Use the acquire MO fence to mirror the steps taken in the
		 non-timeout case.  Note that the read can happen both
		 in the atomic_load above as well as in the failure case
		 of the CAS operation.  */
	      atomic_thread_fence_acquire ();
	      /* We still need to wait for explicit hand-over, but we must
		 not use futex_wait anymore.  */
	      while ((atomic_load_relaxed (&rwlock->__data.__wrphase_futex)
		      | PTHREAD_RWLOCK_FUTEX_USED)
		     == PTHREAD_RWLOCK_FUTEX_USED)
		{
		  /* TODO Back-off.  */
		}
	      ready = true;
	      break;
	    }
	  /* If we got interrupted (EINTR) or the futex word does not have
	     the expected value (EAGAIN), retry.  */
	}
      /* See pthread_rwlock_rdlock_full.  */
      if (ready)
	break;
      if ((atomic_load_acquire (&rwlock->__data.__readers)
	   & PTHREAD_RWLOCK_WRPHASE) != 0)
	ready = true;
    }

 done:
  atomic_store_relaxed (&rwlock->__data.__cur_writer,
			THREAD_GETMEM (THREAD_SELF, tid));
  return 0;
}