/* POSIX reader--writer lock: core parts.
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <errno.h>
#include <sysdep.h>
#include <pthread.h>
#include <pthreadP.h>
#include <sys/time.h>
#include <stap-probe.h>
#include <atomic.h>
#include <futex-internal.h>
#include <time.h>


/* A reader--writer lock that fulfills the POSIX requirements (but operations
   on this lock are not necessarily full barriers, as one may interpret the
   POSIX requirement about "synchronizing memory").  All critical sections are
   in a total order, writers synchronize with prior writers and readers, and
   readers synchronize with prior writers.

   A thread is allowed to acquire a read lock recursively (i.e., have rdlock
   critical sections that overlap in sequenced-before) unless the kind of the
   rwlock is set to PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP.

   This lock is built so that workloads of mostly readers can be executed with
   low runtime overheads.  This matches that the default kind of the lock is
   PTHREAD_RWLOCK_PREFER_READER_NP.  Acquiring a read lock requires a single
   atomic addition if the lock is or was previously acquired by other
   readers; releasing the lock is a single CAS if there are no concurrent
   writers.
   Workloads consisting of mostly writers are of secondary importance.
   An uncontended write lock acquisition is as fast as for a normal
   exclusive mutex but writer contention is somewhat more costly due to
   keeping track of the exact number of writers.  If the rwlock kind requests
   writers to be preferred (i.e., PTHREAD_RWLOCK_PREFER_WRITER_NP or the
   no-recursive-readers variant of it), then writer--writer lock ownership
   hand-over is fairly fast and bypasses lock acquisition attempts by readers.
   The costs of lock ownership transfer between readers and writers vary.  If
   the program asserts that there are no recursive readers and writers are
   preferred, then write lock acquisition attempts will block subsequent read
   lock acquisition attempts, so that new incoming readers do not prolong a
   phase in which readers have acquired the lock.

   The main components of the rwlock are a writer-only lock that allows only
   one of the concurrent writers to be the primary writer, and a
   single-writer-multiple-readers lock that decides between read phases, in
   which readers have acquired the rwlock, and write phases in which a primary
   writer or a sequence of different primary writers have acquired the rwlock.

   The single-writer-multiple-readers lock is the central piece of state
   describing the rwlock and is encoded in the __readers field (see below for
   a detailed explanation):

   State WP  WL  R   RW  Notes
   ---------------------------
   #1    0   0   0   0   Lock is idle (and in a read phase).
   #2    0   0   >0  0   Readers have acquired the lock.
   #3    0   1   0   0   Lock is not acquired; a writer will try to start a
                         write phase.
   #4    0   1   >0  0   Readers have acquired the lock; a writer is waiting
                         and explicit hand-over to the writer is required.
   #4a   0   1   >0  1   Same as #4 except that there are further readers
                         waiting because the writer is to be preferred.
   #5    1   0   0   0   Lock is idle (and in a write phase).
   #6    1   0   >0  0   Write phase; readers will try to start a read phase
                         (requires explicit hand-over to all readers that
                         do not start the read phase).
   #7    1   1   0   0   Lock is acquired by a writer.
   #8    1   1   >0  0   Lock acquired by a writer and readers are waiting;
                         explicit hand-over to the readers is required.

   WP (PTHREAD_RWLOCK_WRPHASE) is true if the lock is in a write phase, so
   potentially acquired by a primary writer.
   WL (PTHREAD_RWLOCK_WRLOCKED) is true if there is a primary writer (i.e.,
   the thread that was able to set this bit from false to true).
   R (all bits in __readers except the number of least-significant bits
   denoted in PTHREAD_RWLOCK_READER_SHIFT) is the number of readers that have
   acquired or are trying to acquire the lock.  There may be more readers
   waiting if writers are preferred and there will be no recursive readers,
   in which case RW (PTHREAD_RWLOCK_RWAITING) is true in state #4a.
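
   As an illustration (a sketch for this comment only, not code used by the
   implementation), the components above can be extracted from a snapshot of
   __readers as follows:

     unsigned int r = atomic_load_relaxed (&rwlock->__data.__readers);
     bool wp = (r & PTHREAD_RWLOCK_WRPHASE) != 0;
     bool wl = (r & PTHREAD_RWLOCK_WRLOCKED) != 0;
     bool rw = (r & PTHREAD_RWLOCK_RWAITING) != 0;
     unsigned int nreaders = r >> PTHREAD_RWLOCK_READER_SHIFT;

   For example, state #4 with one reader corresponds to the value
   (1 << PTHREAD_RWLOCK_READER_SHIFT) | PTHREAD_RWLOCK_WRLOCKED.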

   We want to block using futexes but using __readers as a futex word directly
   is not a good solution.  First, we want to wait on different conditions
   such as waiting for a phase change vs. waiting for the primary writer to
   release the writer-only lock.  Second, the number of readers could change
   frequently, which would make it likely that a writer's futex_wait fails
   frequently too because the expected value does not match the value of
   __readers anymore.
   Therefore, we split out the futex words into the __wrphase_futex and
   __writers_futex fields.  The former tracks the value of the WP bit and is
   changed after changing WP by the thread that changes WP.  However, because
   of the POSIX requirements regarding mutex/rwlock destruction (i.e., that
   destroying a rwlock is allowed as soon as no thread has acquired or will
   acquire the lock), we have to hand over lock ownership (via a phase
   change) carefully to those threads waiting.  Specifically, we must
   prevent a situation in which we are not quite sure whether we still have
   to unblock another thread through a change to memory (executing a
   futex_wake on a former futex word that is now used for something else is
   fine).
   The scheme we use for __wrphase_futex is that waiting threads that may
   use the futex word to block now all have to use the futex word to block; it
   is not allowed to take the short-cut and spin-wait on __readers because
   then the waking thread cannot just make one final change to memory to
   unblock all potentially waiting threads.  If, for example, a reader
   increments R in states #7 or #8, it has to then block until __wrphase_futex
   is 0 and it can confirm that the value of 0 was stored by the primary
   writer; in turn, the primary writer has to change to a read phase too when
   releasing WL (i.e., to state #2), and it must change __wrphase_futex to 0
   as the next step.  This ensures that the waiting reader will not be able to
   acquire, release, and then destroy the lock concurrently with the pending
   futex unblock operations by the former primary writer.  This scheme is
   called explicit hand-over in what follows.
   Note that waiting threads can cancel waiting only if explicit hand-over has
   not yet started (e.g., if __readers is still in states #7 or #8 in the
   example above).

   Writers determine the primary writer through WL.  Blocking using futexes
   is performed using __writers_futex as a futex word; primary writers will
   enable waiting on this futex by setting it to 1 after they acquired the WL
   bit and will disable waiting by setting it to 0 before they release WL.
   This leaves small windows where blocking using futexes is not possible
   although a primary writer exists, but in turn decreases complexity of the
   writer--writer synchronization and does not affect correctness.
   If writers are preferred, writers can hand over WL directly to other
   waiting writers that registered by incrementing __writers: If the primary
   writer can CAS __writers from a non-zero value to the same value with the
   PTHREAD_RWLOCK_WRHANDOVER bit set, it effectively transfers WL ownership
   to one of the registered waiting writers and does not reset WL; in turn,
   a registered writer that can clear PTHREAD_RWLOCK_WRHANDOVER using a CAS
   then takes over WL.  Note that registered waiting writers can cancel
   waiting by decrementing __writers, but the last writer to unregister must
   become the primary writer if PTHREAD_RWLOCK_WRHANDOVER is set.
   Also note that adding another state/bit to signal potential writer--writer
   contention (e.g., as done in the normal mutex algorithm) would not be
   helpful because we would have to conservatively assume that there is in
   fact no other writer, and wake up readers too.
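
   As an illustration, the hand-over pair can be sketched as follows
   (condensed from the code below, which additionally handles CAS failures
   and time-outs).  The unlocking primary writer runs:

     unsigned int w = atomic_load_relaxed (&rwlock->__data.__writers);
     if (w != 0
         && atomic_compare_exchange_weak_release
            (&rwlock->__data.__writers, &w, w | PTHREAD_RWLOCK_WRHANDOVER))
       return;   (WL stays set; ownership transfers.)

   and a registered waiting writer runs:

     unsigned int w = atomic_load_relaxed (&rwlock->__data.__writers);
     if ((w & PTHREAD_RWLOCK_WRHANDOVER) != 0
         && atomic_compare_exchange_weak_acquire
            (&rwlock->__data.__writers, &w,
             w - PTHREAD_RWLOCK_WRHANDOVER - 1))
       (this thread is now the primary writer and owns WL.)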

   To avoid having to call futex_wake when no thread uses __wrphase_futex or
   __writers_futex, threads will set the PTHREAD_RWLOCK_FUTEX_USED bit in the
   respective futex words before waiting on it (using a CAS so it will only be
   set if in a state in which waiting would be possible).  In the case of
   __writers_futex, we wake only one thread but several threads may share
   PTHREAD_RWLOCK_FUTEX_USED, so we must assume that there are still others.
   This is similar to what we do in pthread_mutex_lock.  We do not need to
   do this for __wrphase_futex because there, we always wake all waiting
   threads.
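
   As an illustration for __writers_futex (again condensed from the code
   below, which also handles the cases in which blocking is not possible),
   a writer that wants to block runs:

     unsigned int wf = atomic_load_relaxed (&rwlock->__data.__writers_futex);
     if ((wf & ~(unsigned int) PTHREAD_RWLOCK_FUTEX_USED) == 1
         && (wf == (1 | PTHREAD_RWLOCK_FUTEX_USED)
             || atomic_compare_exchange_weak_relaxed
                (&rwlock->__data.__writers_futex, &wf,
                 1 | PTHREAD_RWLOCK_FUTEX_USED)))
       futex_wait (&rwlock->__data.__writers_futex,
                   1 | PTHREAD_RWLOCK_FUTEX_USED, private);

   and the primary writer wakes one of the blocked writers by running:

     if ((atomic_exchange_relaxed (&rwlock->__data.__writers_futex, 0)
          & PTHREAD_RWLOCK_FUTEX_USED) != 0)
       futex_wake (&rwlock->__data.__writers_futex, 1, private);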

   Blocking in state #4a simply uses __readers as the futex word.  This
   simplifies the algorithm but suffers from some of the drawbacks discussed
   before, though not to the same extent because R can only decrease in this
   state, so the number of potentially failing futex_wait attempts will be
   bounded.  All threads moving from state #4a to another state must wake
   up threads blocked on the __readers futex.

   The ordering invariants that we have to take care of in the implementation
   are primarily those necessary for a reader--writer lock; this is rather
   straightforward and happens during write/read phase switching (potentially
   through explicit hand-over), and between writers through synchronization
   involving the PTHREAD_RWLOCK_WRLOCKED or PTHREAD_RWLOCK_WRHANDOVER bits.
   Additionally, we need to take care that modifications of __writers_futex
   and __wrphase_futex (e.g., by otherwise unordered readers) take place in
   the writer critical sections or read/write phases, respectively, and that
   explicit hand-over observes stores from the previous phase.  How this is
   done is explained in more detail in comments in the code.

   Many of the accesses to the futex words just need relaxed MO.  This is
   possible because we essentially drive both the core rwlock synchronization
   and the futex synchronization in parallel.  For example, an unlock will
   unlock the rwlock and take part in the futex synchronization (using
   PTHREAD_RWLOCK_FUTEX_USED, see above); even if they are not tightly
   ordered in some way, the futex synchronization ensures that there are no
   lost wake-ups, and woken threads will then eventually see the most recent
   state of the rwlock.  IOW, waiting threads will always be woken up, while
   not being able to wait using futexes (which can happen) is harmless; in
   turn, this means that waiting threads don't need special ordering wrt.
   waking threads.

   The futex synchronization consists of the three-state futex word:
   (1) cannot block on it, (2) can block on it, and (3) there might be a
   thread blocked on it (i.e., with PTHREAD_RWLOCK_FUTEX_USED set).
   Relaxed-MO atomic read-modify-write operations are sufficient to maintain
   this (e.g., using a CAS to go from (2) to (3) but not from (1) to (3)),
   but we need ordering of the futex word modifications by the waking threads
   so that they collectively make correct state changes between (1)-(3).
   The futex-internal synchronization (i.e., the conceptual critical sections
   around futex operations in the kernel) then ensures that even an
   unconstrained load (i.e., relaxed MO) inside of futex_wait will not lead to
   lost wake-ups because either the waiting thread will see the change from
   (3) to (1) when a futex_wake came first, or this futex_wake will wake this
   waiting thread because the waiting thread came first.


   POSIX allows but does not require rwlock acquisitions to be a cancellation
   point.  We do not support cancellation.

   TODO We do not try to elide any read or write lock acquisitions currently.
   While this would be possible, it is unclear whether HTM performance is
   currently predictable enough and our runtime tuning is good enough at
   deciding when to use elision so that enabling it would lead to consistently
   better performance.  */


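/* Return the futex operation mode to use for this rwlock: FUTEX_SHARED if
   the lock may be shared between processes, FUTEX_PRIVATE otherwise.  */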
static int
__pthread_rwlock_get_private (pthread_rwlock_t *rwlock)
{
  return rwlock->__data.__shared != 0 ? FUTEX_SHARED : FUTEX_PRIVATE;
}
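/* Release a read lock held by the calling thread, starting a write phase
   and handing the lock over if this was the last reader and a primary
   writer is waiting.  */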
static __always_inline void
__pthread_rwlock_rdunlock (pthread_rwlock_t *rwlock)
{
  int private = __pthread_rwlock_get_private (rwlock);
  /* We decrease the number of readers, and if we are the last reader and
     there is a primary writer, we start a write phase.  We use a CAS to
     make this atomic so that it is clear whether we must hand over ownership
     explicitly.  */
  unsigned int r = atomic_load_relaxed (&rwlock->__data.__readers);
  unsigned int rnew;
  for (;;)
    {
      rnew = r - (1 << PTHREAD_RWLOCK_READER_SHIFT);
      /* If we are the last reader, we also need to unblock any readers
         that are waiting for a writer to go first (PTHREAD_RWLOCK_RWAITING)
         so that they can register while the writer is active.  */
      if ((rnew >> PTHREAD_RWLOCK_READER_SHIFT) == 0)
        {
          if ((rnew & PTHREAD_RWLOCK_WRLOCKED) != 0)
            rnew |= PTHREAD_RWLOCK_WRPHASE;
          rnew &= ~(unsigned int) PTHREAD_RWLOCK_RWAITING;
        }
      /* We need release MO here for three reasons.  First, so that we
         synchronize with subsequent writers.  Second, we might have been the
         first reader and set __wrphase_futex to 0, so we need to synchronize
         with the last reader that will set it to 1 (note that we will always
         change __readers before the last reader, or we are the last reader).
         Third, a writer that takes part in explicit hand-over needs to see
         the first reader's store to __wrphase_futex (or a later value) if
         the writer observes that a write phase has been started.  */
      if (atomic_compare_exchange_weak_release (&rwlock->__data.__readers,
                                                &r, rnew))
        break;
      /* TODO Back-off.  */
    }
  if ((rnew & PTHREAD_RWLOCK_WRPHASE) != 0)
    {
      /* We need to do explicit hand-over.  We need the acquire MO fence so
         that our modification of __wrphase_futex happens after a store by
         another reader that started a read phase.  Relaxed MO is sufficient
         for the modification of __wrphase_futex because it is just used
         to delay acquisition by a writer until all threads are unblocked
         irrespective of whether they are looking at __readers or
         __wrphase_futex; any other synchronizes-with relations that are
         necessary are established through __readers.  */
      atomic_thread_fence_acquire ();
      if ((atomic_exchange_relaxed (&rwlock->__data.__wrphase_futex, 1)
           & PTHREAD_RWLOCK_FUTEX_USED) != 0)
        futex_wake (&rwlock->__data.__wrphase_futex, INT_MAX, private);
    }
  /* Also wake up waiting readers if we did reset the RWAITING flag.  */
  if ((r & PTHREAD_RWLOCK_RWAITING) != (rnew & PTHREAD_RWLOCK_RWAITING))
    futex_wake (&rwlock->__data.__readers, INT_MAX, private);
}


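/* Acquire a read lock.  If ABSTIME is not NULL, it is an absolute timeout
   measured against CLOCKID after which the attempt may fail.  Returns 0 on
   success; otherwise EINVAL, EDEADLK, EAGAIN, ETIMEDOUT, or EOVERFLOW (see
   the individual checks below).  */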
static __always_inline int
__pthread_rwlock_rdlock_full64 (pthread_rwlock_t *rwlock, clockid_t clockid,
                                const struct __timespec64 *abstime)
{
  unsigned int r;

  /* Make sure any passed in clockid and timeout value are valid.  Note that
     the previous implementation assumed that this check *must* not be
     performed if there would in fact be no blocking; however, POSIX only
     requires that "the validity of the abstime parameter need not be checked
     if the lock can be immediately acquired" (i.e., we need not but may check
     it).  */
  if (abstime && __glibc_unlikely (!futex_abstimed_supported_clockid (clockid)
                                   || ! valid_nanoseconds (abstime->tv_nsec)))
    return EINVAL;

  /* Make sure we are not holding the rwlock as a writer.  This is a deadlock
     situation we recognize and report.  */
  if (__glibc_unlikely (atomic_load_relaxed (&rwlock->__data.__cur_writer)
                        == THREAD_GETMEM (THREAD_SELF, tid)))
    return EDEADLK;

  /* If we prefer writers, recursive rdlock is disallowed, we are in a read
     phase, and there are other readers present, we try to wait without
     extending the read phase.  We will be unblocked by either one of the
     other active readers, or if the writer gives up WRLOCKED (e.g., on
     timeout).
     If there are no other readers, we simply race with any existing primary
     writer; it would have been a race anyway, and changing the odds slightly
     will likely not make a big difference.  */
  if (rwlock->__data.__flags == PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP)
    {
      r = atomic_load_relaxed (&rwlock->__data.__readers);
      while ((r & PTHREAD_RWLOCK_WRPHASE) == 0
             && (r & PTHREAD_RWLOCK_WRLOCKED) != 0
             && (r >> PTHREAD_RWLOCK_READER_SHIFT) > 0)
        {
          /* TODO Spin first.  */
          /* Try setting the flag signaling that we are waiting without having
             incremented the number of readers.  Relaxed MO is fine because
             this is just about waiting for a state change in __readers.  */
          if (atomic_compare_exchange_weak_relaxed
              (&rwlock->__data.__readers, &r, r | PTHREAD_RWLOCK_RWAITING))
            {
              /* Wait for as long as the flag is set.  An ABA situation is
                 harmless because the flag is just about the state of
                 __readers, and all threads set the flag under the same
                 conditions.  */
              while (((r = atomic_load_relaxed (&rwlock->__data.__readers))
                      & PTHREAD_RWLOCK_RWAITING) != 0)
                {
                  int private = __pthread_rwlock_get_private (rwlock);
                  int err = __futex_abstimed_wait64 (&rwlock->__data.__readers,
                                                     r, clockid, abstime,
                                                     private);
                  /* We ignore EAGAIN and EINTR.  On time-outs, we can just
                     return because we don't need to clean up anything.  */
                  if (err == ETIMEDOUT || err == EOVERFLOW)
                    return err;
                }
              /* It makes sense to not break out of the outer loop here
                 because we might be in the same situation again.  */
            }
          else
            {
              /* TODO Back-off.  */
            }
        }
    }
  /* Register as a reader, using an add-and-fetch so that R can be used as
     expected value for future operations.  Acquire MO so we synchronize with
     prior writers as well as the last reader of the previous read phase (see
     below).  */
  r = (atomic_fetch_add_acquire (&rwlock->__data.__readers,
                                 (1 << PTHREAD_RWLOCK_READER_SHIFT))
       + (1 << PTHREAD_RWLOCK_READER_SHIFT));

  /* Check whether there is an overflow in the number of readers.  We assume
     that the total number of threads is less than half the maximum number
     of readers that we have bits for in __readers (i.e., with 32-bit int and
     PTHREAD_RWLOCK_READER_SHIFT of 3, we assume there are less than
     1 << (32-3-1) concurrent threads).
     If there is an overflow, we use a CAS to try to decrement the number of
     readers if there still is an overflow situation.  If so, we return
     EAGAIN; if not, we are not a thread causing an overflow situation, and so
     we just continue.  Using a fetch-add instead of the CAS isn't possible
     because other readers might release the lock concurrently, which could
     make us the last reader and thus responsible for handing ownership over
     to writers (which requires a CAS too to make the decrement and ownership
     transfer indivisible).  */
  while (__glibc_unlikely (r >= PTHREAD_RWLOCK_READER_OVERFLOW))
    {
      /* Relaxed MO is okay because we just want to undo our registration and
         cannot have changed the rwlock state substantially if the CAS
         succeeds.  */
      if (atomic_compare_exchange_weak_relaxed
          (&rwlock->__data.__readers,
           &r, r - (1 << PTHREAD_RWLOCK_READER_SHIFT)))
        return EAGAIN;
    }

  /* We have registered as a reader, so if we are in a read phase, we have
     acquired a read lock.  This is also the reader--reader fast-path.
     Even if there is a primary writer, we just return.  If writers are to
     be preferred and we are the only active reader, we could try to enter a
     write phase to let the writer proceed.  This would be okay because we
     cannot have acquired the lock previously as a reader (which could result
     in deadlock if we would wait for the primary writer to run).  However,
     this seems to be a corner case and handling it specially does not seem
     to be worth the complexity.  */
  if (__glibc_likely ((r & PTHREAD_RWLOCK_WRPHASE) == 0))
    return 0;
  /* Otherwise, if we were in a write phase (states #6 or #8), we must wait
     for explicit hand-over of the read phase; the only exception is if we
     can start a read phase if there is no primary writer currently.  */
  while ((r & PTHREAD_RWLOCK_WRPHASE) != 0
         && (r & PTHREAD_RWLOCK_WRLOCKED) == 0)
    {
      /* Try to enter a read phase: If the CAS below succeeds, we have
         ownership; if it fails, we will simply retry and reassess the
         situation.
         Acquire MO so we synchronize with prior writers.  */
      if (atomic_compare_exchange_weak_acquire (&rwlock->__data.__readers, &r,
                                                r ^ PTHREAD_RWLOCK_WRPHASE))
        {
          /* We started the read phase, so we are also responsible for
             updating the write-phase futex.  Relaxed MO is sufficient.
             We have to do the same steps as a writer would when handing
             over the read phase to us because other readers cannot
             distinguish between us and the writer; this includes
             explicit hand-over and potentially having to wake other readers
             (but we can pretend to do the setting and unsetting of WRLOCKED
             atomically, and thus can skip this step).  */
          if ((atomic_exchange_relaxed (&rwlock->__data.__wrphase_futex, 0)
               & PTHREAD_RWLOCK_FUTEX_USED) != 0)
            {
              int private = __pthread_rwlock_get_private (rwlock);
              futex_wake (&rwlock->__data.__wrphase_futex, INT_MAX, private);
            }
          return 0;
        }
      else
        {
          /* TODO Back off before retrying.  Also see above.  */
        }
    }

  /* We were in a write phase but did not install the read phase.  We cannot
     distinguish between a writer and another reader starting the read phase,
     so we must wait for explicit hand-over via __wrphase_futex.
     However, __wrphase_futex might not have been set to 1 yet (either
     because explicit hand-over to the writer is still ongoing, or because
     the writer has started the write phase but has not yet updated
     __wrphase_futex).  The least recent value of __wrphase_futex we can
     read from here is the modification of the last read phase (because
     we synchronize with the last reader in this read phase through
     __readers; see the use of acquire MO on the fetch_add above).
     Therefore, if we observe a value of 0 for __wrphase_futex, we need
     to subsequently check that __readers now indicates a read phase; we
     need to use acquire MO for this so that if we observe a read phase,
     we will also see the modification of __wrphase_futex by the previous
     writer.  We then need to load __wrphase_futex again and continue to
     wait if it is not 0, so that we do not skip explicit hand-over.
     Relaxed MO is sufficient for the load from __wrphase_futex because
     we just use it as an indicator for when we can proceed; we use
     __readers and the acquire MO accesses to it to eventually read from
     the proper stores to __wrphase_futex.  */
  unsigned int wpf;
  bool ready = false;
  for (;;)
    {
      while (((wpf = atomic_load_relaxed (&rwlock->__data.__wrphase_futex))
              | PTHREAD_RWLOCK_FUTEX_USED) == (1 | PTHREAD_RWLOCK_FUTEX_USED))
        {
          int private = __pthread_rwlock_get_private (rwlock);
          if (((wpf & PTHREAD_RWLOCK_FUTEX_USED) == 0)
              && (!atomic_compare_exchange_weak_relaxed
                  (&rwlock->__data.__wrphase_futex,
                   &wpf, wpf | PTHREAD_RWLOCK_FUTEX_USED)))
            continue;
          int err = __futex_abstimed_wait64 (&rwlock->__data.__wrphase_futex,
                                             1 | PTHREAD_RWLOCK_FUTEX_USED,
                                             clockid, abstime, private);
          if (err == ETIMEDOUT || err == EOVERFLOW)
            {
              /* If we timed out, we need to unregister.  If no read phase
                 has been installed while we waited, we can just decrement
                 the number of readers.  Otherwise, we just acquire the
                 lock, which is allowed because we give no precise timing
                 guarantees, and because the timeout is only required to
                 be in effect if we would have had to wait for other
                 threads (e.g., if futex_wait would time-out immediately
                 because the given absolute time is in the past).  */
              r = atomic_load_relaxed (&rwlock->__data.__readers);
              while ((r & PTHREAD_RWLOCK_WRPHASE) != 0)
                {
                  /* We don't need to make anything else visible to
                     others besides unregistering, so relaxed MO is
                     sufficient.  */
                  if (atomic_compare_exchange_weak_relaxed
                      (&rwlock->__data.__readers, &r,
                       r - (1 << PTHREAD_RWLOCK_READER_SHIFT)))
                    return err;
                  /* TODO Back-off.  */
                }
              /* Use the acquire MO fence to mirror the steps taken in the
                 non-timeout case.  Note that the read can happen both
                 in the atomic_load above as well as in the failure case
                 of the CAS operation.  */
              atomic_thread_fence_acquire ();
              /* We still need to wait for explicit hand-over, but we must
                 not use futex_wait anymore because we would just time out
                 in this case and thus make the spin-waiting we need
                 unnecessarily expensive.  */
              while ((atomic_load_relaxed (&rwlock->__data.__wrphase_futex)
                      | PTHREAD_RWLOCK_FUTEX_USED)
                     == (1 | PTHREAD_RWLOCK_FUTEX_USED))
                {
                  /* TODO Back-off?  */
                }
              ready = true;
              break;
            }
          /* If we got interrupted (EINTR) or the futex word does not have the
             expected value (EAGAIN), retry.  */
        }
      if (ready)
        /* See below.  */
        break;
      /* We need acquire MO here so that we synchronize with the lock
         release of the writer, and so that we observe a recent value of
         __wrphase_futex (see below).  */
      if ((atomic_load_acquire (&rwlock->__data.__readers)
           & PTHREAD_RWLOCK_WRPHASE) == 0)
        /* We are in a read phase now, so the least recent modification of
           __wrphase_futex we can read from is the store by the writer
           with value 1.  Thus, only now we can assume that if we observe
           a value of 0, explicit hand-over is finished.  Retry the loop
           above one more time.  */
        ready = true;
    }

  return 0;
}


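/* Release the write lock held by the calling thread, either by handing
   WRLOCKED over to another waiting writer (if writers are preferred), by
   handing over explicitly to waiting readers, or by staying in a write
   phase, as described above.  */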
static __always_inline void
__pthread_rwlock_wrunlock (pthread_rwlock_t *rwlock)
{
  int private = __pthread_rwlock_get_private (rwlock);

  atomic_store_relaxed (&rwlock->__data.__cur_writer, 0);
  /* Disable waiting by writers.  We will wake up after we decided how to
     proceed.  */
  bool wake_writers
    = ((atomic_exchange_relaxed (&rwlock->__data.__writers_futex, 0)
        & PTHREAD_RWLOCK_FUTEX_USED) != 0);

  if (rwlock->__data.__flags != PTHREAD_RWLOCK_PREFER_READER_NP)
    {
      /* First, try to hand over to another writer.  */
      unsigned int w = atomic_load_relaxed (&rwlock->__data.__writers);
      while (w != 0)
        {
          /* Release MO so that another writer that gets WRLOCKED from us will
             synchronize with us and thus can take over our view of
             __readers (including, for example, whether we are in a write
             phase or not).  */
          if (atomic_compare_exchange_weak_release
              (&rwlock->__data.__writers, &w, w | PTHREAD_RWLOCK_WRHANDOVER))
            /* Another writer will take over.  */
            goto done;
          /* TODO Back-off.  */
        }
    }

  /* We have done everything we needed to do to prefer writers, so now we
     either hand over explicitly to readers if there are any, or we simply
     stay in a write phase.  See pthread_rwlock_rdunlock for more details.  */
  unsigned int r = atomic_load_relaxed (&rwlock->__data.__readers);
  /* Release MO so that subsequent readers or writers synchronize with us.  */
  while (!atomic_compare_exchange_weak_release
         (&rwlock->__data.__readers, &r,
          ((r ^ PTHREAD_RWLOCK_WRLOCKED)
           ^ ((r >> PTHREAD_RWLOCK_READER_SHIFT) == 0 ? 0
              : PTHREAD_RWLOCK_WRPHASE))))
    {
      /* TODO Back-off.  */
    }
  if ((r >> PTHREAD_RWLOCK_READER_SHIFT) != 0)
    {
      /* We must hand over explicitly through __wrphase_futex.  Relaxed MO is
         sufficient because it is just used to delay acquisition by a writer;
         any other synchronizes-with relations that are necessary are
         established through __readers.  */
      if ((atomic_exchange_relaxed (&rwlock->__data.__wrphase_futex, 0)
           & PTHREAD_RWLOCK_FUTEX_USED) != 0)
        futex_wake (&rwlock->__data.__wrphase_futex, INT_MAX, private);
    }

 done:
  /* We released WRLOCKED in some way, so wake a writer.  */
  if (wake_writers)
    futex_wake (&rwlock->__data.__writers_futex, 1, private);
}


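/* Acquire a write lock.  CLOCKID and ABSTIME are interpreted as in
   __pthread_rwlock_rdlock_full64 above, and the same return values apply,
   except that EAGAIN is not returned here.  */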
static __always_inline int
__pthread_rwlock_wrlock_full64 (pthread_rwlock_t *rwlock, clockid_t clockid,
                                const struct __timespec64 *abstime)
{
  /* Make sure any passed in clockid and timeout value are valid.  Note that
     the previous implementation assumed that this check *must* not be
     performed if there would in fact be no blocking; however, POSIX only
     requires that "the validity of the abstime parameter need not be checked
     if the lock can be immediately acquired" (i.e., we need not but may check
     it).  */
  if (abstime && __glibc_unlikely (!futex_abstimed_supported_clockid (clockid)
                                   || ! valid_nanoseconds (abstime->tv_nsec)))
    return EINVAL;

  /* Make sure we are not holding the rwlock as a writer.  This is a deadlock
     situation we recognize and report.  */
  if (__glibc_unlikely (atomic_load_relaxed (&rwlock->__data.__cur_writer)
                        == THREAD_GETMEM (THREAD_SELF, tid)))
    return EDEADLK;

  /* First we try to acquire the role of primary writer by setting WRLOCKED;
     if it was set before, there already is a primary writer.  Acquire MO so
     that we synchronize with previous primary writers.

     We do not try to change to a write phase right away using a fetch_or
     because we would have to reset it again and wake readers if there are
     readers present (some readers could try to acquire the lock more than
     once, so setting a write phase in the middle of this could cause
     deadlock).  Changing to a write phase eagerly would only speed up the
     transition from a read phase to a write phase in the uncontended case,
     but it would slow down the contended case if readers are preferred (which
     is the default).
     We could try to CAS from a state with no readers to a write phase, but
     this could be less scalable if readers arrive and leave frequently.  */
  bool may_share_futex_used_flag = false;
  unsigned int r = atomic_fetch_or_acquire (&rwlock->__data.__readers,
                                            PTHREAD_RWLOCK_WRLOCKED);
  if (__glibc_unlikely ((r & PTHREAD_RWLOCK_WRLOCKED) != 0))
    {
      /* There is another primary writer.  */
      bool prefer_writer
        = (rwlock->__data.__flags != PTHREAD_RWLOCK_PREFER_READER_NP);
      if (prefer_writer)
        {
          /* We register as a waiting writer, so that we can make use of
             writer--writer hand-over.  Relaxed MO is fine because we just
             want to register.  We assume that the maximum number of threads
             is less than the capacity in __writers.  */
          atomic_fetch_add_relaxed (&rwlock->__data.__writers, 1);
        }
      for (;;)
        {
          /* TODO Spin until WRLOCKED is 0 before trying the CAS below.
             But pay attention to not delay trying writer--writer hand-over
             for too long (which we must try eventually anyway).  */
          if ((r & PTHREAD_RWLOCK_WRLOCKED) == 0)
            {
              /* Try to become the primary writer or retry.  Acquire MO as in
                 the fetch_or above.  */
              if (atomic_compare_exchange_weak_acquire
                  (&rwlock->__data.__readers,
                   &r, r | PTHREAD_RWLOCK_WRLOCKED))
                {
                  if (prefer_writer)
                    {
                      /* Unregister as a waiting writer.  Note that because we
                         acquired WRLOCKED, WRHANDOVER will not be set.
                         Acquire MO on the CAS above ensures that
                         unregistering happens after the previous writer;
                         this sorts the accesses to __writers by all
                         primary writers in a useful way (e.g., any other
                         primary writer acquiring after us or getting it from
                         us through WRHANDOVER will see both our changes to
                         __writers).
                         ??? Perhaps this is not strictly necessary for
                         reasons we do not yet know of.  */
                      atomic_fetch_add_relaxed (&rwlock->__data.__writers,
                                                -1);
                    }
                  break;
                }
              /* Retry if the CAS fails (r will have been updated).  */
              continue;
            }
          /* If writer--writer hand-over is available, try to become the
             primary writer this way by grabbing the WRHANDOVER token.  If we
             succeed, we own WRLOCKED.  */
          if (prefer_writer)
            {
              unsigned int w
                = atomic_load_relaxed (&rwlock->__data.__writers);
              if ((w & PTHREAD_RWLOCK_WRHANDOVER) != 0)
                {
                  /* Acquire MO is required here so that we synchronize with
                     the writer that handed over WRLOCKED.  We also need this
                     for the reload of __readers below because our view of
                     __readers must be at least as recent as the view of the
                     writer that handed over WRLOCKED; we must avoid an ABA
                     through WRHANDOVER, which could, for example, lead to us
                     assuming we are still in a write phase when in fact we
                     are not.  */
                  if (atomic_compare_exchange_weak_acquire
                      (&rwlock->__data.__writers,
                       &w, (w - PTHREAD_RWLOCK_WRHANDOVER - 1)))
                    {
                      /* Reload so our view is consistent with the view of
                         the previous owner of WRLOCKED.  See above.  */
                      r = atomic_load_relaxed (&rwlock->__data.__readers);
                      break;
                    }
                  /* We do not need to reload __readers here.  We should try
                     to perform writer--writer hand-over if possible; if it
                     is not possible anymore, we will reload __readers
                     elsewhere in this loop.  */
                  continue;
                }
            }
          /* We did not acquire WRLOCKED nor were able to use writer--writer
             hand-over, so we block on __writers_futex.  */
          int private = __pthread_rwlock_get_private (rwlock);
          unsigned int wf
            = atomic_load_relaxed (&rwlock->__data.__writers_futex);
          if (((wf & ~(unsigned int) PTHREAD_RWLOCK_FUTEX_USED) != 1)
              || ((wf != (1 | PTHREAD_RWLOCK_FUTEX_USED))
                  && (!atomic_compare_exchange_weak_relaxed
                      (&rwlock->__data.__writers_futex, &wf,
                       1 | PTHREAD_RWLOCK_FUTEX_USED))))
            {
              /* If we cannot block on __writers_futex because there is no
                 primary writer, or we cannot set PTHREAD_RWLOCK_FUTEX_USED,
                 we retry.  We must reload __readers here in case we cannot
                 block on __writers_futex so that we can become the primary
                 writer and are not stuck in a loop that just continuously
                 fails to block on __writers_futex.  */
              r = atomic_load_relaxed (&rwlock->__data.__readers);
              continue;
            }
          /* We set the flag that signals that the futex is used, or we could
             have set it if we had been faster than other waiters.  As a
             result, we may share the flag with an unknown number of other
             writers.  Therefore, we must keep this flag set when we acquire
             the lock.  We do not need to do this when we do not reach this
             point here because then we are not part of the group that may
             share the flag, and another writer will wake one of the writers
             in this group.  */
          may_share_futex_used_flag = true;
          int err = __futex_abstimed_wait64 (&rwlock->__data.__writers_futex,
                                             1 | PTHREAD_RWLOCK_FUTEX_USED,
                                             clockid, abstime, private);
          if (err == ETIMEDOUT || err == EOVERFLOW)
            {
              if (prefer_writer)
                {
                  /* We need to unregister as a waiting writer.  If we are the
                     last writer and writer--writer hand-over is available,
                     we must make use of it because nobody else will reset
                     WRLOCKED otherwise.  (If we use it, we simply pretend
                     that this happened before the timeout; see
                     pthread_rwlock_rdlock_full for the full reasoning.)
                     Also see the similar code above.  */
                  unsigned int w
                    = atomic_load_relaxed (&rwlock->__data.__writers);
                  while (!atomic_compare_exchange_weak_acquire
                         (&rwlock->__data.__writers, &w,
                          (w == PTHREAD_RWLOCK_WRHANDOVER + 1 ? 0 : w - 1)))
                    {
                      /* TODO Back-off.  */
                    }
                  if (w == PTHREAD_RWLOCK_WRHANDOVER + 1)
                    {
                      /* We must continue as primary writer.  See above.  */
                      r = atomic_load_relaxed (&rwlock->__data.__readers);
                      break;
                    }
                }
              /* We cleaned up and cannot have stolen another waiting writer's
                 futex wake-up, so just return.  */
              return err;
            }
          /* If we got interrupted (EINTR) or the futex word does not have the
             expected value (EAGAIN), retry after reloading __readers.  */
          r = atomic_load_relaxed (&rwlock->__data.__readers);
        }
      /* Our snapshot of __readers is up-to-date at this point because we
         either set WRLOCKED using a CAS (and update r accordingly below,
         which was used as expected value for the CAS) or got WRLOCKED from
         another writer whose snapshot of __readers we inherit.  */
      r |= PTHREAD_RWLOCK_WRLOCKED;
    }

  /* We are the primary writer; enable blocking on __writers_futex.  Relaxed
     MO is sufficient for futex words; acquire MO on the previous
     modifications of __readers ensures that this store happens after the
     store of value 0 by the previous primary writer.  */
  atomic_store_relaxed (&rwlock->__data.__writers_futex,
                        1 | (may_share_futex_used_flag
                             ? PTHREAD_RWLOCK_FUTEX_USED : 0));

  /* If we are in a write phase, we have acquired the lock.  */
  if ((r & PTHREAD_RWLOCK_WRPHASE) != 0)
    goto done;

  /* If we are in a read phase and there are no readers, try to start a write
     phase.  */
  while ((r & PTHREAD_RWLOCK_WRPHASE) == 0
         && (r >> PTHREAD_RWLOCK_READER_SHIFT) == 0)
    {
      /* Acquire MO so that we synchronize with prior writers and do
         not interfere with their updates to __writers_futex, as well
         as regarding prior readers and their updates to __wrphase_futex,
         respectively.  */
      if (atomic_compare_exchange_weak_acquire
          (&rwlock->__data.__readers, &r, r | PTHREAD_RWLOCK_WRPHASE))
        {
          /* We have started a write phase, so need to enable readers to wait.
             See the similar case in __pthread_rwlock_rdlock_full.  Unlike in
             that similar case, we are the (only) primary writer and so do
             not need to wake another writer.  */
          atomic_store_relaxed (&rwlock->__data.__wrphase_futex, 1);

          goto done;
        }
      /* TODO Back-off.  */
    }

  /* We became the primary writer in a read phase and there were readers when
     we did (because of the previous loop).  Thus, we have to wait for
     explicit hand-over from one of these readers.
     We basically do the same steps as for the similar case in
     __pthread_rwlock_rdlock_full, except that we additionally might try
     to directly hand over to another writer and need to wake up
     other writers or waiting readers (i.e., PTHREAD_RWLOCK_RWAITING).  */
  unsigned int wpf;
  bool ready = false;
  for (;;)
    {
      while (((wpf = atomic_load_relaxed (&rwlock->__data.__wrphase_futex))
              | PTHREAD_RWLOCK_FUTEX_USED) == PTHREAD_RWLOCK_FUTEX_USED)
        {
          int private = __pthread_rwlock_get_private (rwlock);
          if ((wpf & PTHREAD_RWLOCK_FUTEX_USED) == 0
              && (!atomic_compare_exchange_weak_relaxed
                  (&rwlock->__data.__wrphase_futex, &wpf,
                   PTHREAD_RWLOCK_FUTEX_USED)))
            continue;
          int err = __futex_abstimed_wait64 (&rwlock->__data.__wrphase_futex,
                                             PTHREAD_RWLOCK_FUTEX_USED,
                                             clockid, abstime, private);
          if (err == ETIMEDOUT || err == EOVERFLOW)
            {
              if (rwlock->__data.__flags != PTHREAD_RWLOCK_PREFER_READER_NP)
                {
                  /* We try writer--writer hand-over.  */
                  unsigned int w
                    = atomic_load_relaxed (&rwlock->__data.__writers);
                  if (w != 0)
                    {
                      /* We are about to hand over WRLOCKED, so we must
                         release __writers_futex too; otherwise, we'd have
                         a pending store, which could at least prevent
                         other threads from waiting using the futex
                         because it could interleave with the stores
                         by subsequent writers.  In turn, this means that
                         we have to clean up when we do not hand over
                         WRLOCKED.
                         Release MO so that another writer that gets
                         WRLOCKED from us can take over our view of
                         __readers.  */
                      unsigned int wf
                        = atomic_exchange_relaxed
                          (&rwlock->__data.__writers_futex, 0);
                      while (w != 0)
                        {
                          if (atomic_compare_exchange_weak_release
                              (&rwlock->__data.__writers, &w,
                               w | PTHREAD_RWLOCK_WRHANDOVER))
                            {
                              /* Wake other writers.  */
                              if ((wf & PTHREAD_RWLOCK_FUTEX_USED) != 0)
                                futex_wake (&rwlock->__data.__writers_futex,
                                            1, private);
                              return err;
                            }
                          /* TODO Back-off.  */
                        }
                      /* We still own WRLOCKED and someone else might set
                         a write phase concurrently, so enable waiting
                         again.  Make sure we don't lose the flag that
                         signals whether there are threads waiting on
                         this futex.  */
                      atomic_store_relaxed (&rwlock->__data.__writers_futex,
                                            wf);
                    }
                }
              /* If we timed out and we are not in a write phase, we can
                 just stop being a primary writer.  Otherwise, we just
                 acquire the lock.  */
              r = atomic_load_relaxed (&rwlock->__data.__readers);
              if ((r & PTHREAD_RWLOCK_WRPHASE) == 0)
                {
                  /* We are about to release WRLOCKED, so we must release
                     __writers_futex too; see the handling of
                     writer--writer hand-over above.  */
                  unsigned int wf
                    = atomic_exchange_relaxed
                      (&rwlock->__data.__writers_futex, 0);
                  while ((r & PTHREAD_RWLOCK_WRPHASE) == 0)
                    {
                      /* While we don't need to make anything from a
                         caller's critical section visible to other
                         threads, we need to ensure that our changes to
                         __writers_futex are properly ordered.
                         Therefore, use release MO to synchronize with
                         subsequent primary writers.  Also wake up any
                         waiting readers as they are waiting because of
                         us.  */
                      if (atomic_compare_exchange_weak_release
                          (&rwlock->__data.__readers, &r,
                           (r ^ PTHREAD_RWLOCK_WRLOCKED)
                           & ~(unsigned int) PTHREAD_RWLOCK_RWAITING))
                        {
                          /* Wake other writers.  */
                          if ((wf & PTHREAD_RWLOCK_FUTEX_USED) != 0)
                            futex_wake (&rwlock->__data.__writers_futex,
                                        1, private);
                          /* Wake waiting readers.  */
                          if ((r & PTHREAD_RWLOCK_RWAITING) != 0)
                            futex_wake (&rwlock->__data.__readers,
                                        INT_MAX, private);
                          return ETIMEDOUT;
                        }
                    }
                  /* We still own WRLOCKED and someone else might set a
                     write phase concurrently, so enable waiting again.
                     Make sure we don't lose the flag that signals
                     whether there are threads waiting on this futex.  */
                  atomic_store_relaxed (&rwlock->__data.__writers_futex, wf);
                }
              /* Use the acquire MO fence to mirror the steps taken in the
                 non-timeout case.  Note that the read can happen both
                 in the atomic_load above as well as in the failure case
                 of the CAS operation.  */
              atomic_thread_fence_acquire ();
              /* We still need to wait for explicit hand-over, but we must
                 not use futex_wait anymore.  */
              while ((atomic_load_relaxed (&rwlock->__data.__wrphase_futex)
                      | PTHREAD_RWLOCK_FUTEX_USED)
                     == PTHREAD_RWLOCK_FUTEX_USED)
                {
                  /* TODO Back-off.  */
                }
              ready = true;
              break;
            }
          /* If we got interrupted (EINTR) or the futex word does not have
             the expected value (EAGAIN), retry.  */
        }
      /* See pthread_rwlock_rdlock_full.  */
      if (ready)
        break;
      if ((atomic_load_acquire (&rwlock->__data.__readers)
           & PTHREAD_RWLOCK_WRPHASE) != 0)
        ready = true;
    }

 done:
  atomic_store_relaxed (&rwlock->__data.__cur_writer,
                        THREAD_GETMEM (THREAD_SELF, tid));
  return 0;
}
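
#if 0
/* A minimal usage sketch (illustrative only, not compiled): the functions
   above implement the core of the public pthread_rwlock_* operations.  A
   program exercising a writer-preferring lock could look as follows; the
   function name example_usage is hypothetical.  */
#include <pthread.h>

static pthread_rwlock_t lock;

static void
example_usage (void)
{
  pthread_rwlockattr_t attr;
  pthread_rwlockattr_init (&attr);
  pthread_rwlockattr_setkind_np
    (&attr, PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
  pthread_rwlock_init (&lock, &attr);
  pthread_rwlockattr_destroy (&attr);

  pthread_rwlock_rdlock (&lock);   /* Read-side critical section.  */
  pthread_rwlock_unlock (&lock);

  pthread_rwlock_wrlock (&lock);   /* Write-side critical section.  */
  pthread_rwlock_unlock (&lock);

  pthread_rwlock_destroy (&lock);
}
#endif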
