1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* kernel/locking/rwsem.c: R/W semaphores, public implementation |
3 | * |
4 | * Written by David Howells (dhowells@redhat.com). |
5 | * Derived from asm-i386/semaphore.h |
6 | * |
7 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> |
8 | * and Michel Lespinasse <walken@google.com> |
9 | * |
10 | * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> |
11 | * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. |
12 | * |
13 | * Rwsem count bit fields re-definition and rwsem rearchitecture by |
14 | * Waiman Long <longman@redhat.com> and |
15 | * Peter Zijlstra <peterz@infradead.org>. |
16 | */ |
17 | |
18 | #include <linux/types.h> |
19 | #include <linux/kernel.h> |
20 | #include <linux/sched.h> |
21 | #include <linux/sched/rt.h> |
22 | #include <linux/sched/task.h> |
23 | #include <linux/sched/debug.h> |
24 | #include <linux/sched/wake_q.h> |
25 | #include <linux/sched/signal.h> |
26 | #include <linux/sched/clock.h> |
27 | #include <linux/export.h> |
28 | #include <linux/rwsem.h> |
29 | #include <linux/atomic.h> |
30 | #include <trace/events/lock.h> |
31 | |
32 | #ifndef CONFIG_PREEMPT_RT |
33 | #include "lock_events.h" |
34 | |
35 | /* |
36 | * The least significant 2 bits of the owner value have the following |
37 | * meanings when set. |
38 | * - Bit 0: RWSEM_READER_OWNED - rwsem may be owned by readers (just a hint) |
39 | * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock |
40 | * |
41 | * When the rwsem is reader-owned and a spinning writer has timed out, |
42 | * the nonspinnable bit will be set to disable optimistic spinning. |
43 | * |
44 | * When a writer acquires a rwsem, it puts its task_struct pointer |
45 | * into the owner field. It is cleared after an unlock. |
46 | * |
47 | * When a reader acquires a rwsem, it will also put its task_struct |
48 | * pointer into the owner field with the RWSEM_READER_OWNED bit set. |
49 | * On unlock, the owner field will largely be left untouched. So |
50 | * for a free or reader-owned rwsem, the owner value may contain |
51 | * information about the last reader that acquired the rwsem. |
52 | * |
53 | * That information may be helpful in debugging cases where the system |
54 | * seems to hang on a reader owned rwsem especially if only one reader |
55 | * is involved. Ideally we would like to track all the readers that own |
56 | * a rwsem, but the overhead is simply too big. |
57 | * |
58 | * A fast path reader optimistic lock stealing is supported when the rwsem |
59 | * was previously owned by a writer and the following conditions are met: |
60 | * - rwsem is not currently writer owned |
61 | * - the handoff isn't set. |
62 | */ |
63 | #define RWSEM_READER_OWNED (1UL << 0) |
64 | #define RWSEM_NONSPINNABLE (1UL << 1) |
65 | #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) |
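
/*
 * Illustrative sketch (not part of the original file): decomposing an owner
 * word into its task pointer and flag bits, mirroring what the
 * rwsem_owner_flags() helper further below does. The local variable names
 * are made up for this example.
 *
 *    unsigned long owner = atomic_long_read(&sem->owner);
 *    struct task_struct *task =
 *            (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
 *    bool reader_owned = owner & RWSEM_READER_OWNED;
 *    bool nonspinnable = owner & RWSEM_NONSPINNABLE;
 */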
66 | |
67 | #ifdef CONFIG_DEBUG_RWSEMS |
68 | # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ |
69 | if (!debug_locks_silent && \ |
70 | WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, magic = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ |
71 | #c, atomic_long_read(&(sem)->count), \ |
72 | (unsigned long) sem->magic, \ |
73 | atomic_long_read(&(sem)->owner), (long)current, \ |
74 | list_empty(&(sem)->wait_list) ? "" : "not ")) \ |
75 | debug_locks_off(); \ |
76 | } while (0) |
77 | #else |
78 | # define DEBUG_RWSEMS_WARN_ON(c, sem) |
79 | #endif |
80 | |
81 | /* |
82 | * On 64-bit architectures, the bit definitions of the count are: |
83 | * |
84 | * Bit 0 - writer locked bit |
85 | * Bit 1 - waiters present bit |
86 | * Bit 2 - lock handoff bit |
87 | * Bits 3-7 - reserved |
88 | * Bits 8-62 - 55-bit reader count |
89 | * Bit 63 - read fail bit |
90 | * |
91 | * On 32-bit architectures, the bit definitions of the count are: |
92 | * |
93 | * Bit 0 - writer locked bit |
94 | * Bit 1 - waiters present bit |
95 | * Bit 2 - lock handoff bit |
96 | * Bits 3-7 - reserved |
97 | * Bits 8-30 - 23-bit reader count |
98 | * Bit 31 - read fail bit |
99 | * |
100 | * It is not likely that the most significant bit (read fail bit) will ever |
101 | * be set. This guard bit is still checked anyway in the down_read() fastpath |
102 | * just in case we need to use up more of the reader bits for other purposes |
103 | * in the future. |
104 | * |
105 | * atomic_long_fetch_add() is used to obtain the reader lock, whereas |
106 | * atomic_long_cmpxchg() is used to obtain the writer lock. |
107 | * |
108 | * There are three places where the lock handoff bit may be set or cleared. |
109 | * 1) rwsem_mark_wake() for readers -- set, clear |
110 | * 2) rwsem_try_write_lock() for writers -- set, clear |
111 | * 3) rwsem_del_waiter() -- clear |
112 | * |
113 | * For all the above cases, wait_lock will be held. A writer must also |
114 | * be the first one in the wait_list to be eligible for setting the handoff |
115 | * bit. So concurrent setting/clearing of handoff bit is not possible. |
116 | */ |
117 | #define RWSEM_WRITER_LOCKED (1UL << 0) |
118 | #define RWSEM_FLAG_WAITERS (1UL << 1) |
119 | #define RWSEM_FLAG_HANDOFF (1UL << 2) |
120 | #define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) |
121 | |
122 | #define RWSEM_READER_SHIFT 8 |
123 | #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) |
124 | #define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) |
125 | #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED |
126 | #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) |
127 | #define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ |
128 | RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) |
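
/*
 * Worked example (illustrative only): with the layout above, a count value
 * of ((2 << RWSEM_READER_SHIFT) | RWSEM_FLAG_WAITERS) == 0x202 means two
 * active readers with at least one waiter queued, while a value of
 * (RWSEM_WRITER_LOCKED | RWSEM_FLAG_WAITERS | RWSEM_FLAG_HANDOFF) == 0x7
 * means a writer holds the lock, waiters are queued and a handoff has been
 * requested.
 */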
129 | |
130 | /* |
131 | * All writes to owner are protected by WRITE_ONCE() to make sure that |
132 | * store tearing can't happen as optimistic spinners may read and use |
133 | * the owner value concurrently without lock. Read from owner, however, |
134 | * may not need READ_ONCE() as long as the pointer value is only used |
135 | * for comparison and isn't being dereferenced. |
136 | * |
137 | * Both rwsem_{set,clear}_owner() functions should be in the same |
138 | * preempt disable section as the atomic op that changes sem->count. |
139 | */ |
140 | static inline void rwsem_set_owner(struct rw_semaphore *sem) |
141 | { |
142 | lockdep_assert_preemption_disabled(); |
143 | atomic_long_set(&sem->owner, (long)current); |
144 | } |
145 | |
146 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) |
147 | { |
148 | lockdep_assert_preemption_disabled(); |
149 | atomic_long_set(&sem->owner, 0); |
150 | } |
151 | |
152 | /* |
153 | * Test the flags in the owner field. |
154 | */ |
155 | static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) |
156 | { |
157 | return atomic_long_read(&sem->owner) & flags; |
158 | } |
159 | |
160 | /* |
161 | * The task_struct pointer of the last owning reader will be left in |
162 | * the owner field. |
163 | * |
164 | * Note that the owner value just indicates the task has owned the rwsem |
165 | * previously; it may not be the real owner or one of the real owners |
166 | * anymore when that field is examined, so take it with a grain of salt. |
167 | * |
168 | * The reader non-spinnable bit is preserved. |
169 | */ |
170 | static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, |
171 | struct task_struct *owner) |
172 | { |
173 | unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | |
174 | (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE); |
175 | |
176 | atomic_long_set(&sem->owner, val); |
177 | } |
178 | |
179 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) |
180 | { |
181 | __rwsem_set_reader_owned(sem, current); |
182 | } |
183 | |
184 | /* |
185 | * Return true if the rwsem is owned by a reader. |
186 | */ |
187 | static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) |
188 | { |
189 | #ifdef CONFIG_DEBUG_RWSEMS |
190 | /* |
191 | * Check the count to see if it is write-locked. |
192 | */ |
193 | long count = atomic_long_read(&sem->count); |
194 | |
195 | if (count & RWSEM_WRITER_MASK) |
196 | return false; |
197 | #endif |
198 | return rwsem_test_oflags(sem, RWSEM_READER_OWNED); |
199 | } |
200 | |
201 | #ifdef CONFIG_DEBUG_RWSEMS |
202 | /* |
203 | * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there |
204 | * is a task pointer in the owner field of a reader-owned rwsem, it will be the |
205 | * real owner or one of the real owners. The only exception is when the |
206 | * unlock is done by up_read_non_owner(). |
207 | */ |
208 | static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) |
209 | { |
210 | unsigned long val = atomic_long_read(&sem->owner); |
211 | |
212 | while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { |
213 | if (atomic_long_try_cmpxchg(&sem->owner, &val, |
214 | val & RWSEM_OWNER_FLAGS_MASK)) |
215 | return; |
216 | } |
217 | } |
218 | #else |
219 | static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) |
220 | { |
221 | } |
222 | #endif |
223 | |
224 | /* |
225 | * Set the RWSEM_NONSPINNABLE bit if the RWSEM_READER_OWNED flag |
226 | * remains set. Otherwise, the operation will be aborted. |
227 | */ |
228 | static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) |
229 | { |
230 | unsigned long owner = atomic_long_read(&sem->owner); |
231 | |
232 | do { |
233 | if (!(owner & RWSEM_READER_OWNED)) |
234 | break; |
235 | if (owner & RWSEM_NONSPINNABLE) |
236 | break; |
237 | } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, |
238 | owner | RWSEM_NONSPINNABLE)); |
239 | } |
240 | |
241 | static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) |
242 | { |
243 | *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); |
244 | |
245 | if (WARN_ON_ONCE(*cntp < 0)) |
246 | rwsem_set_nonspinnable(sem); |
247 | |
248 | if (!(*cntp & RWSEM_READ_FAILED_MASK)) { |
249 | rwsem_set_reader_owned(sem); |
250 | return true; |
251 | } |
252 | |
253 | return false; |
254 | } |
255 | |
256 | static inline bool rwsem_write_trylock(struct rw_semaphore *sem) |
257 | { |
258 | long tmp = RWSEM_UNLOCKED_VALUE; |
259 | |
260 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { |
261 | rwsem_set_owner(sem); |
262 | return true; |
263 | } |
264 | |
265 | return false; |
266 | } |
267 | |
268 | /* |
269 | * Return just the real task structure pointer of the owner |
270 | */ |
271 | static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) |
272 | { |
273 | return (struct task_struct *) |
274 | (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); |
275 | } |
276 | |
277 | /* |
278 | * Return the real task structure pointer of the owner and the embedded |
279 | * flags in the owner. pflags must be non-NULL. |
280 | */ |
281 | static inline struct task_struct * |
282 | rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) |
283 | { |
284 | unsigned long owner = atomic_long_read(&sem->owner); |
285 | |
286 | *pflags = owner & RWSEM_OWNER_FLAGS_MASK; |
287 | return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); |
288 | } |
289 | |
290 | /* |
291 | * Guide to the rw_semaphore's count field. |
292 | * |
293 | * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned |
294 | * by a writer. |
295 | * |
296 | * The lock is owned by readers when |
297 | * (1) the RWSEM_WRITER_LOCKED isn't set in count, |
298 | * (2) some of the reader bits are set in count, and |
299 | * (3) the owner field has RWSEM_READER_OWNED bit set. |
300 | * |
301 | * Having some reader bits set is not enough to guarantee a reader-owned |
302 | * lock as the readers may be in the process of backing out from the count |
303 | * and a writer has just released the lock. So another writer may steal |
304 | * the lock immediately after that. |
305 | */ |
306 | |
307 | /* |
308 | * Initialize an rwsem: |
309 | */ |
310 | void __init_rwsem(struct rw_semaphore *sem, const char *name, |
311 | struct lock_class_key *key) |
312 | { |
313 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
314 | /* |
315 | * Make sure we are not reinitializing a held semaphore: |
316 | */ |
317 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); |
318 | lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); |
319 | #endif |
320 | #ifdef CONFIG_DEBUG_RWSEMS |
321 | sem->magic = sem; |
322 | #endif |
323 | atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); |
324 | raw_spin_lock_init(&sem->wait_lock); |
325 | INIT_LIST_HEAD(&sem->wait_list); |
326 | atomic_long_set(&sem->owner, 0L); |
327 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
328 | osq_lock_init(&sem->osq); |
329 | #endif |
330 | } |
331 | EXPORT_SYMBOL(__init_rwsem); |
332 | |
333 | enum rwsem_waiter_type { |
334 | RWSEM_WAITING_FOR_WRITE, |
335 | RWSEM_WAITING_FOR_READ |
336 | }; |
337 | |
338 | struct rwsem_waiter { |
339 | struct list_head list; |
340 | struct task_struct *task; |
341 | enum rwsem_waiter_type type; |
342 | unsigned long timeout; |
343 | bool handoff_set; |
344 | }; |
345 | #define rwsem_first_waiter(sem) \ |
346 | list_first_entry(&sem->wait_list, struct rwsem_waiter, list) |
347 | |
348 | enum rwsem_wake_type { |
349 | RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ |
350 | RWSEM_WAKE_READERS, /* Wake readers only */ |
351 | RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ |
352 | }; |
353 | |
354 | /* |
355 | * The typical HZ value is either 250 or 1000. So set the minimum waiting |
356 | * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait |
357 | * queue before initiating the handoff protocol. |
358 | */ |
359 | #define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) |
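
/*
 * Worked example (illustrative only): with HZ == 1000, DIV_ROUND_UP(HZ, 250)
 * gives 4 jiffies == 4ms; with HZ == 250 it gives 1 jiffy == 4ms; with
 * HZ == 100 it gives 1 jiffy == 10ms. The wait is thus never shorter than
 * one jiffy and never much shorter than 4ms.
 */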
360 | |
361 | /* |
362 | * Magic number to batch-wakeup waiting readers, even when writers are |
363 | * also present in the queue. This both limits the amount of work the |
364 | * waking thread must do and also prevents any potential counter overflow, |
365 | * however unlikely. |
366 | */ |
367 | #define MAX_READERS_WAKEUP 0x100 |
368 | |
369 | static inline void |
370 | rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) |
371 | { |
372 | lockdep_assert_held(&sem->wait_lock); |
373 | list_add_tail(&waiter->list, &sem->wait_list); |
374 | /* caller will set RWSEM_FLAG_WAITERS */ |
375 | } |
376 | |
377 | /* |
378 | * Remove a waiter from the wait_list and clear flags. |
379 | * |
380 | * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of |
381 | * this function. Modify with care. |
382 | * |
383 | * Return: true if wait_list isn't empty and false otherwise |
384 | */ |
385 | static inline bool |
386 | rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) |
387 | { |
388 | lockdep_assert_held(&sem->wait_lock); |
389 | list_del(&waiter->list); |
390 | if (likely(!list_empty(&sem->wait_list))) |
391 | return true; |
392 | |
393 | atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); |
394 | return false; |
395 | } |
396 | |
397 | /* |
398 | * handle the lock release when processes blocked on it can now run |
399 | * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must |
400 | * have been set. |
401 | * - there must be someone on the queue |
402 | * - the wait_lock must be held by the caller |
403 | * - tasks are marked for wakeup, the caller must later invoke wake_up_q() |
404 | * to actually wakeup the blocked task(s) and drop the reference count, |
405 | * preferably when the wait_lock is released |
406 | * - woken process blocks are discarded from the list after having task zeroed |
407 | * - writers are only marked woken if downgrading is false |
408 | * |
409 | * Implies rwsem_del_waiter() for all woken readers. |
410 | */ |
411 | static void rwsem_mark_wake(struct rw_semaphore *sem, |
412 | enum rwsem_wake_type wake_type, |
413 | struct wake_q_head *wake_q) |
414 | { |
415 | struct rwsem_waiter *waiter, *tmp; |
416 | long oldcount, woken = 0, adjustment = 0; |
417 | struct list_head wlist; |
418 | |
419 | lockdep_assert_held(&sem->wait_lock); |
420 | |
421 | /* |
422 | * Take a peek at the queue head waiter such that we can determine |
423 | * the wakeup(s) to perform. |
424 | */ |
425 | waiter = rwsem_first_waiter(sem); |
426 | |
427 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { |
428 | if (wake_type == RWSEM_WAKE_ANY) { |
429 | /* |
430 | * Mark writer at the front of the queue for wakeup. |
431 | * Until the task is actually awoken later by |
432 | * the caller, other writers are able to steal it. |
433 | * Readers, on the other hand, will block as they |
434 | * will notice the queued writer. |
435 | */ |
436 | wake_q_add(wake_q, waiter->task); |
437 | lockevent_inc(rwsem_wake_writer); |
438 | } |
439 | |
440 | return; |
441 | } |
442 | |
443 | /* |
444 | * No reader wakeup if there are too many of them already. |
445 | */ |
446 | if (unlikely(atomic_long_read(&sem->count) < 0)) |
447 | return; |
448 | |
449 | /* |
450 | * Writers might steal the lock before we grant it to the next reader. |
451 | * We prefer to do the first reader grant before counting readers |
452 | * so we can bail out early if a writer stole the lock. |
453 | */ |
454 | if (wake_type != RWSEM_WAKE_READ_OWNED) { |
455 | struct task_struct *owner; |
456 | |
457 | adjustment = RWSEM_READER_BIAS; |
458 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); |
459 | if (unlikely(oldcount & RWSEM_WRITER_MASK)) { |
460 | /* |
461 | * When we've been waiting "too" long (for writers |
462 | * to give up the lock), request a HANDOFF to |
463 | * force the issue. |
464 | */ |
465 | if (time_after(jiffies, waiter->timeout)) { |
466 | if (!(oldcount & RWSEM_FLAG_HANDOFF)) { |
467 | adjustment -= RWSEM_FLAG_HANDOFF; |
468 | lockevent_inc(rwsem_rlock_handoff); |
469 | } |
470 | waiter->handoff_set = true; |
471 | } |
472 | |
473 | atomic_long_add(-adjustment, &sem->count); |
474 | return; |
475 | } |
476 | /* |
477 | * Set it to reader-owned to give spinners an early |
478 | * indication that readers now have the lock. |
479 | * The reader nonspinnable bit seen at slowpath entry of |
480 | * the reader is copied over. |
481 | */ |
482 | owner = waiter->task; |
483 | __rwsem_set_reader_owned(sem, owner); |
484 | } |
485 | |
486 | /* |
487 | * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the |
488 | * queue. We know that the woken will be at least 1 as we accounted |
489 | * for above. Note we increment the 'active part' of the count by the |
490 | * number of readers before waking any processes up. |
491 | * |
492 | * This is an adaptation of the phase-fair R/W locks where at the |
493 | * reader phase (first waiter is a reader), all readers are eligible |
494 | * to acquire the lock at the same time irrespective of their order |
495 | * in the queue. The writers acquire the lock according to their |
496 | * order in the queue. |
497 | * |
498 | * We have to do wakeup in 2 passes to prevent the possibility that |
499 | * the reader count may be decremented before it is incremented. This |
500 | * is because the to-be-woken waiter may not have slept yet. So it |
501 | * may see waiter->task cleared, finish its critical section and |
502 | * do an unlock before the reader count increment. |
503 | * |
504 | * 1) Collect the read-waiters in a separate list, count them and |
505 | * fully increment the reader count in rwsem. |
506 | * 2) For each waiter in the new list, clear waiter->task and |
507 | * put them into wake_q to be woken up later. |
508 | */ |
509 | INIT_LIST_HEAD(&wlist); |
510 | list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { |
511 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) |
512 | continue; |
513 | |
514 | woken++; |
515 | list_move_tail(&waiter->list, &wlist); |
516 | |
517 | /* |
518 | * Limit # of readers that can be woken up per wakeup call. |
519 | */ |
520 | if (unlikely(woken >= MAX_READERS_WAKEUP)) |
521 | break; |
522 | } |
523 | |
524 | adjustment = woken * RWSEM_READER_BIAS - adjustment; |
525 | lockevent_cond_inc(rwsem_wake_reader, woken); |
526 | |
527 | oldcount = atomic_long_read(&sem->count); |
528 | if (list_empty(&sem->wait_list)) { |
529 | /* |
530 | * Combined with list_move_tail() above, this implies |
531 | * rwsem_del_waiter(). |
532 | */ |
533 | adjustment -= RWSEM_FLAG_WAITERS; |
534 | if (oldcount & RWSEM_FLAG_HANDOFF) |
535 | adjustment -= RWSEM_FLAG_HANDOFF; |
536 | } else if (woken) { |
537 | /* |
538 | * When we've woken a reader, we no longer need to force |
539 | * writers to give up the lock and we can clear HANDOFF. |
540 | */ |
541 | if (oldcount & RWSEM_FLAG_HANDOFF) |
542 | adjustment -= RWSEM_FLAG_HANDOFF; |
543 | } |
544 | |
545 | if (adjustment) |
546 | atomic_long_add(adjustment, &sem->count); |
547 | |
548 | /* 2nd pass */ |
549 | list_for_each_entry_safe(waiter, tmp, &wlist, list) { |
550 | struct task_struct *tsk; |
551 | |
552 | tsk = waiter->task; |
553 | get_task_struct(tsk); |
554 | |
555 | /* |
556 | * Ensure calling get_task_struct() before setting the reader |
557 | * waiter to nil such that rwsem_down_read_slowpath() cannot |
558 | * race with do_exit() by always holding a reference count |
559 | * to the task to wakeup. |
560 | */ |
561 | smp_store_release(&waiter->task, NULL); |
562 | /* |
563 | * Ensure issuing the wakeup (either by us or someone else) |
564 | * after setting the reader waiter to nil. |
565 | */ |
566 | wake_q_add_safe(wake_q, tsk); |
567 | } |
568 | } |
569 | |
570 | /* |
571 | * Remove a waiter and try to wake up other waiters in the wait queue |
572 | * This function is called from the out_nolock path of both the reader and |
573 | * writer slowpaths with wait_lock held. It releases the wait_lock and |
574 | * optionally wakes up waiters before it returns. |
575 | */ |
576 | static inline void |
577 | rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter, |
578 | struct wake_q_head *wake_q) |
579 | __releases(&sem->wait_lock) |
580 | { |
581 | bool first = rwsem_first_waiter(sem) == waiter; |
582 | |
583 | wake_q_init(wake_q); |
584 | |
585 | /* |
586 | * If the wait_list isn't empty and the waiter to be deleted is |
587 | * the first waiter, we wake up the remaining waiters as they may |
588 | * be eligible to acquire or spin on the lock. |
589 | */ |
590 | if (rwsem_del_waiter(sem, waiter) && first) |
591 | rwsem_mark_wake(sem, RWSEM_WAKE_ANY, wake_q); |
592 | raw_spin_unlock_irq(&sem->wait_lock); |
593 | if (!wake_q_empty(wake_q)) |
594 | wake_up_q(wake_q); |
595 | } |
596 | |
597 | /* |
598 | * This function must be called with the sem->wait_lock held to prevent |
599 | * race conditions between checking the rwsem wait list and setting the |
600 | * sem->count accordingly. |
601 | * |
602 | * Implies rwsem_del_waiter() on success. |
603 | */ |
604 | static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, |
605 | struct rwsem_waiter *waiter) |
606 | { |
607 | struct rwsem_waiter *first = rwsem_first_waiter(sem); |
608 | long count, new; |
609 | |
610 | lockdep_assert_held(&sem->wait_lock); |
611 | |
612 | count = atomic_long_read(&sem->count); |
613 | do { |
614 | bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); |
615 | |
616 | if (has_handoff) { |
617 | /* |
618 | * Honor handoff bit and yield only when the first |
619 | * waiter is the one that set it. Otherwise, we |
620 | * still try to acquire the rwsem. |
621 | */ |
622 | if (first->handoff_set && (waiter != first)) |
623 | return false; |
624 | } |
625 | |
626 | new = count; |
627 | |
628 | if (count & RWSEM_LOCK_MASK) { |
629 | /* |
630 | * A waiter (first or not) can set the handoff bit |
631 | * if it is an RT task or has waited in the wait queue |
632 | * for too long. |
633 | */ |
634 | if (has_handoff || (!rt_task(waiter->task) && |
635 | !time_after(jiffies, waiter->timeout))) |
636 | return false; |
637 | |
638 | new |= RWSEM_FLAG_HANDOFF; |
639 | } else { |
640 | new |= RWSEM_WRITER_LOCKED; |
641 | new &= ~RWSEM_FLAG_HANDOFF; |
642 | |
643 | if (list_is_singular(&sem->wait_list)) |
644 | new &= ~RWSEM_FLAG_WAITERS; |
645 | } |
646 | } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); |
647 | |
648 | /* |
649 | * We have either acquired the lock with handoff bit cleared or set |
650 | * the handoff bit. Only the first waiter can have its handoff_set |
651 | * set here to enable optimistic spinning in slowpath loop. |
652 | */ |
653 | if (new & RWSEM_FLAG_HANDOFF) { |
654 | first->handoff_set = true; |
655 | lockevent_inc(rwsem_wlock_handoff); |
656 | return false; |
657 | } |
658 | |
659 | /* |
660 | * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on |
661 | * success. |
662 | */ |
663 | list_del(&waiter->list); |
664 | rwsem_set_owner(sem); |
665 | return true; |
666 | } |
667 | |
668 | /* |
669 | * The rwsem_spin_on_owner() function returns the following 4 values |
670 | * depending on the lock owner state. |
671 | * OWNER_NULL : owner is currently NULL |
672 | * OWNER_WRITER: when owner changes and is a writer |
673 | * OWNER_READER: when owner changes and the new owner may be a reader. |
674 | * OWNER_NONSPINNABLE: |
675 | * when optimistic spinning has to stop because either the |
676 | * owner stops running, is unknown, or its timeslice has |
677 | * been used up. |
678 | */ |
679 | enum owner_state { |
680 | OWNER_NULL = 1 << 0, |
681 | OWNER_WRITER = 1 << 1, |
682 | OWNER_READER = 1 << 2, |
683 | OWNER_NONSPINNABLE = 1 << 3, |
684 | }; |
685 | |
686 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER |
687 | /* |
688 | * Try to acquire write lock before the writer has been put on wait queue. |
689 | */ |
690 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) |
691 | { |
692 | long count = atomic_long_read(&sem->count); |
693 | |
694 | while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { |
695 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, |
696 | count | RWSEM_WRITER_LOCKED)) { |
697 | rwsem_set_owner(sem); |
698 | lockevent_inc(rwsem_opt_lock); |
699 | return true; |
700 | } |
701 | } |
702 | return false; |
703 | } |
704 | |
705 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) |
706 | { |
707 | struct task_struct *owner; |
708 | unsigned long flags; |
709 | bool ret = true; |
710 | |
711 | if (need_resched()) { |
712 | lockevent_inc(rwsem_opt_fail); |
713 | return false; |
714 | } |
715 | |
716 | /* |
717 | * Disabling preemption is equivalent to an RCU read-side critical |
718 | * section, thus the task_struct structure won't go away. |
719 | */ |
720 | owner = rwsem_owner_flags(sem, &flags); |
721 | /* |
722 | * Don't check the read-owner as the entry may be stale. |
723 | */ |
724 | if ((flags & RWSEM_NONSPINNABLE) || |
725 | (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) |
726 | ret = false; |
727 | |
728 | lockevent_cond_inc(rwsem_opt_fail, !ret); |
729 | return ret; |
730 | } |
731 | |
732 | #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) |
733 | |
734 | static inline enum owner_state |
735 | rwsem_owner_state(struct task_struct *owner, unsigned long flags) |
736 | { |
737 | if (flags & RWSEM_NONSPINNABLE) |
738 | return OWNER_NONSPINNABLE; |
739 | |
740 | if (flags & RWSEM_READER_OWNED) |
741 | return OWNER_READER; |
742 | |
743 | return owner ? OWNER_WRITER : OWNER_NULL; |
744 | } |
745 | |
746 | static noinline enum owner_state |
747 | rwsem_spin_on_owner(struct rw_semaphore *sem) |
748 | { |
749 | struct task_struct *new, *owner; |
750 | unsigned long flags, new_flags; |
751 | enum owner_state state; |
752 | |
753 | lockdep_assert_preemption_disabled(); |
754 | |
755 | owner = rwsem_owner_flags(sem, &flags); |
756 | state = rwsem_owner_state(owner, flags); |
757 | if (state != OWNER_WRITER) |
758 | return state; |
759 | |
760 | for (;;) { |
761 | /* |
762 | * When a waiting writer set the handoff flag, it may spin |
763 | * on the owner as well. Once that writer acquires the lock, |
764 | * we can spin on it. So we don't need to quit even when the |
765 | * handoff bit is set. |
766 | */ |
767 | new = rwsem_owner_flags(sem, &new_flags); |
768 | if ((new != owner) || (new_flags != flags)) { |
769 | state = rwsem_owner_state(new, new_flags); |
770 | break; |
771 | } |
772 | |
773 | /* |
774 | * Ensure we emit the owner->on_cpu dereference _after_ |
775 | * checking that sem->owner still matches owner. If that fails, |
776 | * owner might point to free()d memory. If it still matches, |
777 | * our spinning context has already disabled preemption, which |
778 | * is equivalent to an RCU read-side critical section and |
779 | * ensures the memory stays valid. |
780 | */ |
781 | barrier(); |
782 | |
783 | if (need_resched() || !owner_on_cpu(owner)) { |
784 | state = OWNER_NONSPINNABLE; |
785 | break; |
786 | } |
787 | |
788 | cpu_relax(); |
789 | } |
790 | |
791 | return state; |
792 | } |
793 | |
794 | /* |
795 | * Calculate reader-owned rwsem spinning threshold for writer |
796 | * |
797 | * The more readers own the rwsem, the longer it will take for them to |
798 | * wind down and free the rwsem. So the empirical formula used to |
799 | * determine the actual spinning time limit here is: |
800 | * |
801 | * Spinning threshold = (10 + nr_readers/2)us |
802 | * |
803 | * The limit is capped to a maximum of 25us (30 readers). This is just |
804 | * a heuristic and is subject to change in the future. |
805 | */ |
806 | static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) |
807 | { |
808 | long count = atomic_long_read(&sem->count); |
809 | int readers = count >> RWSEM_READER_SHIFT; |
810 | u64 delta; |
811 | |
812 | if (readers > 30) |
813 | readers = 30; |
814 | delta = (20 + readers) * NSEC_PER_USEC / 2; |
815 | |
816 | return sched_clock() + delta; |
817 | } |
818 | |
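/*
 * Worked example (illustrative only): with 8 readers holding the lock,
 * delta = (20 + 8) * NSEC_PER_USEC / 2 = 14000ns, i.e. the 10 + 8/2 = 14us
 * given by the formula above; with 30 or more readers the threshold
 * saturates at (20 + 30) * NSEC_PER_USEC / 2 = 25us.
 */
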
819 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) |
820 | { |
821 | bool taken = false; |
822 | int prev_owner_state = OWNER_NULL; |
823 | int loop = 0; |
824 | u64 rspin_threshold = 0; |
825 | |
826 | /* sem->wait_lock should not be held when doing optimistic spinning */ |
827 | if (!osq_lock(&sem->osq)) |
828 | goto done; |
829 | |
830 | /* |
831 | * Optimistically spin on the owner field and attempt to acquire the |
832 | * lock whenever the owner changes. Spinning will be stopped when: |
833 | * 1) the owning writer isn't running; or |
834 | * 2) readers own the lock and spinning time has exceeded limit. |
835 | */ |
836 | for (;;) { |
837 | enum owner_state owner_state; |
838 | |
839 | owner_state = rwsem_spin_on_owner(sem); |
840 | if (!(owner_state & OWNER_SPINNABLE)) |
841 | break; |
842 | |
843 | /* |
844 | * Try to acquire the lock |
845 | */ |
846 | taken = rwsem_try_write_lock_unqueued(sem); |
847 | |
848 | if (taken) |
849 | break; |
850 | |
851 | /* |
852 | * Time-based reader-owned rwsem optimistic spinning |
853 | */ |
854 | if (owner_state == OWNER_READER) { |
855 | /* |
856 | * Re-initialize rspin_threshold every time when |
857 | * the owner state changes from non-reader to reader. |
858 | * This allows a writer to steal the lock in between |
859 | * 2 reader phases and have the threshold reset at |
860 | * the beginning of the 2nd reader phase. |
861 | */ |
862 | if (prev_owner_state != OWNER_READER) { |
863 | if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) |
864 | break; |
865 | rspin_threshold = rwsem_rspin_threshold(sem); |
866 | loop = 0; |
867 | } |
868 | |
869 | /* |
870 | * Check time threshold once every 16 iterations to |
871 | * avoid calling sched_clock() too frequently so |
872 | * as to reduce the average latency between the times |
873 | * when the lock becomes free and when the spinner |
874 | * is ready to do a trylock. |
875 | */ |
876 | else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { |
877 | rwsem_set_nonspinnable(sem); |
878 | lockevent_inc(rwsem_opt_nospin); |
879 | break; |
880 | } |
881 | } |
882 | |
883 | /* |
884 | * An RT task cannot do optimistic spinning if it cannot |
885 | * be sure the lock holder is running or live-lock may |
886 | * happen if the current task and the lock holder happen |
887 | * to run in the same CPU. However, aborting optimistic |
888 | * spinning while a NULL owner is detected may miss some |
889 | * opportunities where spinning can continue without causing |
890 | * problems. |
891 | * |
892 | * There are 2 possible cases where an RT task may be able |
893 | * to continue spinning. |
894 | * |
895 | * 1) The lock owner is in the process of releasing the |
896 | * lock, sem->owner is cleared but the lock has not |
897 | * been released yet. |
898 | * 2) The lock was free and owner cleared, but another |
899 | * task just comes in and acquires the lock before |
900 | * we try to get it. The new owner may be a spinnable |
901 | * writer. |
902 | * |
903 | * To take advantage of two scenarios listed above, the RT |
904 | * task is made to retry one more time to see if it can |
905 | * acquire the lock or continue spinning on the new owning |
906 | * writer. Of course, if the time lag is long enough or the |
907 | * new owner is not a writer or spinnable, the RT task will |
908 | * quit spinning. |
909 | * |
910 | * If the owner is a writer, the need_resched() check is |
911 | * done inside rwsem_spin_on_owner(). If the owner is not |
912 | * a writer, need_resched() check needs to be done here. |
913 | */ |
914 | if (owner_state != OWNER_WRITER) { |
915 | if (need_resched()) |
916 | break; |
917 | if (rt_task(current) && |
918 | (prev_owner_state != OWNER_WRITER)) |
919 | break; |
920 | } |
921 | prev_owner_state = owner_state; |
922 | |
923 | /* |
924 | * The cpu_relax() call is a compiler barrier which forces |
925 | * everything in this loop to be re-loaded. We don't need |
926 | * memory barriers as we'll eventually observe the right |
927 | * values at the cost of a few extra spins. |
928 | */ |
929 | cpu_relax(); |
930 | } |
931 | osq_unlock(&sem->osq); |
932 | done: |
933 | lockevent_cond_inc(rwsem_opt_fail, !taken); |
934 | return taken; |
935 | } |
936 | |
937 | /* |
938 | * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should |
939 | * only be called when the reader count reaches 0. |
940 | */ |
941 | static inline void clear_nonspinnable(struct rw_semaphore *sem) |
942 | { |
943 | if (unlikely(rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))) |
944 | atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner); |
945 | } |
946 | |
947 | #else |
948 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) |
949 | { |
950 | return false; |
951 | } |
952 | |
953 | static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem) |
954 | { |
955 | return false; |
956 | } |
957 | |
958 | static inline void clear_nonspinnable(struct rw_semaphore *sem) { } |
959 | |
960 | static inline enum owner_state |
961 | rwsem_spin_on_owner(struct rw_semaphore *sem) |
962 | { |
963 | return OWNER_NONSPINNABLE; |
964 | } |
965 | #endif |
966 | |
967 | /* |
968 | * Prepare to wake up waiter(s) in the wait queue by putting them into the |
969 | * given wake_q if the rwsem lock owner isn't a writer. If rwsem is likely |
970 | * reader-owned, wake up read lock waiters in queue front or wake up any |
971 | * front waiter otherwise. |
972 | * |
973 | * This is being called from both reader and writer slow paths. |
974 | */ |
975 | static inline void rwsem_cond_wake_waiter(struct rw_semaphore *sem, long count, |
976 | struct wake_q_head *wake_q) |
977 | { |
978 | enum rwsem_wake_type wake_type; |
979 | |
980 | if (count & RWSEM_WRITER_MASK) |
981 | return; |
982 | |
983 | if (count & RWSEM_READER_MASK) { |
984 | wake_type = RWSEM_WAKE_READERS; |
985 | } else { |
986 | wake_type = RWSEM_WAKE_ANY; |
987 | clear_nonspinnable(sem); |
988 | } |
989 | rwsem_mark_wake(sem, wake_type, wake_q); |
990 | } |
991 | |
992 | /* |
993 | * Wait for the read lock to be granted |
994 | */ |
995 | static struct rw_semaphore __sched * |
996 | rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int state) |
997 | { |
998 | long adjustment = -RWSEM_READER_BIAS; |
999 | long rcnt = (count >> RWSEM_READER_SHIFT); |
1000 | struct rwsem_waiter waiter; |
1001 | DEFINE_WAKE_Q(wake_q); |
1002 | |
1003 | /* |
1004 | * To prevent a constant stream of readers from starving a sleeping |
1005 | * writer, don't attempt optimistic lock stealing if the lock is |
1006 | * very likely owned by readers. |
1007 | */ |
1008 | if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) && |
1009 | (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED)) |
1010 | goto queue; |
1011 | |
1012 | /* |
1013 | * Reader optimistic lock stealing. |
1014 | */ |
1015 | if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) { |
1016 | rwsem_set_reader_owned(sem); |
1017 | lockevent_inc(rwsem_rlock_steal); |
1018 | |
1019 | /* |
1020 | * Wake up other readers in the wait queue if it is |
1021 | * the first reader. |
1022 | */ |
1023 | if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { |
1024 | raw_spin_lock_irq(&sem->wait_lock); |
1025 | if (!list_empty(&sem->wait_list)) |
1026 | rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, |
1027 | &wake_q); |
1028 | raw_spin_unlock_irq(&sem->wait_lock); |
1029 | wake_up_q(&wake_q); |
1030 | } |
1031 | return sem; |
1032 | } |
1033 | |
1034 | queue: |
1035 | waiter.task = current; |
1036 | waiter.type = RWSEM_WAITING_FOR_READ; |
1037 | waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; |
1038 | waiter.handoff_set = false; |
1039 | |
1040 | raw_spin_lock_irq(&sem->wait_lock); |
1041 | if (list_empty(&sem->wait_list)) { |
1042 | /* |
1043 | * In case the wait queue is empty and the lock isn't owned |
1044 | * by a writer, this reader can exit the slowpath and return |
1045 | * immediately as its RWSEM_READER_BIAS has already been set |
1046 | * in the count. |
1047 | */ |
1048 | if (!(atomic_long_read(&sem->count) & RWSEM_WRITER_MASK)) { |
1049 | /* Provide lock ACQUIRE */ |
1050 | smp_acquire__after_ctrl_dep(); |
1051 | raw_spin_unlock_irq(&sem->wait_lock); |
1052 | rwsem_set_reader_owned(sem); |
1053 | lockevent_inc(rwsem_rlock_fast); |
1054 | return sem; |
1055 | } |
1056 | adjustment += RWSEM_FLAG_WAITERS; |
1057 | } |
1058 | rwsem_add_waiter(sem, &waiter); |
1059 | |
1060 | /* we're now waiting on the lock, but no longer actively locking */ |
1061 | count = atomic_long_add_return(adjustment, &sem->count); |
1062 | |
1063 | rwsem_cond_wake_waiter(sem, count, &wake_q); |
1064 | raw_spin_unlock_irq(&sem->wait_lock); |
1065 | |
1066 | if (!wake_q_empty(&wake_q)) |
1067 | wake_up_q(&wake_q); |
1068 | |
1069 | trace_contention_begin(sem, LCB_F_READ); |
1070 | |
1071 | /* wait to be given the lock */ |
1072 | for (;;) { |
1073 | set_current_state(state); |
1074 | if (!smp_load_acquire(&waiter.task)) { |
1075 | /* Matches rwsem_mark_wake()'s smp_store_release(). */ |
1076 | break; |
1077 | } |
1078 | if (signal_pending_state(state, current)) { |
1079 | raw_spin_lock_irq(&sem->wait_lock); |
1080 | if (waiter.task) |
1081 | goto out_nolock; |
1082 | raw_spin_unlock_irq(&sem->wait_lock); |
1083 | /* Ordered by sem->wait_lock against rwsem_mark_wake(). */ |
1084 | break; |
1085 | } |
1086 | schedule_preempt_disabled(); |
1087 | lockevent_inc(rwsem_sleep_reader); |
1088 | } |
1089 | |
1090 | __set_current_state(TASK_RUNNING); |
1091 | lockevent_inc(rwsem_rlock); |
1092 | trace_contention_end(sem, 0); |
1093 | return sem; |
1094 | |
1095 | out_nolock: |
1096 | rwsem_del_wake_waiter(sem, &waiter, &wake_q); |
1097 | __set_current_state(TASK_RUNNING); |
1098 | lockevent_inc(rwsem_rlock_fail); |
1099 | trace_contention_end(sem, -EINTR); |
1100 | return ERR_PTR(-EINTR); |
1101 | } |
1102 | |
1103 | /* |
1104 | * Wait until we successfully acquire the write lock |
1105 | */ |
1106 | static struct rw_semaphore __sched * |
1107 | rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) |
1108 | { |
1109 | struct rwsem_waiter waiter; |
1110 | DEFINE_WAKE_Q(wake_q); |
1111 | |
1112 | /* do optimistic spinning and steal lock if possible */ |
1113 | if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) { |
1114 | /* rwsem_optimistic_spin() implies ACQUIRE on success */ |
1115 | return sem; |
1116 | } |
1117 | |
1118 | /* |
1119 | * Optimistic spinning failed, proceed to the slowpath |
1120 | * and block until we can acquire the sem. |
1121 | */ |
1122 | waiter.task = current; |
1123 | waiter.type = RWSEM_WAITING_FOR_WRITE; |
1124 | waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; |
1125 | waiter.handoff_set = false; |
1126 | |
1127 | raw_spin_lock_irq(&sem->wait_lock); |
1128 | rwsem_add_waiter(sem, &waiter); |
1129 | |
1130 | /* we're now waiting on the lock */ |
1131 | if (rwsem_first_waiter(sem) != &waiter) { |
1132 | rwsem_cond_wake_waiter(sem, atomic_long_read(&sem->count), |
1133 | &wake_q); |
1134 | if (!wake_q_empty(&wake_q)) { |
1135 | /* |
1136 | * We want to minimize wait_lock hold time especially |
1137 | * when a large number of readers are to be woken up. |
1138 | */ |
1139 | raw_spin_unlock_irq(&sem->wait_lock); |
1140 | wake_up_q(&wake_q); |
1141 | raw_spin_lock_irq(&sem->wait_lock); |
1142 | } |
1143 | } else { |
1144 | atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); |
1145 | } |
1146 | |
1147 | /* wait until we successfully acquire the lock */ |
1148 | set_current_state(state); |
1149 | trace_contention_begin(sem, LCB_F_WRITE); |
1150 | |
1151 | for (;;) { |
1152 | if (rwsem_try_write_lock(sem, &waiter)) { |
1153 | /* rwsem_try_write_lock() implies ACQUIRE on success */ |
1154 | break; |
1155 | } |
1156 | |
1157 | raw_spin_unlock_irq(&sem->wait_lock); |
1158 | |
1159 | if (signal_pending_state(state, current)) |
1160 | goto out_nolock; |
1161 | |
1162 | /* |
1163 | * After setting the handoff bit and failing to acquire |
1164 | * the lock, attempt to spin on owner to accelerate lock |
1165 | * transfer. If the previous owner is an on-cpu writer and it |
1166 | * has just released the lock, OWNER_NULL will be returned. |
1167 | * In this case, we attempt to acquire the lock again |
1168 | * without sleeping. |
1169 | */ |
1170 | if (waiter.handoff_set) { |
1171 | enum owner_state owner_state; |
1172 | |
1173 | owner_state = rwsem_spin_on_owner(sem); |
1174 | if (owner_state == OWNER_NULL) |
1175 | goto trylock_again; |
1176 | } |
1177 | |
1178 | schedule_preempt_disabled(); |
1179 | lockevent_inc(rwsem_sleep_writer); |
1180 | set_current_state(state); |
1181 | trylock_again: |
1182 | raw_spin_lock_irq(&sem->wait_lock); |
1183 | } |
1184 | __set_current_state(TASK_RUNNING); |
1185 | raw_spin_unlock_irq(&sem->wait_lock); |
1186 | lockevent_inc(rwsem_wlock); |
1187 | trace_contention_end(sem, 0); |
1188 | return sem; |
1189 | |
1190 | out_nolock: |
1191 | __set_current_state(TASK_RUNNING); |
1192 | raw_spin_lock_irq(&sem->wait_lock); |
1193 | rwsem_del_wake_waiter(sem, &waiter, &wake_q); |
1194 | lockevent_inc(rwsem_wlock_fail); |
1195 | trace_contention_end(sem, -EINTR); |
1196 | return ERR_PTR(-EINTR); |
1197 | } |
1198 | |
1199 | /* |
1200 | * handle waking up a waiter on the semaphore |
1201 | * - up_read/up_write has decremented the active part of count if we come here |
1202 | */ |
1203 | static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) |
1204 | { |
1205 | unsigned long flags; |
1206 | DEFINE_WAKE_Q(wake_q); |
1207 | |
1208 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
1209 | |
1210 | if (!list_empty(&sem->wait_list)) |
1211 | rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); |
1212 | |
1213 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
1214 | wake_up_q(&wake_q); |
1215 | |
1216 | return sem; |
1217 | } |
1218 | |
1219 | /* |
1220 | * downgrade a write lock into a read lock |
1221 | * - caller incremented waiting part of count and discovered it still negative |
1222 | * - just wake up any readers at the front of the queue |
1223 | */ |
1224 | static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) |
1225 | { |
1226 | unsigned long flags; |
1227 | DEFINE_WAKE_Q(wake_q); |
1228 | |
1229 | raw_spin_lock_irqsave(&sem->wait_lock, flags); |
1230 | |
1231 | if (!list_empty(&sem->wait_list)) |
1232 | rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); |
1233 | |
1234 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); |
1235 | wake_up_q(&wake_q); |
1236 | |
1237 | return sem; |
1238 | } |
1239 | |
1240 | /* |
1241 | * lock for reading |
1242 | */ |
1243 | static __always_inline int __down_read_common(struct rw_semaphore *sem, int state) |
1244 | { |
1245 | int ret = 0; |
1246 | long count; |
1247 | |
1248 | preempt_disable(); |
1249 | if (!rwsem_read_trylock(sem, &count)) { |
1250 | if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) { |
1251 | ret = -EINTR; |
1252 | goto out; |
1253 | } |
1254 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); |
1255 | } |
1256 | out: |
1257 | preempt_enable(); |
1258 | return ret; |
1259 | } |
1260 | |
1261 | static __always_inline void __down_read(struct rw_semaphore *sem) |
1262 | { |
1263 | __down_read_common(sem, TASK_UNINTERRUPTIBLE); |
1264 | } |
1265 | |
1266 | static __always_inline int __down_read_interruptible(struct rw_semaphore *sem) |
1267 | { |
1268 | return __down_read_common(sem, TASK_INTERRUPTIBLE); |
1269 | } |
1270 | |
1271 | static __always_inline int __down_read_killable(struct rw_semaphore *sem) |
1272 | { |
1273 | return __down_read_common(sem, TASK_KILLABLE); |
1274 | } |
1275 | |
1276 | static inline int __down_read_trylock(struct rw_semaphore *sem) |
1277 | { |
1278 | int ret = 0; |
1279 | long tmp; |
1280 | |
1281 | DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); |
1282 | |
1283 | preempt_disable(); |
1284 | tmp = atomic_long_read(&sem->count); |
1285 | while (!(tmp & RWSEM_READ_FAILED_MASK)) { |
1286 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, |
1287 | tmp + RWSEM_READER_BIAS)) { |
1288 | rwsem_set_reader_owned(sem); |
1289 | ret = 1; |
1290 | break; |
1291 | } |
1292 | } |
1293 | preempt_enable(); |
1294 | return ret; |
1295 | } |
1296 | |
1297 | /* |
1298 | * lock for writing |
1299 | */ |
1300 | static inline int __down_write_common(struct rw_semaphore *sem, int state) |
1301 | { |
1302 | int ret = 0; |
1303 | |
1304 | preempt_disable(); |
1305 | if (unlikely(!rwsem_write_trylock(sem))) { |
1306 | if (IS_ERR(rwsem_down_write_slowpath(sem, state))) |
1307 | ret = -EINTR; |
1308 | } |
1309 | preempt_enable(); |
1310 | return ret; |
1311 | } |
1312 | |
1313 | static inline void __down_write(struct rw_semaphore *sem) |
1314 | { |
1315 | __down_write_common(sem, TASK_UNINTERRUPTIBLE); |
1316 | } |
1317 | |
1318 | static inline int __down_write_killable(struct rw_semaphore *sem) |
1319 | { |
1320 | return __down_write_common(sem, TASK_KILLABLE); |
1321 | } |
1322 | |
1323 | static inline int __down_write_trylock(struct rw_semaphore *sem) |
1324 | { |
1325 | int ret; |
1326 | |
1327 | preempt_disable(); |
1328 | DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); |
1329 | ret = rwsem_write_trylock(sem); |
1330 | preempt_enable(); |
1331 | |
1332 | return ret; |
1333 | } |
1334 | |
1335 | /* |
1336 | * unlock after reading |
1337 | */ |
1338 | static inline void __up_read(struct rw_semaphore *sem) |
1339 | { |
1340 | long tmp; |
1341 | |
1342 | DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); |
1343 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); |
1344 | |
1345 | preempt_disable(); |
1346 | rwsem_clear_reader_owned(sem); |
1347 | tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); |
1348 | DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); |
1349 | if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == |
1350 | RWSEM_FLAG_WAITERS)) { |
1351 | clear_nonspinnable(sem); |
1352 | rwsem_wake(sem); |
1353 | } |
1354 | preempt_enable(); |
1355 | } |
1356 | |
1357 | /* |
1358 | * unlock after writing |
1359 | */ |
1360 | static inline void __up_write(struct rw_semaphore *sem) |
1361 | { |
1362 | long tmp; |
1363 | |
1364 | DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); |
1365 | /* |
1366 | * sem->owner may differ from current if the ownership is transferred |
1367 | * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. |
1368 | */ |
1369 | DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && |
1370 | !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); |
1371 | |
1372 | preempt_disable(); |
1373 | rwsem_clear_owner(sem); |
1374 | tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); |
1375 | if (unlikely(tmp & RWSEM_FLAG_WAITERS)) |
1376 | rwsem_wake(sem); |
1377 | preempt_enable(); |
1378 | } |
1379 | |
1380 | /* |
1381 | * downgrade write lock to read lock |
1382 | */ |
1383 | static inline void __downgrade_write(struct rw_semaphore *sem) |
1384 | { |
1385 | long tmp; |
1386 | |
1387 | /* |
1388 | * When downgrading from exclusive to shared ownership, |
1389 | * anything inside the write-locked region cannot leak |
1390 | * into the read side. In contrast, anything in the |
1391 | * read-locked region is ok to be re-ordered into the |
1392 | * write side. As such, rely on RELEASE semantics. |
1393 | */ |
1394 | DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); |
1395 | preempt_disable(); |
1396 | tmp = atomic_long_fetch_add_release( |
1397 | -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); |
1398 | rwsem_set_reader_owned(sem); |
1399 | if (tmp & RWSEM_FLAG_WAITERS) |
1400 | rwsem_downgrade_wake(sem); |
1401 | preempt_enable(); |
1402 | } |
1403 | |
1404 | #else /* !CONFIG_PREEMPT_RT */ |
1405 | |
1406 | #define RT_MUTEX_BUILD_MUTEX |
1407 | #include "rtmutex.c" |
1408 | |
1409 | #define rwbase_set_and_save_current_state(state) \ |
1410 | set_current_state(state) |
1411 | |
1412 | #define rwbase_restore_current_state() \ |
1413 | __set_current_state(TASK_RUNNING) |
1414 | |
1415 | #define rwbase_rtmutex_lock_state(rtm, state) \ |
1416 | __rt_mutex_lock(rtm, state) |
1417 | |
1418 | #define rwbase_rtmutex_slowlock_locked(rtm, state) \ |
1419 | __rt_mutex_slowlock_locked(rtm, NULL, state) |
1420 | |
1421 | #define rwbase_rtmutex_unlock(rtm) \ |
1422 | __rt_mutex_unlock(rtm) |
1423 | |
1424 | #define rwbase_rtmutex_trylock(rtm) \ |
1425 | __rt_mutex_trylock(rtm) |
1426 | |
1427 | #define rwbase_signal_pending_state(state, current) \ |
1428 | signal_pending_state(state, current) |
1429 | |
1430 | #define rwbase_pre_schedule() \ |
1431 | rt_mutex_pre_schedule() |
1432 | |
1433 | #define rwbase_schedule() \ |
1434 | rt_mutex_schedule() |
1435 | |
1436 | #define rwbase_post_schedule() \ |
1437 | rt_mutex_post_schedule() |
1438 | |
1439 | #include "rwbase_rt.c" |
1440 | |
1441 | void __init_rwsem(struct rw_semaphore *sem, const char *name, |
1442 | struct lock_class_key *key) |
1443 | { |
1444 | init_rwbase_rt(&(sem)->rwbase); |
1445 | |
1446 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
1447 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); |
1448 | lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); |
1449 | #endif |
1450 | } |
1451 | EXPORT_SYMBOL(__init_rwsem); |
1452 | |
1453 | static inline void __down_read(struct rw_semaphore *sem) |
1454 | { |
1455 | rwbase_read_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); |
1456 | } |
1457 | |
1458 | static inline int __down_read_interruptible(struct rw_semaphore *sem) |
1459 | { |
1460 | return rwbase_read_lock(&sem->rwbase, TASK_INTERRUPTIBLE); |
1461 | } |
1462 | |
1463 | static inline int __down_read_killable(struct rw_semaphore *sem) |
1464 | { |
1465 | return rwbase_read_lock(&sem->rwbase, TASK_KILLABLE); |
1466 | } |
1467 | |
1468 | static inline int __down_read_trylock(struct rw_semaphore *sem) |
1469 | { |
1470 | return rwbase_read_trylock(&sem->rwbase); |
1471 | } |
1472 | |
1473 | static inline void __up_read(struct rw_semaphore *sem) |
1474 | { |
1475 | rwbase_read_unlock(&sem->rwbase, TASK_NORMAL); |
1476 | } |
1477 | |
1478 | static inline void __sched __down_write(struct rw_semaphore *sem) |
1479 | { |
1480 | rwbase_write_lock(&sem->rwbase, TASK_UNINTERRUPTIBLE); |
1481 | } |
1482 | |
1483 | static inline int __sched __down_write_killable(struct rw_semaphore *sem) |
1484 | { |
1485 | return rwbase_write_lock(&sem->rwbase, TASK_KILLABLE); |
1486 | } |
1487 | |
1488 | static inline int __down_write_trylock(struct rw_semaphore *sem) |
1489 | { |
1490 | return rwbase_write_trylock(&sem->rwbase); |
1491 | } |
1492 | |
1493 | static inline void __up_write(struct rw_semaphore *sem) |
1494 | { |
1495 | rwbase_write_unlock(&sem->rwbase); |
1496 | } |
1497 | |
1498 | static inline void __downgrade_write(struct rw_semaphore *sem) |
1499 | { |
1500 | rwbase_write_downgrade(&sem->rwbase); |
1501 | } |
1502 | |
1503 | /* Debug stubs for the common API */ |
1504 | #define DEBUG_RWSEMS_WARN_ON(c, sem) |
1505 | |
1506 | static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, |
1507 | struct task_struct *owner) |
1508 | { |
1509 | } |
1510 | |
1511 | static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) |
1512 | { |
1513 | int count = atomic_read(&sem->rwbase.readers); |
1514 | |
1515 | return count < 0 && count != READER_BIAS; |
1516 | } |
1517 | |
1518 | #endif /* CONFIG_PREEMPT_RT */ |
1519 | |
1520 | /* |
1521 | * lock for reading |
1522 | */ |
1523 | void __sched down_read(struct rw_semaphore *sem) |
1524 | { |
1525 | might_sleep(); |
1526 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
1527 | |
1528 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
1529 | } |
1530 | EXPORT_SYMBOL(down_read); |
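
/*
 * Illustrative usage sketch (not part of this file; the semaphore and data
 * names are made up): a typical reader pairs down_read() with up_read()
 * around accesses to state that writers update under down_write()/up_write().
 *
 *    static DECLARE_RWSEM(example_sem);
 *    static int example_value;
 *
 *    int example_read(void)
 *    {
 *            int val;
 *
 *            down_read(&example_sem);
 *            val = example_value;
 *            up_read(&example_sem);
 *            return val;
 *    }
 */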
1531 | |
1532 | int __sched down_read_interruptible(struct rw_semaphore *sem) |
1533 | { |
1534 | might_sleep(); |
1535 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
1536 | |
1537 | if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { |
1538 | rwsem_release(&sem->dep_map, _RET_IP_); |
1539 | return -EINTR; |
1540 | } |
1541 | |
1542 | return 0; |
1543 | } |
1544 | EXPORT_SYMBOL(down_read_interruptible); |
1545 | |
1546 | int __sched down_read_killable(struct rw_semaphore *sem) |
1547 | { |
1548 | might_sleep(); |
1549 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
1550 | |
1551 | if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { |
1552 | rwsem_release(&sem->dep_map, _RET_IP_); |
1553 | return -EINTR; |
1554 | } |
1555 | |
1556 | return 0; |
1557 | } |
1558 | EXPORT_SYMBOL(down_read_killable); |
1559 | |
1560 | /* |
1561 | * trylock for reading -- returns 1 if successful, 0 if contention |
1562 | */ |
1563 | int down_read_trylock(struct rw_semaphore *sem) |
1564 | { |
1565 | int ret = __down_read_trylock(sem); |
1566 | |
1567 | if (ret == 1) |
1568 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); |
1569 | return ret; |
1570 | } |
1571 | EXPORT_SYMBOL(down_read_trylock); |
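
/*
 * Illustrative usage sketch (not part of this file; example_sem,
 * example_value and examine() are made-up names): down_read_trylock()
 * returns 1 on success and 0 on contention, so a caller that must not
 * block can fall back to other work instead of sleeping.
 *
 *    if (down_read_trylock(&example_sem)) {
 *            examine(example_value);
 *            up_read(&example_sem);
 *    }
 */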
1572 | |
1573 | /* |
1574 | * lock for writing |
1575 | */ |
1576 | void __sched down_write(struct rw_semaphore *sem) |
1577 | { |
1578 | might_sleep(); |
1579 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
1580 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
1581 | } |
1582 | EXPORT_SYMBOL(down_write); |
1583 | |
1584 | /* |
1585 | * lock for writing |
1586 | */ |
1587 | int __sched down_write_killable(struct rw_semaphore *sem) |
1588 | { |
1589 | might_sleep(); |
1590 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
1591 | |
1592 | if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, |
1593 | __down_write_killable)) { |
1594 | rwsem_release(&sem->dep_map, _RET_IP_); |
1595 | return -EINTR; |
1596 | } |
1597 | |
1598 | return 0; |
1599 | } |
1600 | EXPORT_SYMBOL(down_write_killable); |
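
/*
 * Illustrative usage sketch (not part of this file; names are made up):
 * down_write_killable() returns -EINTR if a fatal signal arrives while
 * waiting, so the caller must check the return value before touching the
 * protected state.
 *
 *    if (down_write_killable(&example_sem))
 *            return -EINTR;
 *    example_value = 0;
 *    up_write(&example_sem);
 */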
1601 | |
1602 | /* |
1603 | * trylock for writing -- returns 1 if successful, 0 if contention |
1604 | */ |
1605 | int down_write_trylock(struct rw_semaphore *sem) |
1606 | { |
1607 | int ret = __down_write_trylock(sem); |
1608 | |
1609 | if (ret == 1) |
1610 | rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); |
1611 | |
1612 | return ret; |
1613 | } |
1614 | EXPORT_SYMBOL(down_write_trylock); |
1615 | |
1616 | /* |
1617 | * release a read lock |
1618 | */ |
1619 | void up_read(struct rw_semaphore *sem) |
1620 | { |
1621 | rwsem_release(&sem->dep_map, _RET_IP_); |
1622 | __up_read(sem); |
1623 | } |
1624 | EXPORT_SYMBOL(up_read); |
1625 | |
1626 | /* |
1627 | * release a write lock |
1628 | */ |
1629 | void up_write(struct rw_semaphore *sem) |
1630 | { |
1631 | rwsem_release(&sem->dep_map, _RET_IP_); |
1632 | __up_write(sem); |
1633 | } |
1634 | EXPORT_SYMBOL(up_write); |
1635 | |
1636 | /* |
1637 | * downgrade write lock to read lock |
1638 | */ |
1639 | void downgrade_write(struct rw_semaphore *sem) |
1640 | { |
1641 | lock_downgrade(&sem->dep_map, _RET_IP_); |
1642 | __downgrade_write(sem); |
1643 | } |
1644 | EXPORT_SYMBOL(downgrade_write); |
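
/*
 * Illustrative usage sketch (not part of this file; example_sem,
 * compute_new_value() and consume() are hypothetical names): downgrade_write()
 * lets a writer publish an update and then keep reading it without an
 * unlock/lock window that another writer could slip into.
 *
 *    down_write(&example_sem);
 *    example_value = compute_new_value();
 *    downgrade_write(&example_sem);
 *    consume(example_value);
 *    up_read(&example_sem);
 */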
1645 | |
1646 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
1647 | |
1648 | void down_read_nested(struct rw_semaphore *sem, int subclass) |
1649 | { |
1650 | might_sleep(); |
1651 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); |
1652 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
1653 | } |
1654 | EXPORT_SYMBOL(down_read_nested); |
1655 | |
1656 | int down_read_killable_nested(struct rw_semaphore *sem, int subclass) |
1657 | { |
1658 | might_sleep(); |
1659 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); |
1660 | |
1661 | if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { |
1662 | rwsem_release(&sem->dep_map, _RET_IP_); |
1663 | return -EINTR; |
1664 | } |
1665 | |
1666 | return 0; |
1667 | } |
1668 | EXPORT_SYMBOL(down_read_killable_nested); |
1669 | |
1670 | void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) |
1671 | { |
1672 | might_sleep(); |
1673 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); |
1674 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
1675 | } |
1676 | EXPORT_SYMBOL(_down_write_nest_lock); |
1677 | |
1678 | void down_read_non_owner(struct rw_semaphore *sem) |
1679 | { |
1680 | might_sleep(); |
1681 | __down_read(sem); |
1682 | /* |
1683 | * The owner value for a reader-owned lock is mostly for debugging |
1684 | * purpose only and is not critical to the correct functioning of |
1685 | * rwsem. So it is perfectly fine to set it in a preempt-enabled |
1686 | * context here. |
1687 | */ |
1688 | __rwsem_set_reader_owned(sem, NULL); |
1689 | } |
1690 | EXPORT_SYMBOL(down_read_non_owner); |
1691 | |
1692 | void down_write_nested(struct rw_semaphore *sem, int subclass) |
1693 | { |
1694 | might_sleep(); |
1695 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
1696 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
1697 | } |
1698 | EXPORT_SYMBOL(down_write_nested); |
1699 | |
1700 | int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) |
1701 | { |
1702 | might_sleep(); |
1703 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
1704 | |
1705 | if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, |
1706 | __down_write_killable)) { |
1707 | rwsem_release(&sem->dep_map, _RET_IP_); |
1708 | return -EINTR; |
1709 | } |
1710 | |
1711 | return 0; |
1712 | } |
1713 | EXPORT_SYMBOL(down_write_killable_nested); |
1714 | |
1715 | void up_read_non_owner(struct rw_semaphore *sem) |
1716 | { |
1717 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); |
1718 | __up_read(sem); |
1719 | } |
1720 | EXPORT_SYMBOL(up_read_non_owner); |
1721 | |
1722 | #endif |
1723 | |