// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/errno.h>
#include <trace/events/lock.h>

int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
			const char *name, struct lock_class_key *key)
{
	sem->read_count = alloc_percpu(int);
	if (unlikely(!sem->read_count))
		return -ENOMEM;

	rcu_sync_init(&sem->rss);
	rcuwait_init(&sem->writer);
	init_waitqueue_head(&sem->waiters);
	atomic_set(&sem->block, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
	lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
	return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
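
/*
 * A minimal sketch of how a dynamically allocated semaphore is typically set
 * up and torn down; the structure and function names (my_ctx, my_ctx_setup,
 * my_ctx_teardown) are invented for illustration and not taken from any
 * particular caller:
 *
 *	struct my_ctx {
 *		struct percpu_rw_semaphore rwsem;
 *	};
 *
 *	static int my_ctx_setup(struct my_ctx *ctx)
 *	{
 *		// Expands to __percpu_init_rwsem() with a static lock_class_key.
 *		return percpu_init_rwsem(&ctx->rwsem);
 *	}
 *
 *	static void my_ctx_teardown(struct my_ctx *ctx)
 *	{
 *		// Safe even if percpu_init_rwsem() failed; see the kludge below.
 *		percpu_free_rwsem(&ctx->rwsem);
 *	}
 */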

void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
	/*
	 * XXX: temporary kludge. The error path in alloc_super()
	 * assumes that percpu_free_rwsem() is safe after kzalloc().
	 */
	if (!sem->read_count)
		return;

	rcu_sync_dtor(&sem->rss);
	free_percpu(sem->read_count);
	sem->read_count = NULL; /* catch use after free bugs */
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);

static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
	this_cpu_inc(*sem->read_count);

	/*
	 * Due to having preemption disabled the decrement happens on
	 * the same CPU as the increment, avoiding the
	 * increment-on-one-CPU-and-decrement-on-another problem.
	 *
	 * If the reader misses the writer's assignment of sem->block, then the
	 * writer is guaranteed to see the reader's increment.
	 *
	 * Conversely, any readers that increment their sem->read_count after
	 * the writer looks are guaranteed to see the sem->block value, which
	 * in turn means that they are guaranteed to immediately decrement
	 * their sem->read_count, so that it doesn't matter that the writer
	 * missed them.
	 */

	smp_mb(); /* A matches D */

	/*
	 * If !sem->block the critical section starts here, matched by the
	 * release in percpu_up_write().
	 */
	if (likely(!atomic_read_acquire(&sem->block)))
		return true;

	this_cpu_dec(*sem->read_count);

	/* Prod writer to re-evaluate readers_active_check() */
	rcuwait_wake_up(&sem->writer);

	return false;
}
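
/*
 * An informal restatement of why the A/D barrier pairing above suffices; this
 * adds no new guarantee beyond the comment in the function. The reader does
 * {inc read_count; smp_mb(); load block} while the writer does
 * {xchg(block, 1), which implies a full barrier; sum read_counts}, so only
 * two outcomes are possible:
 *
 *	reader loads block == 0: the reader's increment is ordered before the
 *	writer's sum, so the writer waits for this reader to finish;
 *
 *	reader loads block == 1: the reader undoes its increment and takes the
 *	slow path, so the writer missing that increment is harmless.
 *
 * Either way, no reader can be inside the critical section while the writer
 * believes the read_count sum is zero.
 */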

static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
{
	if (atomic_read(&sem->block))
		return false;

	return atomic_xchg(&sem->block, 1) == 0;
}

static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
{
	if (reader) {
		bool ret;

		preempt_disable();
		ret = __percpu_down_read_trylock(sem);
		preempt_enable();

		return ret;
	}
	return __percpu_down_write_trylock(sem);
}

/*
 * The return value of wait_queue_entry::func means:
 *
 *  <0 - error, wakeup is terminated and the error is returned
 *   0 - no wakeup, a next waiter is tried
 *  >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
 *
 * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
 * and play games with the return value to allow waking multiple readers.
 *
 * Specifically, we wake readers until we've woken a single writer, or until a
 * trylock fails.
 */
static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
				      unsigned int mode, int wake_flags,
				      void *key)
{
	bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
	struct percpu_rw_semaphore *sem = key;
	struct task_struct *p;

	/* concurrent against percpu_down_write(), can get stolen */
	if (!__percpu_rwsem_trylock(sem, reader))
		return 1;

	p = get_task_struct(wq_entry->private);
	list_del_init(&wq_entry->entry);
	smp_store_release(&wq_entry->private, NULL);

	wake_up_process(p);
	put_task_struct(p);

	return !reader;	/* wake (readers until) 1 writer */
}
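
/*
 * A worked example of the return-value games above (illustrative only): with
 * waiters queued in FIFO order as R1, R2, W, R3, the __wake_up(..., 1, ...)
 * in percpu_up_write() calls this function for R1 (reader trylock succeeds,
 * returns 0, the scan continues), R2 (same), then W (the write trylock sets
 * sem->block and returns 1, which satisfies nr_exclusive == 1 and stops the
 * scan). R1 and R2 hold the read lock, W then waits in percpu_down_write()
 * for them to drain, and R3 stays queued until W's own percpu_up_write().
 */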

static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
{
	DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
	bool wait;

	spin_lock_irq(&sem->waiters.lock);
	/*
	 * Serialize against the wakeup in percpu_up_write(); if we fail
	 * the trylock, the wakeup must see us on the list.
	 */
	wait = !__percpu_rwsem_trylock(sem, reader);
	if (wait) {
		wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
		__add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
	}
	spin_unlock_irq(&sem->waiters.lock);

	while (wait) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!smp_load_acquire(&wq_entry.private))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
	if (__percpu_down_read_trylock(sem))
		return true;

	if (try)
		return false;

	trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
	preempt_enable();
	percpu_rwsem_wait(sem, /* .reader = */ true);
	preempt_disable();
	trace_contention_end(sem, 0);

	return true;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);

#define per_cpu_sum(var)						\
({									\
	typeof(var) __sum = 0;						\
	int cpu;							\
	compiletime_assert_atomic_type(__sum);				\
	for_each_possible_cpu(cpu)					\
		__sum += per_cpu(var, cpu);				\
	__sum;								\
})
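
/*
 * Only the sum across CPUs is meaningful here; individual counters can go
 * negative because a reader may migrate between percpu_down_read() and
 * percpu_up_read(). For example (illustrative numbers only), per-CPU values
 * of { 2, 0, -1, 0 } mean one reader is still active (sum == 1), while
 * { 1, -1, 0, 0 } means the lock is read-unlocked (sum == 0) even though no
 * single counter is zero. readers_active_check() below relies only on this
 * sum.
 */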

bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
{
	return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
}
EXPORT_SYMBOL_GPL(percpu_is_read_locked);

/*
 * Return true if the modular sum of the sem->read_count per-CPU variable is
 * zero. If this sum is zero, then it is stable due to the fact that if any
 * newly arriving readers increment a given counter, they will immediately
 * decrement that same counter.
 *
 * Assumes sem->block is set.
 */
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
	if (per_cpu_sum(*sem->read_count) != 0)
		return false;

	/*
	 * If we observed the decrement, ensure we see the entire critical
	 * section.
	 */

	smp_mb(); /* C matches B */

	return true;
}

void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
	bool contended = false;

	might_sleep();
	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

	/* Notify readers to take the slow path. */
	rcu_sync_enter(&sem->rss);

	/*
	 * Try to set sem->block; this provides writer-writer exclusion.
	 * Having sem->block set makes new readers block.
	 */
	if (!__percpu_down_write_trylock(sem)) {
		trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
		percpu_rwsem_wait(sem, /* .reader = */ false);
		contended = true;
	}

	/* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */

	/*
	 * If they don't see our store of sem->block, then we are guaranteed to
	 * see their sem->read_count increment, and therefore will wait for
	 * them.
	 */

	/* Wait for all active readers to complete. */
	rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
	if (contended)
		trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);

void percpu_up_write(struct percpu_rw_semaphore *sem)
{
	rwsem_release(&sem->dep_map, _RET_IP_);

	/*
	 * Signal the writer is done, no fast path yet.
	 *
	 * One reason that we cannot just immediately flip to readers_fast is
	 * that new readers might fail to see the results of this writer's
	 * critical section.
	 *
	 * Therefore we force it through the slow path which guarantees an
	 * acquire and thereby guarantees the critical section's consistency.
	 */
	atomic_set_release(&sem->block, 0);

	/*
	 * Prod any pending reader/writer to make progress.
	 */
	__wake_up(&sem->waiters, TASK_NORMAL, 1, sem);

	/*
	 * Once this completes (at least one RCU-sched grace period hence) the
	 * reader fast path will be available again. Safe to use outside the
	 * exclusive write lock because it's counting.
	 */
	rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);
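
/*
 * A minimal usage sketch tying the read and write sides together; the names
 * my_sem, my_read_path and my_write_path are invented for illustration:
 *
 *	static DEFINE_STATIC_PERCPU_RWSEM(my_sem);
 *
 *	static void my_read_path(void)
 *	{
 *		percpu_down_read(&my_sem);	// cheap per-CPU increment when
 *						// no writer is around
 *		// ... read-side critical section ...
 *		percpu_up_read(&my_sem);
 *	}
 *
 *	static void my_write_path(void)
 *	{
 *		percpu_down_write(&my_sem);	// may sleep; waits for readers
 *		// ... write-side critical section ...
 *		percpu_up_write(&my_sem);
 *	}
 */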