// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
#else
		struct {
			u16 tail;
			u16 locked_pending;
		};
		struct {
			u8 reserved[2];
			u8 pending;
			u8 locked;
		};
#endif
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				 << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
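
/*
 * For reference, the masks and values above expand to the following
 * constants (derived purely from the definitions above, no extra state):
 *
 *   _Q_LOCKED_MASK   == 0x000000ff
 *   _Q_PENDING_MASK  == 0x0000ff00
 *   _Q_TAIL_IDX_MASK == 0x00030000
 *   _Q_TAIL_CPU_MASK == 0xfffc0000
 *   _Q_LOCKED_VAL    == 0x00000001
 *   _Q_PENDING_VAL   == 0x00000100
 */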

struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}
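
/*
 * Worked example (illustrative values only): for cpu == 2 and idx == 1,
 * encode_tail() returns (2 + 1) << 18 | 1 << 16 == 0xc0000 | 0x10000 ==
 * 0xd0000, which decode_tail(0xd0000) below maps back to &qnodes[2][1].mcs.
 */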

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true on some
	 * architectures, even though the chance of needing more than 4 nodes
	 * is still extremely unlikely. When that happens, we simply return an
	 * error; the original qspinlock has a trylock fallback in this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to lack of equivalent instruction in BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 * above wait condition, therefore any concurrent setting of
	 * PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * release the node
	 *
	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
	 * decrements the count it incremented before returning, thus we're
	 * fine. For contexts interrupting us, they either observe our dec or
	 * not. Just ensure the compiler doesn't reorder this statement, as
	 * this_cpu_dec would have implied that.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, the returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * rather than compared against a negative value, so that the
 * program passes the BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *	    or
 *	if (arena_spin_lock(...) == 0) // success
 *	    or
 *	if (arena_spin_lock(...)) // failure
 *	    or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > _Q_MAX_CPUS)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}
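
/*
 * Illustrative usage sketch (not part of this header): the lock and the
 * data it protects are expected to live in arena memory. The map layout,
 * section name and "counter" below are assumptions made up for this
 * example; only the arena_spin_lock()/arena_spin_unlock() calls come from
 * this file.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARENA);
 *		__uint(map_flags, BPF_F_MMAPABLE);
 *		__uint(max_entries, 1);	// number of arena pages
 *	} arena SEC(".maps");
 *
 *	arena_spinlock_t __arena lock;
 *	int __arena counter;
 *
 *	SEC("tc")
 *	int increment(void *ctx)
 *	{
 *		if (arena_spin_lock(&lock))
 *			return 0;	// lock not acquired, e.g. -EOPNOTSUPP
 *		counter++;		// critical section on arena data
 *		arena_spin_unlock(&lock);
 *		return 0;
 *	}
 */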

#define arena_spin_lock_irqsave(lock, flags)		\
	({						\
		int __ret;				\
		bpf_local_irq_save(&(flags));		\
		__ret = arena_spin_lock((lock));	\
		if (__ret)				\
			bpf_local_irq_restore(&(flags));\
		(__ret);				\
	})

#define arena_spin_unlock_irqrestore(lock, flags)	\
	({						\
		arena_spin_unlock((lock));		\
		bpf_local_irq_restore(&(flags));	\
	})
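
/*
 * Illustrative only: the irqsave/irqrestore pair is used the same way, with
 * a caller-provided "flags" cookie (the name is arbitrary) kept around for
 * the matching restore:
 *
 *	unsigned long flags;
 *
 *	if (arena_spin_lock_irqsave(&lock, flags))
 *		return 0;
 *	... critical section ...
 *	arena_spin_unlock_irqrestore(&lock, flags);
 */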

#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */

#endif /* BPF_ARENA_SPIN_LOCK_H */