// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
	union {
		atomic_t val;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
		struct {
			u8 locked;
			u8 pending;
		};
		struct {
			u16 locked_pending;
			u16 tail;
		};
#else
		struct {
			u16 tail;
			u16 locked_pending;
		};
		struct {
			u8 reserved[2];
			u8 pending;
			u8 locked;
		};
#endif
	};
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

struct arena_mcs_spinlock {
	struct arena_mcs_spinlock __arena *next;
	int locked;
	int count;
};

struct arena_qnode {
	struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES		4
#define _Q_PENDING_LOOPS	1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS		1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
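
/*
 * Worked example (added for illustration): with the layout above, CPU 2
 * queuing at qnode index 1 is encoded by encode_tail() below as
 * tail = (2 + 1) << 18 | 1 << 16 = 0x000d0000. A lock value of 0x000d0001
 * therefore decodes as: locked byte set, pending bit clear, tail idx 1,
 * tail cpu field 3 (i.e. CPU 2 once decode_tail() subtracts the +1).
 */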

struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

static inline u32 encode_tail(int cpu, int idx)
{
	u32 tail;

	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return tail;
}

static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
	u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

	return &qnodes[cpu][idx].mcs;
}

static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
	return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
		/*
		 * We can use relaxed semantics since the caller ensures that
		 * the MCS node is properly initialized before updating the
		 * tail.
		 */
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
	u32 old, new;

	old = atomic_read(&lock->val);
	do {
		new = old | _Q_PENDING_VAL;
		/*
		 * These loops are not expected to stall, but we still need to
		 * prove to the verifier they will terminate eventually.
		 */
		cond_break_label(out);
	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

	return old;
out:
	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
	return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
	int val = atomic_read(&lock->val);

	if (unlikely(val))
		return 0;

	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}
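
/*
 * Note (added for illustration): unlike arena_spin_lock() below, which
 * returns 0 on success, arena_spin_trylock() follows the kernel's trylock
 * convention of returning 1 on success and 0 on failure. A successful
 * trylock must still be paired with arena_spin_unlock(), which also
 * re-enables preemption, so callers typically disable preemption around a
 * bare trylock; see the sketch after arena_spin_unlock() below.
 */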

__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
	int ret = -ETIMEDOUT;
	u32 old, tail;
	int idx;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed_label(&lock->val,
						     (VAL != _Q_PENDING_VAL) || !cnt--,
						     release_err);
	}

	/*
	 * If we observe any contention; queue.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
	 */
	val = arena_fetch_set_pending_acquire(lock);

	/*
	 * If we observe contention, there is a concurrent locker.
	 *
	 * Undo and queue; our setting of PENDING might have made the
	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
	 * on @next to become !NULL.
	 */
	if (unlikely(val & ~_Q_LOCKED_MASK)) {

		/* Undo PENDING if we set it. */
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);

		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> *,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	if (val & _Q_LOCKED_MASK)
		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);
	return 0;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
	idx = node0->count++;
	tail = encode_tail(bpf_get_smp_processor_id(), idx);

	/*
	 * 4 nodes are allocated based on the assumption that there will not be
	 * nested NMIs taking spinlocks. That may not be true in some
	 * architectures even though the chance of needing more than 4 nodes
	 * will still be extremely unlikely. When that happens, we simply return
	 * an error. Original qspinlock has a trylock fallback in this case.
	 */
	if (unlikely(idx >= _Q_MAX_NODES)) {
		ret = -EBUSY;
		goto release_node_err;
	}

	node = grab_mcs_node(node0, idx);

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();

	node->locked = 0;
	node->next = NULL;

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (arena_spin_trylock(lock))
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We cannot prefetch here
		 * due to lack of equivalent instruction in BPF ISA.
		 */
		next = READ_ONCE(node->next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 */
	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
					     release_node_err);

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set, because
	 * of lock stealing; therefore we must also allow:
	 *
	 * n,0,1 -> 0,0,1
	 *
	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
	 *       above wait condition, therefore any concurrent setting of
	 *       PENDING will make the uncontended transition fail.
	 */
	if ((val & _Q_TAIL_MASK) == tail) {
		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
			goto release; /* No contention */
	}

	/*
	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
	 * which will then detect the remaining tail and queue behind us
	 * ensuring we'll see a @next.
	 */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

	arch_mcs_spin_unlock_contended(&next->locked);

release:;
	/*
	 * Release the node.
	 *
	 * Doing a normal decrement instead of this_cpu_dec is fine: an upper
	 * context always decrements the count it incremented before
	 * returning, and contexts interrupting us either observe our
	 * decrement or they don't. We only need to stop the compiler from
	 * reordering this statement, which this_cpu_dec would have implied.
	 */
	barrier();
	node0->count--;
	return 0;
release_node_err:
	barrier();
	node0->count--;
	goto release_err;
release_err:
	return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, the returned value is negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * rather than compared against a negative value, so that the
 * program passes the BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 *	or
 *	if (arena_spin_lock(...) == 0) // success
 *	or
 *	if (arena_spin_lock(...)) // failure
 *	or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
	int val = 0;

	if (CONFIG_NR_CPUS > 1024)
		return -EOPNOTSUPP;

	bpf_preempt_disable();
	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
		return 0;

	val = arena_spin_lock_slowpath(lock, val);
	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
	if (val)
		bpf_preempt_enable();
	return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
	/*
	 * unlock() needs release semantics:
	 */
	smp_store_release(&lock->locked, 0);
	bpf_preempt_enable();
}
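
/*
 * Illustrative usage sketch (not part of the original header): a hypothetical
 * helper that bumps a counter stored in the same BPF arena. It first tries
 * the cheap arena_spin_trylock() path and falls back to the full
 * arena_spin_lock() slowpath. Since arena_spin_unlock() re-enables
 * preemption, the trylock path mirrors arena_spin_lock() and disables
 * preemption before the attempt; that pairing is an assumption of this
 * sketch, not something the header mandates.
 */
static __always_inline int arena_spin_lock_example(arena_spinlock_t __arena *lock,
						   int __arena *counter)
{
	bpf_preempt_disable();
	if (!arena_spin_trylock(lock)) {
		/* Contended: drop the preempt count we took and go the slow way. */
		bpf_preempt_enable();
		if (arena_spin_lock(lock))
			return -1;	/* test against zero, per the comment above */
	}

	(*counter)++;			/* critical section */
	arena_spin_unlock(lock);	/* releases the lock and re-enables preemption */
	return 0;
}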

#define arena_spin_lock_irqsave(lock, flags)			\
	({							\
		int __ret;					\
		bpf_local_irq_save(&(flags));			\
		__ret = arena_spin_lock((lock));		\
		if (__ret)					\
			bpf_local_irq_restore(&(flags));	\
		(__ret);					\
	})

#define arena_spin_unlock_irqrestore(lock, flags)		\
	({							\
		arena_spin_unlock((lock));			\
		bpf_local_irq_restore(&(flags));		\
	})
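
/*
 * Illustrative usage sketch (not part of the original header): the irqsave
 * variants for callers that may also race with IRQ context on the same CPU.
 * `flags` must be an unsigned long lvalue; on lock failure the macro above
 * already restores the saved IRQ state, so the caller only unwinds on
 * success.
 */
static __always_inline int arena_spin_lock_irqsave_example(arena_spinlock_t __arena *lock,
							    int __arena *counter)
{
	unsigned long flags;

	if (arena_spin_lock_irqsave(lock, flags))
		return -1;		/* IRQ state already restored by the macro */

	(*counter)++;			/* critical section, IRQs off */
	arena_spin_unlock_irqrestore(lock, flags);
	return 0;
}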

#endif

#endif /* BPF_ARENA_SPIN_LOCK_H */