// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
#ifndef BPF_ARENA_SPIN_LOCK_H
#define BPF_ARENA_SPIN_LOCK_H

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_atomic.h"

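/*
 * MCS handoff primitives: a contended waiter spins with acquire ordering on
 * its own @locked word until its predecessor hands the lock over with a
 * store-release; the _label variant jumps to @label if the wait has to be
 * aborted.
 */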
#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

#define EBUSY 16
#define EOPNOTSUPP 95
#define ETIMEDOUT 110

#ifndef __arena
#define __arena __attribute__((address_space(1)))
#endif

extern unsigned long CONFIG_NR_CPUS __kconfig;

/*
 * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but
 * PowerPC overrides the definition to define lock->val as u32 instead of
 * atomic_t, leading to compilation errors. Import a local definition below so
 * that we don't depend on the vmlinux.h version.
 */

struct __qspinlock {
        union {
                atomic_t val;
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
                struct {
                        u8 locked;
                        u8 pending;
                };
                struct {
                        u16 locked_pending;
                        u16 tail;
                };
#else
                struct {
                        u16 tail;
                        u16 locked_pending;
                };
                struct {
                        u8 reserved[2];
                        u8 pending;
                        u8 locked;
                };
#endif
        };
};

#define arena_spinlock_t struct __qspinlock
/* FIXME: Using typedef causes CO-RE relocation error */
/* typedef struct qspinlock arena_spinlock_t; */

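/*
 * Per-waiter MCS queue node: @next links to the next waiter in the queue,
 * @locked is set by the predecessor when the lock is handed over, and @count
 * (meaningful only on a CPU's first node) tracks how many of that CPU's
 * qnodes are in use across nesting contexts.
 */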
struct arena_mcs_spinlock {
        struct arena_mcs_spinlock __arena *next;
        int locked;
        int count;
};

struct arena_qnode {
        struct arena_mcs_spinlock mcs;
};

#define _Q_MAX_NODES 4
#define _Q_PENDING_LOOPS 1

/*
 * Bitfields in the atomic value:
 *
 *  0- 7: locked byte
 *     8: pending
 *  9-15: not used
 * 16-17: tail index
 * 18-31: tail cpu (+1)
 */
#define _Q_MAX_CPUS 1024

#define _Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
				      << _Q_ ## type ## _OFFSET)
#define _Q_LOCKED_OFFSET	0
#define _Q_LOCKED_BITS		8
#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)

#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
#define _Q_PENDING_BITS		8
#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)

#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
#define _Q_TAIL_IDX_BITS	2
#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)

#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)

#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)

#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)

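/*
 * Statically allocated MCS queue nodes in the arena: _Q_MAX_NODES nodes per
 * CPU, one per nesting level (task, softirq, hardirq, NMI).
 */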
struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES];

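/**
 * encode_tail - Pack a CPU number and node index into a tail code word
 * @cpu: CPU number that owns the queue node
 * @idx: Per-CPU node index (nesting level), assumed to be < _Q_MAX_NODES
 *
 * The CPU is stored as cpu + 1 so that a tail of 0 means "no tail". For
 * example, cpu=2, idx=1 yields (3 << _Q_TAIL_CPU_OFFSET) | (1 << _Q_TAIL_IDX_OFFSET).
 */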
static inline u32 encode_tail(int cpu, int idx)
{
        u32 tail;

        tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
        tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

        return tail;
}

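/**
 * decode_tail - Map a tail code word back to its arena MCS node
 * @tail: Tail code word previously produced by encode_tail()
 *
 * Inverse of encode_tail(): recovers the CPU number (stored as cpu + 1) and
 * the node index, and returns the matching entry in the qnodes array.
 */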
static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
{
        u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
        u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

        return &qnodes[cpu][idx].mcs;
}

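/**
 * grab_mcs_node - Return the idx-th MCS node of the current CPU
 * @base: Pointer to this CPU's first (idx 0) MCS node
 * @idx: Nesting index within the per-CPU qnodes array
 */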
static inline
struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
{
        return &((struct arena_qnode __arena *)base + idx)->mcs;
}

#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)

/**
 * xchg_tail - Put in the new queue tail code word & retrieve previous one
 * @lock : Pointer to queued spinlock structure
 * @tail : The new queue tail code word
 * Return: The previous queue tail code word
 *
 * xchg(lock, tail)
 *
 * p,*,* -> n,*,* ; prev = xchg(lock, node)
 */
static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
{
        u32 old, new;

        old = atomic_read(&lock->val);
        do {
                new = (old & _Q_LOCKED_PENDING_MASK) | tail;
                /*
                 * We can use relaxed semantics since the caller ensures that
                 * the MCS node is properly initialized before updating the
                 * tail.
                 */
                /* These loops are not expected to stall, but we still need to
                 * prove to the verifier they will terminate eventually.
                 */
                cond_break_label(out);
        } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));

        return old;
out:
        bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
        return old;
}

/**
 * clear_pending - clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,* -> *,0,*
 */
static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->pending, 0);
}

/**
 * clear_pending_set_locked - take ownership and clear the pending bit.
 * @lock: Pointer to queued spinlock structure
 *
 * *,1,0 -> *,0,1
 *
 * Lock stealing is not allowed if this function is used.
 */
static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
}

/**
 * set_locked - Set the lock bit and own the lock
 * @lock: Pointer to queued spinlock structure
 *
 * *,*,0 -> *,0,1
 */
static __always_inline void set_locked(arena_spinlock_t __arena *lock)
{
        WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
}

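/**
 * arena_fetch_set_pending_acquire - Set the pending bit and return the old value
 * @lock: Pointer to queued spinlock structure
 * Return: The previous lock value
 *
 * *,*,* -> *,1,* (acquire ordering on the successful cmpxchg)
 */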
static __always_inline
u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
{
        u32 old, new;

        old = atomic_read(&lock->val);
        do {
                new = old | _Q_PENDING_VAL;
                /*
                 * These loops are not expected to stall, but we still need to
                 * prove to the verifier they will terminate eventually.
                 */
                cond_break_label(out);
        } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));

        return old;
out:
        bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
        return old;
}

/**
 * arena_spin_trylock - try to acquire the queued spinlock
 * @lock : Pointer to queued spinlock structure
 * Return: 1 if lock acquired, 0 if failed
 */
static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
{
        int val = atomic_read(&lock->val);

        if (unlikely(val))
                return 0;

        return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
}

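/**
 * arena_spin_lock_slowpath - acquire the queued spinlock on the slow path
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 * Return: 0 on success, -EBUSY if the per-CPU queue nodes are exhausted, or
 *         -ETIMEDOUT if a wait loop had to be aborted
 */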
__noinline
int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
{
        struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
        int ret = -ETIMEDOUT;
        u32 old, tail;
        int idx;

        /*
         * Wait for in-progress pending->locked hand-overs with a bounded
         * number of spins so that we guarantee forward progress.
         *
         * 0,1,0 -> 0,0,1
         */
        if (val == _Q_PENDING_VAL) {
                int cnt = _Q_PENDING_LOOPS;
                val = atomic_cond_read_relaxed_label(&lock->val,
                                                     (VAL != _Q_PENDING_VAL) || !cnt--,
                                                     release_err);
        }

        /*
         * If we observe any contention; queue.
         */
        if (val & ~_Q_LOCKED_MASK)
                goto queue;

        /*
         * trylock || pending
         *
         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
         */
        val = arena_fetch_set_pending_acquire(lock);

        /*
         * If we observe contention, there is a concurrent locker.
         *
         * Undo and queue; our setting of PENDING might have made the
         * n,0,0 -> 0,0,0 transition fail and it will now be waiting
         * on @next to become !NULL.
         */
        if (unlikely(val & ~_Q_LOCKED_MASK)) {

                /* Undo PENDING if we set it. */
                if (!(val & _Q_PENDING_MASK))
                        clear_pending(lock);

                goto queue;
        }

        /*
         * We're pending, wait for the owner to go away.
         *
         * 0,1,1 -> *,1,0
         *
         * this wait loop must be a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because not all
         * clear_pending_set_locked() implementations imply full
         * barriers.
         */
        if (val & _Q_LOCKED_MASK)
                smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);

        /*
         * take ownership and clear the pending bit.
         *
         * 0,1,0 -> 0,0,1
         */
        clear_pending_set_locked(lock);
        return 0;

        /*
         * End of pending bit optimistic spinning and beginning of MCS
         * queuing.
         */
queue:
        node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs;
        idx = node0->count++;
        tail = encode_tail(bpf_get_smp_processor_id(), idx);

        /*
         * 4 nodes are allocated based on the assumption that there will not be
         * nested NMIs taking spinlocks. That may not be true in some
         * architectures even though the chance of needing more than 4 nodes
         * will still be extremely unlikely. When that happens, we simply return
         * an error. Original qspinlock has a trylock fallback in this case.
         */
        if (unlikely(idx >= _Q_MAX_NODES)) {
                ret = -EBUSY;
                goto release_node_err;
        }

        node = grab_mcs_node(node0, idx);

        /*
         * Ensure that we increment the head node->count before initialising
         * the actual node. If the compiler is kind enough to reorder these
         * stores, then an IRQ could overwrite our assignments.
         */
        barrier();

        node->locked = 0;
        node->next = NULL;

        /*
         * We touched a (possibly) cold cacheline in the per-cpu queue node;
         * attempt the trylock once more in the hope someone let go while we
         * weren't watching.
         */
        if (arena_spin_trylock(lock))
                goto release;

        /*
         * Ensure that the initialisation of @node is complete before we
         * publish the updated tail via xchg_tail() and potentially link
         * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
         */
        smp_wmb();

        /*
         * Publish the updated tail.
         * We have already touched the queueing cacheline; don't bother with
         * pending stuff.
         *
         * p,*,* -> n,*,*
         */
        old = xchg_tail(lock, tail);
        next = NULL;

        /*
         * if there was a previous node; link it and wait until reaching the
         * head of the waitqueue.
         */
        if (old & _Q_TAIL_MASK) {
                prev = decode_tail(old);

                /* Link @node into the waitqueue. */
                WRITE_ONCE(prev->next, node);

                arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);

                /*
                 * While waiting for the MCS lock, the next pointer may have
                 * been set by another lock waiter. We cannot prefetch here
                 * due to lack of equivalent instruction in BPF ISA.
                 */
                next = READ_ONCE(node->next);
        }

        /*
         * we're at the head of the waitqueue, wait for the owner & pending to
         * go away.
         *
         * *,x,y -> *,0,0
         *
         * this wait loop must use a load-acquire such that we match the
         * store-release that clears the locked bit and create lock
         * sequentiality; this is because the set_locked() function below
         * does not imply a full barrier.
         */
        val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
                                             release_node_err);

        /*
         * claim the lock:
         *
         * n,0,0 -> 0,0,1 : lock, uncontended
         * *,*,0 -> *,*,1 : lock, contended
         *
         * If the queue head is the only one in the queue (lock value == tail)
         * and nobody is pending, clear the tail code and grab the lock.
         * Otherwise, we only need to grab the lock.
         */

        /*
         * In the PV case we might already have _Q_LOCKED_VAL set, because
         * of lock stealing; therefore we must also allow:
         *
         * n,0,1 -> 0,0,1
         *
         * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
         * above wait condition, therefore any concurrent setting of
         * PENDING will make the uncontended transition fail.
         */
        if ((val & _Q_TAIL_MASK) == tail) {
                if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
                        goto release; /* No contention */
        }

        /*
         * Either somebody is queued behind us or _Q_PENDING_VAL got set
         * which will then detect the remaining tail and queue behind us
         * ensuring we'll see a @next.
         */
        set_locked(lock);

        /*
         * contended path; wait for next if not observed yet, release.
         */
        if (!next)
                next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);

        arch_mcs_spin_unlock_contended(&next->locked);

release:;
        /*
         * release the node
         *
         * Doing a normal dec vs this_cpu_dec is fine. An upper context always
         * decrements the count it incremented before returning, so we're fine.
         * Contexts that interrupt us either observe our decrement or they don't.
         * Just make sure the compiler doesn't reorder this statement, as
         * this_cpu_dec would have implied that ordering.
         */
        barrier();
        node0->count--;
        return 0;
release_node_err:
        barrier();
        node0->count--;
        goto release_err;
release_err:
        return ret;
}

/**
 * arena_spin_lock - acquire a queued spinlock
 * @lock: Pointer to queued spinlock structure
 *
 * On error, the returned value will be negative.
 * On success, zero is returned.
 *
 * The return value _must_ be tested against zero for success,
 * rather than checked for a negative value, in order to satisfy
 * the BPF verifier.
 *
 * The user should do:
 *	if (arena_spin_lock(...) != 0) // failure
 * or
 *	if (arena_spin_lock(...) == 0) // success
 * or
 *	if (arena_spin_lock(...)) // failure
 * or
 *	if (!arena_spin_lock(...)) // success
 * instead of:
 *	if (arena_spin_lock(...) < 0) // failure
 *
 * The return value can still be inspected later.
 */
static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
{
        int val = 0;

        if (CONFIG_NR_CPUS > 1024)
                return -EOPNOTSUPP;

        bpf_preempt_disable();
        if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
                return 0;

        val = arena_spin_lock_slowpath(lock, val);
        /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
        if (val)
                bpf_preempt_enable();
        return val;
}

/**
 * arena_spin_unlock - release a queued spinlock
 * @lock : Pointer to queued spinlock structure
 */
static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
{
        /*
         * unlock() needs release semantics:
         */
        smp_store_release(&lock->locked, 0);
        bpf_preempt_enable();
}

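/*
 * IRQ-safe variants: arena_spin_lock_irqsave() disables local IRQs before
 * taking the lock and evaluates to the lock's return code; on failure the
 * saved IRQ state is restored before the error is returned.
 * arena_spin_unlock_irqrestore() releases the lock and then restores the
 * saved IRQ state.
 */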
#define arena_spin_lock_irqsave(lock, flags) \
        ({ \
                int __ret; \
                bpf_local_irq_save(&(flags)); \
                __ret = arena_spin_lock((lock)); \
                if (__ret) \
                        bpf_local_irq_restore(&(flags)); \
                (__ret); \
        })

#define arena_spin_unlock_irqrestore(lock, flags) \
        ({ \
                arena_spin_unlock((lock)); \
                bpf_local_irq_restore(&(flags)); \
        })
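
/*
 * Example usage (an illustrative sketch, not compiled as part of this
 * header): a BPF program serializing updates to a shared arena counter.
 * The arena map name, the placement of the lock and counter, and the
 * program section are assumptions made for the example; adapt them to the
 * actual program. Note that this header's contents are only available when
 * ENABLE_ATOMICS_TESTS and __BPF_FEATURE_ADDR_SPACE_CAST are defined.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_ARENA);
 *		__uint(map_flags, BPF_F_MMAPABLE);
 *		__uint(max_entries, 1);
 *	} arena SEC(".maps");
 *
 *	arena_spinlock_t __arena lock;
 *	int __arena counter;
 *
 *	SEC("tc")
 *	int update_counter(void *ctx)
 *	{
 *		unsigned long flags;
 *
 *		if (arena_spin_lock_irqsave(&lock, flags))
 *			return 0;	// lock not acquired (e.g. -EBUSY)
 *		counter++;
 *		arena_spin_unlock_irqrestore(&lock, flags);
 *		return 0;
 *	}
 */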

#endif /* ENABLE_ATOMICS_TESTS && __BPF_FEATURE_ADDR_SPACE_CAST */

#endif /* BPF_ARENA_SPIN_LOCK_H */