#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <asm/rqspinlock.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)

#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

struct bpf_ringbuf {
	wait_queue_head_t waitq;
	struct irq_work work;
	u64 mask;
	struct page **pages;
	int nr_pages;
	rqspinlock_t spinlock ____cacheline_aligned_in_smp;
	/* For user-space producer ring buffers, an atomic_t busy bit is used
	 * to synchronize access to the ring buffers in the kernel, rather than
	 * the spinlock that is used for kernel-producer ring buffers. This is
	 * done because the ring buffer must hold a lock across a BPF program's
	 * callback:
	 *
	 * __bpf_user_ringbuf_peek() // lock acquired
	 * -> program callback_fn()
	 * -> __bpf_user_ringbuf_sample_release() // lock released
	 *
	 * It is unsafe and incorrect to hold an IRQ spinlock across what could
	 * be a long execution window, so we instead simply disallow concurrent
	 * access to the ring buffer by kernel consumers, and return -EBUSY from
	 * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
	 */
	atomic_t busy ____cacheline_aligned_in_smp;
	/* Consumer and producer counters are put into separate pages to
	 * allow each position to be mapped with different permissions.
	 * This prevents a user-space application from modifying the
	 * position and ruining in-kernel tracking. The permissions of the
	 * pages depend on who is producing samples: user-space or the
	 * kernel. Note that the pending counter is placed in the same
	 * page as the producer, so that it shares the same cache line.
	 *
	 * Kernel-producer
	 * ---------------
	 * The producer position and data pages are mapped as r/o in
	 * userspace. For this approach, bits in the header of samples are
	 * used to signal to user-space, and to other producers, whether a
	 * sample is currently being written.
	 *
	 * User-space producer
	 * -------------------
	 * Only the page containing the consumer position is mapped r/o in
	 * user-space. User-space producers also use bits of the header to
	 * communicate to the kernel, but the kernel must carefully check and
	 * validate each sample to ensure that it is correctly formatted and
	 * fully contained within the ring buffer.
	 */
	unsigned long consumer_pos __aligned(PAGE_SIZE);
	unsigned long producer_pos __aligned(PAGE_SIZE);
	unsigned long pending_pos;
	char data[] __aligned(PAGE_SIZE);
};

struct bpf_ringbuf_map {
	struct bpf_map map;
	struct bpf_ringbuf *rb;
};

/* 8-byte ring buffer record header structure */
struct bpf_ringbuf_hdr {
	u32 len;
	u32 pg_off;
};

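/* The busy and discard flags that producers set in the len field above are
 * part of the UAPI (BPF_RINGBUF_BUSY_BIT, BPF_RINGBUF_DISCARD_BIT and
 * BPF_RINGBUF_HDR_SZ in <uapi/linux/bpf.h>). As a minimal illustrative
 * sketch (the helper names below are made up and not used elsewhere in this
 * file), a consumer decodes a header word like this:
 */
static inline bool ringbuf_hdr_busy(u32 len)
{
	return len & BPF_RINGBUF_BUSY_BIT;	/* record is still being written */
}

static inline bool ringbuf_hdr_discarded(u32 len)
{
	return len & BPF_RINGBUF_DISCARD_BIT;	/* record was committed, then dropped */
}

static inline u32 ringbuf_hdr_sample_len(u32 len)
{
	/* the low 30 bits hold the sample length, excluding the 8-byte header */
	return len & ~(BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
}
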
static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
	const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
			    __GFP_NOWARN | __GFP_ZERO;
	int nr_meta_pages = RINGBUF_NR_META_PAGES;
	int nr_data_pages = data_sz >> PAGE_SHIFT;
	int nr_pages = nr_meta_pages + nr_data_pages;
	struct page **pages, *page;
	struct bpf_ringbuf *rb;
	size_t array_size;
	int i;

	/* Each data page is mapped twice to allow "virtual"
	 * continuous read of samples wrapping around the end of ring
	 * buffer area:
	 * ------------------------------------------------------
	 * | meta pages |  real data pages  |  same data pages  |
	 * ------------------------------------------------------
	 * |            | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
	 * ------------------------------------------------------
	 * |            | TA             DA | TA             DA |
	 * ------------------------------------------------------
	 *                               ^^^^^^^
	 *                                  |
	 * Here, no need to worry about special handling of wrapped-around
	 * data due to double-mapped data pages. This works both in kernel and
	 * when mmap()'ed in user-space, simplifying both kernel and
	 * user-space implementations significantly.
	 */
	array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
	pages = bpf_map_area_alloc(array_size, numa_node);
	if (!pages)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(numa_node, flags, 0);
		if (!page) {
			nr_pages = i;
			goto err_free_pages;
		}
		pages[i] = page;
		if (i >= nr_meta_pages)
			pages[nr_data_pages + i] = page;
	}

	rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
		  VM_MAP | VM_USERMAP, PAGE_KERNEL);
	if (rb) {
		kmemleak_not_leak(pages);
		rb->pages = pages;
		rb->nr_pages = nr_pages;
		return rb;
	}

err_free_pages:
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
	return NULL;
}
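/* Illustrative user-space sketch (not part of this file): because the data
 * area is mapped twice back to back, a record that wraps past the end of the
 * buffer is still virtually contiguous, so a consumer never has to stitch two
 * halves together. This is roughly what libbpf's ring buffer consumer does;
 * consumer_pos, producer_pos, data and mask are assumed to come from
 * mmap()'ing the map (see the mmap sketch further below), and cb() is a
 * hypothetical per-sample callback.
 */
#if 0	/* user-space example, not compiled here */
#include <linux/bpf.h>		/* BPF_RINGBUF_*_BIT, BPF_RINGBUF_HDR_SZ */
#include <stdint.h>

static long consume_ringbuf(unsigned long *consumer_pos, unsigned long *producer_pos,
			    void *data, unsigned long mask,
			    void (*cb)(void *sample, uint32_t len))
{
	unsigned long cons = *consumer_pos;
	long n = 0;

	while (cons < __atomic_load_n(producer_pos, __ATOMIC_ACQUIRE)) {
		uint32_t *hdr = (uint32_t *)((char *)data + (cons & mask));
		uint32_t len = __atomic_load_n(hdr, __ATOMIC_ACQUIRE);

		if (len & BPF_RINGBUF_BUSY_BIT)
			break;			/* record is still being written */

		/* full record = 8-byte header + sample, rounded up to 8 bytes */
		cons += ((len & ~BPF_RINGBUF_DISCARD_BIT) + BPF_RINGBUF_HDR_SZ + 7) & ~7UL;

		if (!(len & BPF_RINGBUF_DISCARD_BIT)) {
			/* thanks to the double mapping, a wrapping sample is
			 * still contiguous in virtual memory
			 */
			cb((char *)hdr + BPF_RINGBUF_HDR_SZ, len);
			n++;
		}
		/* pairs with the kernel's smp_load_acquire(&rb->consumer_pos) */
		__atomic_store_n(consumer_pos, cons, __ATOMIC_RELEASE);
	}
	return n;
}
#endif
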

static void bpf_ringbuf_notify(struct irq_work *work)
{
	struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);

	wake_up_all(&rb->waitq);
}

/* Maximum size of ring buffer area is limited by 32-bit page offset within
 * record header, counted in pages. Reserving 8 bits for extensibility and
 * taking into account a few extra pages for the consumer/producer pages and
 * non-mmap()'able parts, the current maximum size would be:
 *
 *     (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
 *
 * This gives a 64GB limit, which seems plenty for a single ring buffer. Now
 * considering that the maximum value of data_sz is (4GB - 1), there
 * will be no overflow, so just note the size limit in the comments.
 */
static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
	struct bpf_ringbuf *rb;

	rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
	if (!rb)
		return NULL;

	raw_res_spin_lock_init(&rb->spinlock);
	atomic_set(&rb->busy, 0);
	init_waitqueue_head(&rb->waitq);
	init_irq_work(&rb->work, bpf_ringbuf_notify);

	rb->mask = data_sz - 1;
	rb->consumer_pos = 0;
	rb->producer_pos = 0;
	rb->pending_pos = 0;

	return rb;
}

static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
	struct bpf_ringbuf_map *rb_map;

	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->key_size || attr->value_size ||
	    !is_power_of_2(attr->max_entries) ||
	    !PAGE_ALIGNED(attr->max_entries))
		return ERR_PTR(-EINVAL);

	rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
	if (!rb_map)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&rb_map->map, attr);

	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
	if (!rb_map->rb) {
		bpf_map_area_free(rb_map);
		return ERR_PTR(-ENOMEM);
	}

	return &rb_map->map;
}

static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
	/* copy pages pointer and nr_pages to local variables, as we are going
	 * to unmap rb itself with vunmap() below
	 */
	struct page **pages = rb->pages;
	int i, nr_pages = rb->nr_pages;

	vunmap(rb);
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
}

static void ringbuf_map_free(struct bpf_map *map)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	bpf_ringbuf_free(rb_map->rb);
	bpf_map_area_free(rb_map);
}

static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-ENOTSUPP);
}

static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 flags)
{
	return -ENOTSUPP;
}

static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
				    void *next_key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		/* allow writable mapping for the consumer_pos only */
		if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EPERM;
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb,
				   vma->vm_pgoff + RINGBUF_PGOFF);
}

static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		if (vma->vm_pgoff == 0)
			/* Disallow writable mappings to the consumer pointer,
			 * and allow writable mappings to both the producer
			 * position, and the ring buffer data itself.
			 */
			return -EPERM;
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}

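/* Illustrative user-space sketch (not part of this file): mapping a
 * kernel-producer ring buffer roughly the way libbpf does. Offset 0 is the
 * consumer page, the only page that ringbuf_map_mmap_kern() above allows
 * user space to map writable; offset PAGE_SIZE covers the read-only producer
 * page followed by the double-mapped data area. map_fd and data_sz (equal to
 * the map's max_entries) are assumed to be provided by the caller.
 */
#if 0	/* user-space example, not compiled here */
#include <sys/mman.h>
#include <unistd.h>

static int map_kernel_ringbuf(int map_fd, size_t data_sz,
			      unsigned long **consumer_pos,
			      unsigned long **producer_pos, void **data)
{
	size_t page_size = sysconf(_SC_PAGESIZE);
	void *tmp;

	/* consumer page: writable, so user space can advance consumer_pos */
	tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
	if (tmp == MAP_FAILED)
		return -1;
	*consumer_pos = tmp;

	/* producer page plus doubled data area: read-only for user space */
	tmp = mmap(NULL, page_size + 2 * data_sz, PROT_READ, MAP_SHARED,
		   map_fd, page_size);
	if (tmp == MAP_FAILED)
		return -1;
	*producer_pos = tmp;
	*data = (char *)tmp + page_size;
	return 0;
}
#endif
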
static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
	unsigned long cons_pos, prod_pos;

	cons_pos = smp_load_acquire(&rb->consumer_pos);
	prod_pos = smp_load_acquire(&rb->producer_pos);
	return prod_pos - cons_pos;
}

static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
{
	return rb->mask + 1;
}

static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb))
		return EPOLLIN | EPOLLRDNORM;
	return 0;
}

static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
		return EPOLLOUT | EPOLLWRNORM;
	return 0;
}

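/* Illustrative user-space sketch (not part of this file): the two poll
 * callbacks above give the map fd the usual epoll semantics. For the
 * kernel-producer flavour, EPOLLIN is reported once committed data is
 * available; for the user-producer flavour, EPOLLOUT is reported while there
 * is free space to write into.
 */
#if 0	/* user-space example, not compiled here */
#include <sys/epoll.h>

static int wait_for_ringbuf_data(int map_fd)
{
	struct epoll_event ev = { .events = EPOLLIN }, out;
	int epfd = epoll_create1(0);

	if (epfd < 0 || epoll_ctl(epfd, EPOLL_CTL_ADD, map_fd, &ev) < 0)
		return -1;

	/* blocks until the kernel queues a wakeup via the irq_work above */
	if (epoll_wait(epfd, &out, 1, -1) != 1)
		return -1;

	/* data is ready: drain it with a consumer loop such as the
	 * consume_ringbuf() sketch shown earlier
	 */
	return 0;
}
#endif
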
static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_ringbuf *rb;
	int nr_data_pages;
	int nr_meta_pages;
	u64 usage = sizeof(struct bpf_ringbuf_map);

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
	usage += (u64)rb->nr_pages << PAGE_SHIFT;
	nr_meta_pages = RINGBUF_NR_META_PAGES;
	nr_data_pages = map->max_entries >> PAGE_SHIFT;
	usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
	return usage;
}

BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_kern,
	.map_poll = ringbuf_map_poll_kern,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_mem_usage = ringbuf_map_mem_usage,
	.map_btf_id = &ringbuf_map_btf_ids[0],
};

BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops user_ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_user,
	.map_poll = ringbuf_map_poll_user,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_mem_usage = ringbuf_map_mem_usage,
	.map_btf_id = &user_ringbuf_map_btf_ids[0],
};

/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
 * calculate offset from record metadata to ring buffer in pages, rounded
 * down. This page offset is stored as part of record metadata and makes it
 * possible to restore struct bpf_ringbuf * from a record pointer. This page
 * offset is stored at offset 4 of the record metadata header.
 */
static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
				     struct bpf_ringbuf_hdr *hdr)
{
	return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
}

/* Given pointer to ring buffer record header, restore pointer to struct
 * bpf_ringbuf itself by using page offset stored at offset 4
 */
static struct bpf_ringbuf *
bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
{
	unsigned long addr = (unsigned long)(void *)hdr;
	unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;

	return (void *)((addr & PAGE_MASK) - off);
}

static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
	unsigned long cons_pos, prod_pos, new_prod_pos, pend_pos, flags;
	struct bpf_ringbuf_hdr *hdr;
	u32 len, pg_off, tmp_size, hdr_len;

	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
		return NULL;

	len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
	if (len > ringbuf_total_data_sz(rb))
		return NULL;

	cons_pos = smp_load_acquire(&rb->consumer_pos);

	if (raw_res_spin_lock_irqsave(&rb->spinlock, flags))
		return NULL;

	pend_pos = rb->pending_pos;
	prod_pos = rb->producer_pos;
	new_prod_pos = prod_pos + len;

	while (pend_pos < prod_pos) {
		hdr = (void *)rb->data + (pend_pos & rb->mask);
		hdr_len = READ_ONCE(hdr->len);
		if (hdr_len & BPF_RINGBUF_BUSY_BIT)
			break;
		tmp_size = hdr_len & ~BPF_RINGBUF_DISCARD_BIT;
		tmp_size = round_up(tmp_size + BPF_RINGBUF_HDR_SZ, 8);
		pend_pos += tmp_size;
	}
	rb->pending_pos = pend_pos;

	/* check for out of ringbuf space:
	 * - by ensuring producer position doesn't advance more than
	 *   (ringbuf_size - 1) ahead of the consumer position
	 * - by ensuring the span from the oldest not-yet-committed record
	 *   to the newest record does not exceed (ringbuf_size - 1)
	 */
	if (new_prod_pos - cons_pos > rb->mask ||
	    new_prod_pos - pend_pos > rb->mask) {
		raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);
		return NULL;
	}

	hdr = (void *)rb->data + (prod_pos & rb->mask);
	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
	hdr->pg_off = pg_off;

	/* pairs with consumer's smp_load_acquire() */
	smp_store_release(&rb->producer_pos, new_prod_pos);

	raw_res_spin_unlock_irqrestore(&rb->spinlock, flags);

	return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}

BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
{
	struct bpf_ringbuf_map *rb_map;

	if (unlikely(flags))
		return 0;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
}

const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
	.func = bpf_ringbuf_reserve,
	.ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
	.arg3_type = ARG_ANYTHING,
};

static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
{
	unsigned long rec_pos, cons_pos;
	struct bpf_ringbuf_hdr *hdr;
	struct bpf_ringbuf *rb;
	u32 new_len;

	hdr = sample - BPF_RINGBUF_HDR_SZ;
	rb = bpf_ringbuf_restore_from_rec(hdr);
	new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
	if (discard)
		new_len |= BPF_RINGBUF_DISCARD_BIT;

	/* update record header with correct final size prefix */
	xchg(&hdr->len, new_len);

	/* if consumer caught up and is waiting for our record, notify about
	 * new data availability
	 */
	rec_pos = (void *)hdr - (void *)rb->data;
	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
		irq_work_queue(&rb->work);
}

BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_proto = {
	.func = bpf_ringbuf_submit,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, true /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_proto = {
	.func = bpf_ringbuf_discard,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

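/* Illustrative BPF-program sketch (not part of this file): the
 * reserve/submit/discard pattern implemented by the helpers above, written
 * the way a libbpf-built program would use it. The map name, event layout
 * and attach point are made up for illustration.
 */
#if 0	/* BPF program example, not compiled here */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct event {
	__u32 pid;
	__u64 ts;
};

struct {
	__uint(type, BPF_MAP_TYPE_RINGBUF);
	/* must be a power of two and a multiple of PAGE_SIZE,
	 * see ringbuf_map_alloc() above
	 */
	__uint(max_entries, 256 * 1024);
} events SEC(".maps");

SEC("tracepoint/sched/sched_process_exec")
int handle_exec(void *ctx)
{
	struct event *e;

	/* reserve space up front and fill it in place, no extra copy */
	e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
	if (!e)
		return 0;	/* buffer is full, reservation failed */

	e->pid = bpf_get_current_pid_tgid() >> 32;
	e->ts = bpf_ktime_get_ns();

	if (e->pid == 1)
		bpf_ringbuf_discard(e, 0);	/* drop the record without publishing it */
	else
		bpf_ringbuf_submit(e, 0);	/* clear the busy bit, maybe wake consumers */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
#endif
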
BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
	   u64, flags)
{
	struct bpf_ringbuf_map *rb_map;
	void *rec;

	if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
		return -EINVAL;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	rec = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!rec)
		return -EAGAIN;

	memcpy(rec, data, size);
	bpf_ringbuf_commit(rec, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_output_proto = {
	.func = bpf_ringbuf_output,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
	.arg4_type = ARG_ANYTHING,
};

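/* Illustrative BPF-program snippet (not part of this file): the copy-based
 * alternative to reserve/submit. bpf_ringbuf_output() assembles the record
 * in program memory first and copies it into the buffer in one call, at the
 * cost of that extra memcpy. 'events' and 'struct event' are the
 * hypothetical definitions from the previous sketch.
 */
#if 0	/* BPF program example, not compiled here */
SEC("tracepoint/sched/sched_process_exit")
int handle_exit(void *ctx)
{
	struct event e = {
		.pid = bpf_get_current_pid_tgid() >> 32,
		.ts = bpf_ktime_get_ns(),
	};

	/* returns 0 on success and -EAGAIN when there is no room right now */
	bpf_ringbuf_output(&events, &e, sizeof(e), 0);
	return 0;
}
#endif
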
BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
{
	struct bpf_ringbuf *rb;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	switch (flags) {
	case BPF_RB_AVAIL_DATA:
		return ringbuf_avail_data_sz(rb);
	case BPF_RB_RING_SIZE:
		return ringbuf_total_data_sz(rb);
	case BPF_RB_CONS_POS:
		return smp_load_acquire(&rb->consumer_pos);
	case BPF_RB_PROD_POS:
		return smp_load_acquire(&rb->producer_pos);
	default:
		return 0;
	}
}

const struct bpf_func_proto bpf_ringbuf_query_proto = {
	.func = bpf_ringbuf_query,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
};

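/* Illustrative BPF-program snippet (not part of this file): using
 * bpf_ringbuf_query() as a momentary-snapshot heuristic, e.g. to shed
 * low-priority events once the buffer is more than half full. The helper
 * name below is made up; the ring buffer map is the hypothetical one from
 * the earlier sketch.
 */
#if 0	/* BPF program example, not compiled here */
static __always_inline bool ringbuf_half_full(void *ringbuf)
{
	__u64 avail = bpf_ringbuf_query(ringbuf, BPF_RB_AVAIL_DATA);
	__u64 size = bpf_ringbuf_query(ringbuf, BPF_RB_RING_SIZE);

	/* both values are snapshots and may already be stale, so treat the
	 * result as a hint, not a guarantee
	 */
	return avail * 2 > size;
}
#endif
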
BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
	   struct bpf_dynptr_kern *, ptr)
{
	struct bpf_ringbuf_map *rb_map;
	void *sample;
	int err;

	if (unlikely(flags)) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	err = bpf_dynptr_check_size(size);
	if (err) {
		bpf_dynptr_set_null(ptr);
		return err;
	}

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	sample = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!sample) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
	.func = bpf_ringbuf_reserve_dynptr,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE,
};

BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, false /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
	.func = bpf_ringbuf_submit_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, true /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
	.func = bpf_ringbuf_discard_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

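/* Illustrative BPF-program sketch (not part of this file): the same
 * reserve/submit flow expressed through the dynptr helpers above. The map
 * and record layout are the hypothetical ones from the earlier sketch. Note
 * that, per the UAPI helper documentation, the dynptr must be released with
 * submit or discard even when the reservation itself fails.
 */
#if 0	/* BPF program example, not compiled here */
SEC("tracepoint/sched/sched_process_fork")
int handle_fork(void *ctx)
{
	struct bpf_dynptr ptr;
	struct event *e;

	if (bpf_ringbuf_reserve_dynptr(&events, sizeof(*e), 0, &ptr)) {
		bpf_ringbuf_discard_dynptr(&ptr, 0);	/* release even on failure */
		return 0;
	}

	e = bpf_dynptr_data(&ptr, 0, sizeof(*e));
	if (e) {
		e->pid = bpf_get_current_pid_tgid() >> 32;
		e->ts = bpf_ktime_get_ns();
	}

	bpf_ringbuf_submit_dynptr(&ptr, 0);
	return 0;
}
#endif
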
static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
{
	int err;
	u32 hdr_len, sample_len, total_len, flags, *hdr;
	u64 cons_pos, prod_pos;

	/* Synchronizes with smp_store_release() in user-space producer. */
	prod_pos = smp_load_acquire(&rb->producer_pos);
	if (prod_pos % 8)
		return -EINVAL;

	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
	cons_pos = smp_load_acquire(&rb->consumer_pos);
	if (cons_pos >= prod_pos)
		return -ENODATA;

	hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
	/* Synchronizes with smp_store_release() in user-space producer. */
	hdr_len = smp_load_acquire(hdr);
	flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
	sample_len = hdr_len & ~flags;
	total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);

	/* The sample must fit within the region advertised by the producer position. */
	if (total_len > prod_pos - cons_pos)
		return -EINVAL;

	/* The sample must fit within the data region of the ring buffer. */
	if (total_len > ringbuf_total_data_sz(rb))
		return -E2BIG;

	/* The sample must fit into a struct bpf_dynptr. */
	err = bpf_dynptr_check_size(sample_len);
	if (err)
		return -E2BIG;

	if (flags & BPF_RINGBUF_DISCARD_BIT) {
		/* If the discard bit is set, the sample should be skipped.
		 *
		 * Update the consumer pos, and return -EAGAIN so the caller
		 * knows to skip this sample and try to read the next one.
		 */
		smp_store_release(&rb->consumer_pos, cons_pos + total_len);
		return -EAGAIN;
	}

	if (flags & BPF_RINGBUF_BUSY_BIT)
		return -ENODATA;

	*sample = (void *)((uintptr_t)rb->data +
			   (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
	*size = sample_len;
	return 0;
}

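/* Illustrative user-space producer sketch (not part of this file): the
 * protocol that __bpf_user_ringbuf_peek() above expects, roughly what
 * libbpf's user_ring_buffer__reserve()/__submit() implement for a
 * BPF_MAP_TYPE_USER_RINGBUF map. consumer_pos, producer_pos, data and mask
 * are assumed to come from mmap()'ing the map; a single producer is assumed.
 */
#if 0	/* user-space example, not compiled here */
#include <linux/bpf.h>		/* BPF_RINGBUF_*_BIT, BPF_RINGBUF_HDR_SZ */
#include <stdint.h>
#include <string.h>

static int user_ringbuf_produce(unsigned long *consumer_pos, unsigned long *producer_pos,
				void *data, unsigned long mask,
				const void *sample, uint32_t size)
{
	unsigned long cons = __atomic_load_n(consumer_pos, __ATOMIC_ACQUIRE);
	unsigned long prod = *producer_pos;
	uint32_t total = (size + BPF_RINGBUF_HDR_SZ + 7) & ~7U;
	uint32_t *hdr;

	if (total > mask + 1 - (prod - cons))
		return -1;			/* not enough free space */

	/* 1. write the header with the busy bit set */
	hdr = (uint32_t *)((char *)data + (prod & mask));
	*hdr = size | BPF_RINGBUF_BUSY_BIT;

	/* 2. publish the reservation; pairs with smp_load_acquire() of
	 *    producer_pos in __bpf_user_ringbuf_peek()
	 */
	__atomic_store_n(producer_pos, prod + total, __ATOMIC_RELEASE);

	/* 3. fill in the sample; it is still marked busy, so the kernel
	 *    consumer backs off with -ENODATA until we are done
	 */
	memcpy((char *)data + ((prod + BPF_RINGBUF_HDR_SZ) & mask), sample, size);

	/* 4. clear the busy bit; pairs with smp_load_acquire() of the
	 *    header in __bpf_user_ringbuf_peek()
	 */
	__atomic_store_n(hdr, size, __ATOMIC_RELEASE);
	return 0;
}
#endif
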
static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
{
	u64 consumer_pos;
	u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);

	/* Using smp_load_acquire() is unnecessary here, as the busy-bit
	 * prevents another task from writing to consumer_pos after it was read
	 * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
	 */
	consumer_pos = rb->consumer_pos;
	/* Synchronizes with smp_load_acquire() in user-space producer. */
	smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
}

BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
	   void *, callback_fn, void *, callback_ctx, u64, flags)
{
	struct bpf_ringbuf *rb;
	long samples, discarded_samples = 0, ret = 0;
	bpf_callback_t callback = (bpf_callback_t)callback_fn;
	u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
	int busy = 0;

	if (unlikely(flags & ~wakeup_flags))
		return -EINVAL;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	/* If another consumer is already consuming a sample, wait for them to finish. */
	if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
		return -EBUSY;

	for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
		int err;
		u32 size;
		void *sample;
		struct bpf_dynptr_kern dynptr;

		err = __bpf_user_ringbuf_peek(rb, &sample, &size);
		if (err) {
			if (err == -ENODATA) {
				break;
			} else if (err == -EAGAIN) {
				discarded_samples++;
				continue;
			} else {
				ret = err;
				goto schedule_work_return;
			}
		}

		bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
		ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
		__bpf_user_ringbuf_sample_release(rb, size, flags);
	}
	ret = samples - discarded_samples;

schedule_work_return:
	/* Prevent the clearing of the busy-bit from being reordered before the
	 * storing of any rb consumer or producer positions.
	 */
	atomic_set_release(&rb->busy, 0);

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
		irq_work_queue(&rb->work);
	return ret;
}

const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
	.func = bpf_user_ringbuf_drain,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_FUNC,
	.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type = ARG_ANYTHING,
};

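/* Illustrative BPF-program sketch (not part of this file): draining a
 * BPF_MAP_TYPE_USER_RINGBUF with bpf_user_ringbuf_drain() and a callback,
 * the way a libbpf-built program would. Map name, message layout and attach
 * point are made up for illustration.
 */
#if 0	/* BPF program example, not compiled here */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_USER_RINGBUF);
	__uint(max_entries, 256 * 1024);
} user_events SEC(".maps");

struct user_msg {
	__u64 cookie;
};

static long handle_msg(struct bpf_dynptr *dynptr, void *ctx)
{
	long *handled = ctx;
	struct user_msg msg;

	/* copy the sample out of the dynptr handed to us */
	if (bpf_dynptr_read(&msg, sizeof(msg), dynptr, 0, 0))
		return 0;	/* malformed sample, keep draining */

	(*handled)++;
	return 0;		/* returning 1 would stop the drain early */
}

SEC("tracepoint/syscalls/sys_enter_getpgid")
int drain_user_ringbuf(void *ctx)
{
	long handled = 0;

	/* returns the number of drained samples, or a negative error */
	bpf_user_ringbuf_drain(&user_events, handle_msg, &handled, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
#endif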