// SPDX-License-Identifier: GPL-2.0-only
/* bpf/cpumap.c
 *
 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
 */

/**
 * DOC: cpu map
 * The 'cpumap' is primarily used as a backend map for XDP BPF helper
 * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
 *
 * Unlike devmap which redirects XDP frames out to another NIC device,
 * this map type redirects raw XDP frames to another CPU. The remote
 * CPU will do SKB-allocation and call the normal network stack.
 */
/*
 * This is a scalability and isolation mechanism that allows
 * separating the early driver network XDP layer from the rest of the
 * netstack, and assigning dedicated CPUs for this stage. This
 * basically allows for 10G wirespeed pre-filtering via BPF.
 */
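
/* For orientation, a minimal sketch of the BPF side (not part of this
 * file; map name, size and CPU selection are illustrative): an XDP
 * program spreading packets across the entries of a BPF_MAP_TYPE_CPUMAP
 * via bpf_redirect_map(). The low bits of the flags argument select the
 * fallback action taken when the chosen entry is empty.
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_CPUMAP);
 *		__uint(max_entries, 16);
 *		__type(key, __u32);
 *		__type(value, struct bpf_cpumap_val);
 *	} cpu_map SEC(".maps");
 *
 *	SEC("xdp")
 *	int xdp_redirect_cpu(struct xdp_md *ctx)
 *	{
 *		__u32 cpu = ctx->rx_queue_index % 16;
 *
 *		return bpf_redirect_map(&cpu_map, cpu, XDP_PASS);
 *	}
 */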
#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <net/xdp.h>
#include <net/hotdata.h>

#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>

#include <linux/netdevice.h>   /* netif_receive_skb_list */
#include <linux/etherdevice.h> /* eth_type_trans */
/* General idea: XDP packets getting XDP-redirected to another CPU are
 * stored/queued for at most one driver ->poll() call. It is
 * guaranteed that queueing the frame and the flush operation happen on
 * the same CPU. Thus, the cpu_map_flush operation can deduce via
 * this_cpu_ptr() which queue in bpf_cpu_map_entry contains packets.
 */
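
/* A hedged sketch of that enqueue/flush pairing, in terms of the
 * helpers defined below (driver entry points abbreviated; the
 * xdp_do_flush() hook lives outside this file):
 *
 *	napi->poll()
 *	  bpf_prog_run_xdp()	returns XDP_REDIRECT to a cpumap entry
 *	    cpu_map_enqueue() -> bq_enqueue()	 fills the per-CPU bulkq
 *	  xdp_do_flush()	at the end of the poll cycle
 *	    __cpu_map_flush() -> bq_flush_to_queue() + kthread wakeup
 */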

#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
struct bpf_cpu_map_entry;
struct bpf_cpu_map;

struct xdp_bulk_queue {
	void *q[CPU_MAP_BULK_SIZE];
	struct list_head flush_node;
	struct bpf_cpu_map_entry *obj;
	unsigned int count;
};

/* Struct for every remote "destination" CPU in map */
struct bpf_cpu_map_entry {
	u32 cpu;    /* kthread CPU and map index */
	int map_id; /* Back reference to map */

	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
	struct xdp_bulk_queue __percpu *bulkq;

	/* Queue with potential multi-producers, and single-consumer kthread */
	struct ptr_ring *queue;
	struct task_struct *kthread;

	struct bpf_cpumap_val value;
	struct bpf_prog *prog;

	struct completion kthread_running;
	struct rcu_work free_work;
};

struct bpf_cpu_map {
	struct bpf_map map;
	/* Below members specific for map type */
	struct bpf_cpu_map_entry __rcu **cpu_map;
};

static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);

static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_cpu_map *cmap;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
	     value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
	    attr->map_flags & ~BPF_F_NUMA_NODE)
		return ERR_PTR(-EINVAL);

	/* Pre-limit array size based on NR_CPUS, not final CPU check */
	if (attr->max_entries > NR_CPUS)
		return ERR_PTR(-E2BIG);

	cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
	if (!cmap)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&cmap->map, attr);

	/* Alloc array for possible remote "destination" CPUs */
	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
					   sizeof(struct bpf_cpu_map_entry *),
					   cmap->map.numa_node);
	if (!cmap->cpu_map) {
		bpf_map_area_free(cmap);
		return ERR_PTR(-ENOMEM);
	}

	return &cmap->map;
}
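
/* The two value sizes accepted above mirror how userspace writes the
 * entries: either just a queue size, or a queue size plus a raw XDP
 * program fd. A minimal, illustrative libbpf sketch of creating such a
 * map (error handling omitted):
 *
 *	int map_fd = bpf_map_create(BPF_MAP_TYPE_CPUMAP, "cpu_map",
 *				    sizeof(__u32),
 *				    sizeof(struct bpf_cpumap_val),
 *				    16, NULL);
 */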

static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
	/* The tear-down procedure should have made sure that queue is
	 * empty. See __cpu_map_entry_replace() and work-queue
	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
	 * gracefully and warn once.
	 */
	void *ptr;

	while ((ptr = ptr_ring_consume(ring))) {
		WARN_ON_ONCE(1);
		if (unlikely(__ptr_test_bit(0, &ptr))) {
			__ptr_clear_bit(0, &ptr);
			kfree_skb(ptr);
			continue;
		}
		xdp_return_frame(ptr);
	}
}

static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
				     struct list_head *listp,
				     struct xdp_cpumap_stats *stats)
{
	struct sk_buff *skb, *tmp;
	struct xdp_buff xdp;
	u32 act;
	int err;

	list_for_each_entry_safe(skb, tmp, listp, list) {
		act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
		switch (act) {
		case XDP_PASS:
			break;
		case XDP_REDIRECT:
			skb_list_del_init(skb);
			err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
						      rcpu->prog);
			if (unlikely(err)) {
				kfree_skb(skb);
				stats->drop++;
			} else {
				stats->redirect++;
			}
			return;
		default:
			bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(skb->dev, rcpu->prog, act);
			fallthrough;
		case XDP_DROP:
			skb_list_del_init(skb);
			kfree_skb(skb);
			stats->drop++;
			return;
		}
	}
}

static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
				    void **frames, int n,
				    struct xdp_cpumap_stats *stats)
{
	struct xdp_rxq_info rxq = {};
	struct xdp_buff xdp;
	int i, nframes = 0;

	xdp_set_return_frame_no_direct();
	xdp.rxq = &rxq;

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		u32 act;
		int err;

		rxq.dev = xdpf->dev_rx;
		rxq.mem = xdpf->mem;
		/* TODO: report queue_index to xdp_rxq_info */

		xdp_convert_frame_to_buff(xdpf, &xdp);

		act = bpf_prog_run_xdp(rcpu->prog, &xdp);
		switch (act) {
		case XDP_PASS:
			err = xdp_update_frame_from_buff(&xdp, xdpf);
			if (err < 0) {
				xdp_return_frame(xdpf);
				stats->drop++;
			} else {
				frames[nframes++] = xdpf;
				stats->pass++;
			}
			break;
		case XDP_REDIRECT:
			err = xdp_do_redirect(xdpf->dev_rx, &xdp,
					      rcpu->prog);
			if (unlikely(err)) {
				xdp_return_frame(xdpf);
				stats->drop++;
			} else {
				stats->redirect++;
			}
			break;
		default:
			bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
			fallthrough;
		case XDP_DROP:
			xdp_return_frame(xdpf);
			stats->drop++;
			break;
		}
	}

	xdp_clear_return_frame_no_direct();

	return nframes;
}

#define CPUMAP_BATCH 8

static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
				int xdp_n, struct xdp_cpumap_stats *stats,
				struct list_head *list)
{
	int nframes;

	if (!rcpu->prog)
		return xdp_n;

	rcu_read_lock_bh();

	nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);

	if (stats->redirect)
		xdp_do_flush();

	if (unlikely(!list_empty(list)))
		cpu_map_bpf_prog_run_skb(rcpu, list, stats);

	rcu_read_unlock_bh(); /* resched point, may call do_softirq() */

	return nframes;
}

static int cpu_map_kthread_run(void *data)
{
	struct bpf_cpu_map_entry *rcpu = data;
	unsigned long last_qs = jiffies;

	complete(&rcpu->kthread_running);
	set_current_state(TASK_INTERRUPTIBLE);

	/* When the kthread gets a stop order, the rcpu entry has already
	 * been disconnected from the map, so no new packets can enter.
	 * Remaining in-flight per-CPU stored packets are flushed to this
	 * queue. Wait, honoring the kthread_stop signal, until the queue
	 * is empty.
	 */
	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
		struct xdp_cpumap_stats stats = {}; /* zero stats */
		unsigned int kmem_alloc_drops = 0, sched = 0;
		gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
		int i, n, m, nframes, xdp_n;
		void *frames[CPUMAP_BATCH];
		void *skbs[CPUMAP_BATCH];
		LIST_HEAD(list);

		/* Release CPU reschedule checks */
		if (__ptr_ring_empty(rcpu->queue)) {
			set_current_state(TASK_INTERRUPTIBLE);
			/* Recheck to avoid lost wake-up */
			if (__ptr_ring_empty(rcpu->queue)) {
				schedule();
				sched = 1;
				last_qs = jiffies;
			} else {
				__set_current_state(TASK_RUNNING);
			}
		} else {
			rcu_softirq_qs_periodic(last_qs);
			sched = cond_resched();
		}

		/*
		 * The bpf_cpu_map_entry is single consumer, with this
		 * kthread CPU pinned. Lockless access to the ptr_ring
		 * consume side is valid, as resizing the queue is not
		 * allowed.
		 */
		n = __ptr_ring_consume_batched(rcpu->queue, frames,
					       CPUMAP_BATCH);
		for (i = 0, xdp_n = 0; i < n; i++) {
			void *f = frames[i];
			struct page *page;

			if (unlikely(__ptr_test_bit(0, &f))) {
				struct sk_buff *skb = f;

				__ptr_clear_bit(0, &skb);
				list_add_tail(&skb->list, &list);
				continue;
			}

			frames[xdp_n++] = f;
			page = virt_to_page(f);

			/* Bring struct page memory area to curr CPU. Read by
			 * build_skb_around via page_is_pfmemalloc(), and when
			 * freed written by page_frag_free call.
			 */
			prefetchw(page);
		}

		/* Support running another XDP prog on this CPU */
		nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
		if (nframes) {
			m = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
						  gfp, nframes, skbs);
			if (unlikely(m == 0)) {
				for (i = 0; i < nframes; i++)
					skbs[i] = NULL; /* effect: xdp_return_frame */
				kmem_alloc_drops += nframes;
			}
		}

		local_bh_disable();
		for (i = 0; i < nframes; i++) {
			struct xdp_frame *xdpf = frames[i];
			struct sk_buff *skb = skbs[i];

			skb = __xdp_build_skb_from_frame(xdpf, skb,
							 xdpf->dev_rx);
			if (!skb) {
				xdp_return_frame(xdpf);
				continue;
			}

			list_add_tail(&skb->list, &list);
		}
		netif_receive_skb_list(&list);

		/* Feedback loop via tracepoint */
		trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
					 sched, &stats);

		local_bh_enable(); /* resched point, may call do_softirq() */
	}
	__set_current_state(TASK_RUNNING);

	return 0;
}

static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
				      struct bpf_map *map, int fd)
{
	struct bpf_prog *prog;

	prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
	    !bpf_prog_map_compatible(map, prog)) {
		bpf_prog_put(prog);
		return -EINVAL;
	}

	rcpu->value.bpf_prog.id = prog->aux->id;
	rcpu->prog = prog;

	return 0;
}
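
/* What a program passing the attach-type check above looks like on the
 * BPF side, as an illustrative sketch: with libbpf, the
 * SEC("xdp/cpumap") section name sets expected_attach_type to
 * BPF_XDP_CPUMAP. The program then runs on the remote CPU's kthread for
 * every redirected frame and may pass, drop, or re-redirect it:
 *
 *	SEC("xdp/cpumap")
 *	int xdp_on_remote_cpu(struct xdp_md *ctx)
 *	{
 *		return XDP_PASS;
 *	}
 */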

static struct bpf_cpu_map_entry *
__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
		      u32 cpu)
{
	int numa, err, i, fd = value->bpf_prog.fd;
	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
	struct bpf_cpu_map_entry *rcpu;
	struct xdp_bulk_queue *bq;

	/* Have map->numa_node, but choose node of redirect target CPU */
	numa = cpu_to_node(cpu);

	rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
	if (!rcpu)
		return NULL;

	/* Alloc percpu bulkq */
	rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
					   sizeof(void *), gfp);
	if (!rcpu->bulkq)
		goto free_rcu;

	for_each_possible_cpu(i) {
		bq = per_cpu_ptr(rcpu->bulkq, i);
		bq->obj = rcpu;
	}

	/* Alloc queue */
	rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp,
					   numa);
	if (!rcpu->queue)
		goto free_bulkq;

	err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
	if (err)
		goto free_queue;

	rcpu->cpu = cpu;
	rcpu->map_id = map->id;
	rcpu->value.qsize = value->qsize;

	if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
		goto free_ptr_ring;

	/* Setup kthread */
	init_completion(&rcpu->kthread_running);
	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
					       "cpumap/%d/map:%d", cpu,
					       map->id);
	if (IS_ERR(rcpu->kthread))
		goto free_prog;

	/* Make sure kthread runs on a single CPU */
	kthread_bind(rcpu->kthread, cpu);
	wake_up_process(rcpu->kthread);

	/* Make sure kthread has been running, so kthread_stop() will not
	 * stop the kthread prematurely and all pending frames or skbs
	 * will be handled by the kthread before kthread_stop() returns.
	 */
	wait_for_completion(&rcpu->kthread_running);

	return rcpu;

free_prog:
	if (rcpu->prog)
		bpf_prog_put(rcpu->prog);
free_ptr_ring:
	ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
	kfree(rcpu->queue);
free_bulkq:
	free_percpu(rcpu->bulkq);
free_rcu:
	kfree(rcpu);
	return NULL;
}

static void __cpu_map_entry_free(struct work_struct *work)
{
	struct bpf_cpu_map_entry *rcpu;

	/* This cpu_map_entry has been disconnected from the map and one
	 * RCU grace-period has elapsed. Thus, XDP cannot queue any
	 * new packets and cannot change/set flush_needed that can
	 * find this entry.
	 */
	rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work);

	/* kthread_stop will wake_up_process and wait for it to complete.
	 * cpu_map_kthread_run() makes sure the pointer ring is empty
	 * before exiting.
	 */
	kthread_stop(rcpu->kthread);

	if (rcpu->prog)
		bpf_prog_put(rcpu->prog);
	/* The queue should be empty at this point */
	__cpu_map_ring_cleanup(rcpu->queue);
	ptr_ring_cleanup(rcpu->queue, NULL);
	kfree(rcpu->queue);
	free_percpu(rcpu->bulkq);
	kfree(rcpu);
}

/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old
 * entry is no longer in use before freeing. We use queue_rcu_work() to call
 * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace
 * period. This means that (a) all pending enqueue and flush operations have
 * completed (because of the RCU callback), and (b) we are in a workqueue
 * context where we can stop the kthread and wait for it to exit before freeing
 * everything.
 */
static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
{
	struct bpf_cpu_map_entry *old_rcpu;

	old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
	if (old_rcpu) {
		INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free);
		queue_rcu_work(system_wq, &old_rcpu->free_work);
	}
}

static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 key_cpu = *(u32 *)key;

	if (key_cpu >= map->max_entries)
		return -EINVAL;

	/* notice caller map_delete_elem() uses rcu_read_lock() */
	__cpu_map_entry_replace(cmap, key_cpu, NULL);
	return 0;
}

static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	struct bpf_cpumap_val cpumap_value = {};
	struct bpf_cpu_map_entry *rcpu;
	/* Array index key corresponds to CPU number */
	u32 key_cpu = *(u32 *)key;

	memcpy(&cpumap_value, value, map->value_size);

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(key_cpu >= cmap->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;
	if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
		return -EOVERFLOW;

	/* Make sure CPU is a valid possible cpu */
	if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
		return -ENODEV;

	if (cpumap_value.qsize == 0) {
		rcpu = NULL; /* Same as deleting */
	} else {
		/* Updating qsize causes re-allocation of bpf_cpu_map_entry */
		rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
		if (!rcpu)
			return -ENOMEM;
	}
	rcu_read_lock();
	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
	rcu_read_unlock();
	return 0;
}
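
/* The userspace counterpart of the update path, as an illustrative
 * sketch (CPU number, qsize and prog_fd are placeholders): queue up to
 * 2048 frames on CPU 3 and attach an optional CPUMAP program; a qsize
 * of 0 deletes the entry, per the branch above.
 *
 *	struct bpf_cpumap_val val = {
 *		.qsize = 2048,
 *		.bpf_prog.fd = prog_fd,
 *	};
 *	__u32 key = 3;
 *
 *	bpf_map_update_elem(map_fd, &key, &val, 0);
 */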

static void cpu_map_free(struct bpf_map *map)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 i;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the bpf programs (can be more than one) that used this map were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete. synchronize_rcu() below not only
	 * guarantees no further "XDP/bpf-side" reads against
	 * bpf_cpu_map->cpu_map, but also ensures pending flush operations
	 * (if any) are completed.
	 */
	synchronize_rcu();

	/* The only possible user of bpf_cpu_map_entry is
	 * cpu_map_kthread_run().
	 */
	for (i = 0; i < cmap->map.max_entries; i++) {
		struct bpf_cpu_map_entry *rcpu;

		rcpu = rcu_dereference_raw(cmap->cpu_map[i]);
		if (!rcpu)
			continue;

		/* Stop kthread and cleanup entry directly */
		__cpu_map_entry_free(&rcpu->free_work.work);
	}
	bpf_map_area_free(cmap->cpu_map);
	bpf_map_area_free(cmap);
}

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	struct bpf_cpu_map_entry *rcpu;

	if (key >= map->max_entries)
		return NULL;

	rcpu = rcu_dereference_check(cmap->cpu_map[key],
				     rcu_read_lock_bh_held());
	return rcpu;
}

static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_cpu_map_entry *rcpu =
		__cpu_map_lookup_elem(map, *(u32 *)key);

	return rcpu ? &rcpu->value : NULL;
}

static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= cmap->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == cmap->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
	return __bpf_xdp_redirect_map(map, index, flags, 0,
				      __cpu_map_lookup_elem);
}

static u64 cpu_map_mem_usage(const struct bpf_map *map)
{
	u64 usage = sizeof(struct bpf_cpu_map);

	/* Currently the dynamically allocated elements are not counted */
	usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
	return usage;
}

BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
	.map_meta_equal		= bpf_map_meta_equal,
	.map_alloc		= cpu_map_alloc,
	.map_free		= cpu_map_free,
	.map_delete_elem	= cpu_map_delete_elem,
	.map_update_elem	= cpu_map_update_elem,
	.map_lookup_elem	= cpu_map_lookup_elem,
	.map_get_next_key	= cpu_map_get_next_key,
	.map_check_btf		= map_check_no_btf,
	.map_mem_usage		= cpu_map_mem_usage,
	.map_btf_id		= &cpu_map_btf_ids[0],
	.map_redirect		= cpu_map_redirect,
};

static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
	struct bpf_cpu_map_entry *rcpu = bq->obj;
	unsigned int processed = 0, drops = 0;
	const int to_cpu = rcpu->cpu;
	struct ptr_ring *q;
	int i;

	if (unlikely(!bq->count))
		return;

	q = rcpu->queue;
	spin_lock(&q->producer_lock);

	for (i = 0; i < bq->count; i++) {
		struct xdp_frame *xdpf = bq->q[i];
		int err;

		err = __ptr_ring_produce(q, xdpf);
		if (err) {
			drops++;
			xdp_return_frame_rx_napi(xdpf);
		}
		processed++;
	}
	bq->count = 0;
	spin_unlock(&q->producer_lock);

	__list_del_clearprev(&bq->flush_node);

	/* Feedback loop via tracepoints */
	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
}

/* Runs under RCU-read-side, plus in softirq under NAPI protection.
 * Thus, safe percpu variable access.
 */
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);

	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
		bq_flush_to_queue(bq);

	/* Notice, xdp_buff/page MUST be queued here, long enough for the
	 * driver code invoking us to finish, due to driver
	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
	 *
	 * Thus, the incoming xdp_frame is always queued here (else we race
	 * with another CPU on page-refcnt and remaining driver code).
	 * Queue time is very short, as the driver will invoke the flush
	 * operation when completing the napi->poll call.
	 */
	bq->q[bq->count++] = xdpf;

	if (!bq->flush_node.prev)
		list_add(&bq->flush_node, flush_list);
}

int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	/* Info needed when constructing SKB on remote CPU */
	xdpf->dev_rx = dev_rx;

	bq_enqueue(rcpu, xdpf);
	return 0;
}

int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
			     struct sk_buff *skb)
{
	int ret;

	__skb_pull(skb, skb->mac_len);
	skb_set_redirected(skb, false);
	__ptr_set_bit(0, &skb);

	ret = ptr_ring_produce(rcpu->queue, skb);
	if (ret < 0)
		goto trace;

	wake_up_process(rcpu->kthread);
trace:
	trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
	return ret;
}

void __cpu_map_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
	struct xdp_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		bq_flush_to_queue(bq);

		/* If already running, costs spin_lock_irqsave + smp_mb */
		wake_up_process(bq->obj->kthread);
	}
}

#ifdef CONFIG_DEBUG_NET
bool cpu_map_check_flush(void)
{
	if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
		return false;
	__cpu_map_flush();
	return true;
}
#endif

static int __init cpu_map_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
	return 0;
}

subsys_initcall(cpu_map_init);