syscall.c source code [linux/kernel/bpf/syscall.c]

1	// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 */
4	#include <linux/bpf.h>
5	#include <linux/bpf-cgroup.h>
6	#include <linux/bpf_trace.h>
7	#include <linux/bpf_lirc.h>
8	#include <linux/bpf_verifier.h>
9	#include <linux/bsearch.h>
10	#include <linux/btf.h>
11	#include <linux/syscalls.h>
12	#include <linux/slab.h>
13	#include <linux/sched/signal.h>
14	#include <linux/vmalloc.h>
15	#include <linux/mmzone.h>
16	#include <linux/anon_inodes.h>
17	#include <linux/fdtable.h>
18	#include <linux/file.h>
19	#include <linux/fs.h>
20	#include <linux/license.h>
21	#include <linux/filter.h>
22	#include <linux/kernel.h>
23	#include <linux/idr.h>
24	#include <linux/cred.h>
25	#include <linux/timekeeping.h>
26	#include <linux/ctype.h>
27	#include <linux/nospec.h>
28	#include <linux/audit.h>
29	#include <uapi/linux/btf.h>
30	#include <linux/pgtable.h>
31	#include <linux/bpf_lsm.h>
32	#include <linux/poll.h>
33	#include <linux/sort.h>
34	#include <linux/bpf-netns.h>
35	#include <linux/rcupdate_trace.h>
36	#include <linux/memcontrol.h>
37	#include <linux/trace_events.h>
38
39	#include <net/netfilter/nf_bpf_link.h>
40	#include <net/netkit.h>
41	#include <net/tcx.h>
42
/* Map-type predicates: each one classifies a map by its map_type field. */
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
			IS_FD_HASH(map))

/* The two access-mode flags; see bpf_get_file_flag() for how they are used. */
#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
52
53	DEFINE_PER_CPU(int, bpf_prog_active);
54	static DEFINE_IDR(prog_idr);
55	static DEFINE_SPINLOCK(prog_idr_lock);
56	static DEFINE_IDR(map_idr);
57	static DEFINE_SPINLOCK(map_idr_lock);
58	static DEFINE_IDR(link_idr);
59	static DEFINE_SPINLOCK(link_idr_lock);
60
61	int sysctl_unprivileged_bpf_disabled __read_mostly =
62	IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? `2` : `0`;
63
64	static const struct bpf_map_ops * const bpf_map_types[] = {
65	#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
66	#define BPF_MAP_TYPE(_id, _ops) \
67	[_id] = &_ops,
68	#define BPF_LINK_TYPE(_id, _name)
69	#include <linux/bpf_types.h>
70	#undef BPF_PROG_TYPE
71	#undef BPF_MAP_TYPE
72	#undef BPF_LINK_TYPE
73	};
74
75	/*
76	* If we're handed a bigger struct than we know of, ensure all the unknown bits
77	* are 0 - i.e. new user-space does not rely on any kernel feature extensions
78	* we don't know about yet.
79	*
80	* There is a ToCToU between this function call and the following
81	* copy_from_user() call. However, this is not a concern since this function is
82	* meant to be a future-proofing of bits.
83	*/
84	int bpf_check_uarg_tail_zero(bpfptr_t uaddr,
85	size_t expected_size,
86	size_t actual_size)
87	{
88	int res;
89
90	if (unlikely(actual_size > PAGE_SIZE)) / silly large /
91	return -E2BIG;
92
93	if (actual_size <= expected_size)
94	return `0`;
95
96	if (uaddr.is_kernel)
97	res = memchr_inv(p: uaddr.kernel + expected_size, c: `0`,
98	size: actual_size - expected_size) == NULL;
99	else
100	res = check_zeroed_user(from: uaddr.user + expected_size,
101	size: actual_size - expected_size);
102	if (res < `0`)
103	return res;
104	return res ? `0` : -E2BIG;
105	}
106
107	const struct bpf_map_ops bpf_map_offload_ops = {
108	.map_meta_equal = bpf_map_meta_equal,
109	.map_alloc = bpf_map_offload_map_alloc,
110	.map_free = bpf_map_offload_map_free,
111	.map_check_btf = map_check_no_btf,
112	.map_mem_usage = bpf_map_offload_map_mem_usage,
113	};
114
115	static void bpf_map_write_active_inc(struct bpf_map *map)
116	{
117	atomic64_inc(v: &map->writecnt);
118	}
119
120	static void bpf_map_write_active_dec(struct bpf_map *map)
121	{
122	atomic64_dec(v: &map->writecnt);
123	}
124
125	bool bpf_map_write_active(const struct bpf_map *map)
126	{
127	return atomic64_read(v: &map->writecnt) != `0`;
128	}
129
130	static u32 bpf_map_value_size(const struct bpf_map *map)
131	{
132	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH \|\|
133	map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH \|\|
134	map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY \|\|
135	map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
136	return round_up(map->value_size, `8`) * num_possible_cpus();
137	else if (IS_FD_MAP(map))
138	return sizeof(u32);
139	else
140	return map->value_size;
141	}
142
143	static void maybe_wait_bpf_programs(struct bpf_map *map)
144	{
145	/ Wait for any running non-sleepable BPF programs to complete so that*
146	* userspace, when we return to it, knows that all non-sleepable
147	* programs that could be running use the new map value. For sleepable
148	* BPF programs, synchronize_rcu_tasks_trace() should be used to wait
149	* for the completions of these programs, but considering the waiting
150	* time can be very long and userspace may think it will hang forever,
151	* so don't handle sleepable BPF programs now.
152	*/
153	if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS \|\|
154	map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
155	synchronize_rcu();
156	}
157
158	static int bpf_map_update_value(struct bpf_map map, struct* file *map_file,
159	void key, void* *value, __u64 flags)
160	{
161	int err;
162
163	/ Need to create a kthread, thus must support schedule /
164	if (bpf_map_is_offloaded(map)) {
165	return bpf_map_offload_update_elem(map, key, value, flags);
166	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP \|\|
167	map->map_type == BPF_MAP_TYPE_ARENA \|\|
168	map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
169	return map->ops->map_update_elem(map, key, value, flags);
170	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH \|\|
171	map->map_type == BPF_MAP_TYPE_SOCKMAP) {
172	return sock_map_update_elem_sys(map, key, value, flags);
173	} else if (IS_FD_PROG_ARRAY(map)) {
174	return bpf_fd_array_map_update_elem(map, map_file, key, value,
175	map_flags: flags);
176	}
177
178	bpf_disable_instrumentation();
179	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH \|\|
180	map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
181	err = bpf_percpu_hash_update(map, key, value, flags);
182	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
183	err = bpf_percpu_array_update(map, key, value, flags);
184	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
185	err = bpf_percpu_cgroup_storage_update(map, key, value,
186	flags);
187	} else if (IS_FD_ARRAY(map)) {
188	err = bpf_fd_array_map_update_elem(map, map_file, key, value,
189	map_flags: flags);
190	} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
191	err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
192	map_flags: flags);
193	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
194	/ rcu_read_lock() is not needed /
195	err = bpf_fd_reuseport_array_update_elem(map, key, value,
196	map_flags: flags);
197	} else if (map->map_type == BPF_MAP_TYPE_QUEUE \|\|
198	map->map_type == BPF_MAP_TYPE_STACK \|\|
199	map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
200	err = map->ops->map_push_elem(map, value, flags);
201	} else {
202	rcu_read_lock();
203	err = map->ops->map_update_elem(map, key, value, flags);
204	rcu_read_unlock();
205	}
206	bpf_enable_instrumentation();
207
208	return err;
209	}
210
211	static int bpf_map_copy_value(struct bpf_map map, void* key, void* *value,
212	__u64 flags)
213	{
214	void *ptr;
215	int err;
216
217	if (bpf_map_is_offloaded(map))
218	return bpf_map_offload_lookup_elem(map, key, value);
219
220	bpf_disable_instrumentation();
221	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH \|\|
222	map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
223	err = bpf_percpu_hash_copy(map, key, value);
224	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
225	err = bpf_percpu_array_copy(map, key, value);
226	} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
227	err = bpf_percpu_cgroup_storage_copy(map, key, value);
228	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
229	err = bpf_stackmap_copy(map, key, value);
230	} else if (IS_FD_ARRAY(map) \|\| IS_FD_PROG_ARRAY(map)) {
231	err = bpf_fd_array_map_lookup_elem(map, key, value);
232	} else if (IS_FD_HASH(map)) {
233	err = bpf_fd_htab_map_lookup_elem(map, key, value);
234	} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
235	err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
236	} else if (map->map_type == BPF_MAP_TYPE_QUEUE \|\|
237	map->map_type == BPF_MAP_TYPE_STACK \|\|
238	map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
239	err = map->ops->map_peek_elem(map, value);
240	} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
241	/ struct_ops map requires directly updating "value" /
242	err = bpf_struct_ops_map_sys_lookup_elem(map, key, value);
243	} else {
244	rcu_read_lock();
245	if (map->ops->map_lookup_elem_sys_only)
246	ptr = map->ops->map_lookup_elem_sys_only(map, key);
247	else
248	ptr = map->ops->map_lookup_elem(map, key);
249	if (IS_ERR(ptr)) {
250	err = PTR_ERR(ptr);
251	} else if (!ptr) {
252	err = -ENOENT;
253	} else {
254	err = `0`;
255	if (flags & BPF_F_LOCK)
256	/ lock 'ptr' and copy everything but lock /
257	copy_map_value_locked(map, dst: value, src: ptr, lock_src: true);
258	else
259	copy_map_value(map, dst: value, src: ptr);
260	/ mask lock and timer, since value wasn't zero inited /
261	check_and_init_map_value(map, dst: value);
262	}
263	rcu_read_unlock();
264	}
265
266	bpf_enable_instrumentation();
267
268	return err;
269	}
270
271	/ Please, do not use this function outside from the map creation path*
272	* (e.g. in map update path) without taking care of setting the active
273	* memory cgroup (see at bpf_map_kmalloc_node() for example).
274	*/
275	static void __bpf_map_area_alloc(u64 size, int* numa_node, bool mmapable)
276	{
277	/ We really just want to fail instead of triggering OOM killer*
278	* under memory pressure, therefore we set __GFP_NORETRY to kmalloc,
279	* which is used for lower order allocation requests.
280	*
281	* It has been observed that higher order allocation requests done by
282	* vmalloc with __GFP_NORETRY being set might fail due to not trying
283	* to reclaim memory from the page cache, thus we set
284	* __GFP_RETRY_MAYFAIL to avoid such situations.
285	*/
286
287	gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN \| __GFP_ZERO);
288	unsigned int flags = `0`;
289	unsigned long align = `1`;
290	void *area;
291
292	if (size >= SIZE_MAX)
293	return NULL;
294
295	/ kmalloc()'ed memory can't be mmap()'ed /
296	if (mmapable) {
297	BUG_ON(!PAGE_ALIGNED(size));
298	align = SHMLBA;
299	flags = VM_USERMAP;
300	} else if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
301	area = kmalloc_node(size, flags: gfp \| GFP_USER \| __GFP_NORETRY,
302	node: numa_node);
303	if (area != NULL)
304	return area;
305	}
306
307	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
308	gfp_mask: gfp \| GFP_KERNEL \| __GFP_RETRY_MAYFAIL, PAGE_KERNEL,
309	vm_flags: flags, node: numa_node, caller: __builtin_return_address(`0`));
310	}
311
312	void bpf_map_area_alloc(u64 size, int* numa_node)
313	{
314	return __bpf_map_area_alloc(size, numa_node, mmapable: false);
315	}
316
317	void bpf_map_area_mmapable_alloc(u64 size, int* numa_node)
318	{
319	return __bpf_map_area_alloc(size, numa_node, mmapable: true);
320	}
321
/*
 * Release an area obtained from the bpf_map_area_*alloc() helpers. kvfree()
 * is used because the area may come from either kmalloc or vmalloc.
 */
void bpf_map_area_free(void *area)
{
	kvfree(area);
}
326
327	static u32 bpf_map_flags_retain_permanent(u32 flags)
328	{
329	/ Some map creation flags are not tied to the map object but*
330	* rather to the map fd instead, so they have no meaning upon
331	* map object inspection since multiple file descriptors with
332	* different (access) properties can exist here. Thus, given
333	* this has zero meaning for the map itself, lets clear these
334	* from here.
335	*/
336	return flags & ~(BPF_F_RDONLY \| BPF_F_WRONLY);
337	}
338
339	void bpf_map_init_from_attr(struct bpf_map map, union* bpf_attr *attr)
340	{
341	map->map_type = attr->map_type;
342	map->key_size = attr->key_size;
343	map->value_size = attr->value_size;
344	map->max_entries = attr->max_entries;
345	map->map_flags = bpf_map_flags_retain_permanent(flags: attr->map_flags);
346	map->numa_node = bpf_map_attr_numa_node(attr);
347	map->map_extra = attr->map_extra;
348	}
349
350	static int bpf_map_alloc_id(struct bpf_map *map)
351	{
352	int id;
353
354	idr_preload(GFP_KERNEL);
355	spin_lock_bh(lock: &map_idr_lock);
356	id = idr_alloc_cyclic(&map_idr, ptr: map, start: `1`, INT_MAX, GFP_ATOMIC);
357	if (id > `0`)
358	map->id = id;
359	spin_unlock_bh(lock: &map_idr_lock);
360	idr_preload_end();
361
362	if (WARN_ON_ONCE(!id))
363	return -ENOSPC;
364
365	return id > `0` ? `0` : id;
366	}
367
368	void bpf_map_free_id(struct bpf_map *map)
369	{
370	unsigned long flags;
371
372	/ Offloaded maps are removed from the IDR store when their device*
373	* disappears - even if someone holds an fd to them they are unusable,
374	* the memory is gone, all ops will fail; they are simply waiting for
375	* refcnt to drop to be freed.
376	*/
377	if (!map->id)
378	return;
379
380	spin_lock_irqsave(&map_idr_lock, flags);
381
382	idr_remove(&map_idr, id: map->id);
383	map->id = `0`;
384
385	spin_unlock_irqrestore(lock: &map_idr_lock, flags);
386	}
387
388	#ifdef CONFIG_MEMCG_KMEM
389	static void bpf_map_save_memcg(struct bpf_map *map)
390	{
391	/ Currently if a map is created by a process belonging to the root*
392	* memory cgroup, get_obj_cgroup_from_current() will return NULL.
393	* So we have to check map->objcg for being NULL each time it's
394	* being used.
395	*/
396	if (memcg_bpf_enabled())
397	map->objcg = get_obj_cgroup_from_current();
398	}
399
400	static void bpf_map_release_memcg(struct bpf_map *map)
401	{
402	if (map->objcg)
403	obj_cgroup_put(objcg: map->objcg);
404	}
405
406	static struct mem_cgroup bpf_map_get_memcg(const* struct bpf_map *map)
407	{
408	if (map->objcg)
409	return get_mem_cgroup_from_objcg(objcg: map->objcg);
410
411	return root_mem_cgroup;
412	}
413
414	void bpf_map_kmalloc_node(const* struct bpf_map *map, size_t size, gfp_t flags,
415	int node)
416	{
417	struct mem_cgroup memcg, old_memcg;
418	void *ptr;
419
420	memcg = bpf_map_get_memcg(map);
421	old_memcg = set_active_memcg(memcg);
422	ptr = kmalloc_node(size, flags: flags \| __GFP_ACCOUNT, node);
423	set_active_memcg(old_memcg);
424	mem_cgroup_put(memcg);
425
426	return ptr;
427	}
428
429	void bpf_map_kzalloc(const* struct bpf_map *map, size_t size, gfp_t flags)
430	{
431	struct mem_cgroup memcg, old_memcg;
432	void *ptr;
433
434	memcg = bpf_map_get_memcg(map);
435	old_memcg = set_active_memcg(memcg);
436	ptr = kzalloc(size, flags: flags \| __GFP_ACCOUNT);
437	set_active_memcg(old_memcg);
438	mem_cgroup_put(memcg);
439
440	return ptr;
441	}
442
443	void bpf_map_kvcalloc(struct* bpf_map *map, size_t n, size_t size,
444	gfp_t flags)
445	{
446	struct mem_cgroup memcg, old_memcg;
447	void *ptr;
448
449	memcg = bpf_map_get_memcg(map);
450	old_memcg = set_active_memcg(memcg);
451	ptr = kvcalloc(n, size, flags: flags \| __GFP_ACCOUNT);
452	set_active_memcg(old_memcg);
453	mem_cgroup_put(memcg);
454
455	return ptr;
456	}
457
458	void __percpu bpf_map_alloc_percpu(const* struct bpf_map *map, size_t size,
459	size_t align, gfp_t flags)
460	{
461	struct mem_cgroup memcg, old_memcg;
462	void __percpu *ptr;
463
464	memcg = bpf_map_get_memcg(map);
465	old_memcg = set_active_memcg(memcg);
466	ptr = __alloc_percpu_gfp(size, align, gfp: flags \| __GFP_ACCOUNT);
467	set_active_memcg(old_memcg);
468	mem_cgroup_put(memcg);
469
470	return ptr;
471	}
472
473	#else
/* No-op variant used when CONFIG_MEMCG_KMEM is not set. */
static void bpf_map_save_memcg(struct bpf_map *map)
{
}
477
/* No-op variant used when CONFIG_MEMCG_KMEM is not set. */
static void bpf_map_release_memcg(struct bpf_map *map)
{
}
481	#endif
482
483	int bpf_map_alloc_pages(const struct bpf_map map, gfp_t gfp, int* nid,
484	unsigned long nr_pages, struct page **pages)
485	{
486	unsigned long i, j;
487	struct page *pg;
488	int ret = `0`;
489	#ifdef CONFIG_MEMCG_KMEM
490	struct mem_cgroup memcg, old_memcg;
491
492	memcg = bpf_map_get_memcg(map);
493	old_memcg = set_active_memcg(memcg);
494	#endif
495	for (i = `0`; i < nr_pages; i++) {
496	pg = alloc_pages_node(nid, gfp_mask: gfp \| __GFP_ACCOUNT, order: `0`);
497
498	if (pg) {
499	pages[i] = pg;
500	continue;
501	}
502	for (j = `0`; j < i; j++)
503	__free_page(pages[j]);
504	ret = -ENOMEM;
505	break;
506	}
507
508	#ifdef CONFIG_MEMCG_KMEM
509	set_active_memcg(old_memcg);
510	mem_cgroup_put(memcg);
511	#endif
512	return ret;
513	}
514
515
516	static int btf_field_cmp(const void a, const* void *b)
517	{
518	const struct btf_field f1 = a, f2 = b;
519
520	if (f1->offset < f2->offset)
521	return -`1`;
522	else if (f1->offset > f2->offset)
523	return `1`;
524	return `0`;
525	}
526
527	struct btf_field btf_record_find(const* struct btf_record *rec, u32 offset,
528	u32 field_mask)
529	{
530	struct btf_field *field;
531
532	if (IS_ERR_OR_NULL(ptr: rec) \|\| !(rec->field_mask & field_mask))
533	return NULL;
534	field = bsearch(key: &offset, base: rec->fields, num: rec->cnt, size: sizeof(rec->fields[`0`]), cmp: btf_field_cmp);
535	if (!field \|\| !(field->type & field_mask))
536	return NULL;
537	return field;
538	}
539
540	void btf_record_free(struct btf_record *rec)
541	{
542	int i;
543
544	if (IS_ERR_OR_NULL(ptr: rec))
545	return;
546	for (i = `0`; i < rec->cnt; i++) {
547	switch (rec->fields[i].type) {
548	case BPF_KPTR_UNREF:
549	case BPF_KPTR_REF:
550	case BPF_KPTR_PERCPU:
551	if (rec->fields[i].kptr.module)
552	module_put(module: rec->fields[i].kptr.module);
553	btf_put(btf: rec->fields[i].kptr.btf);
554	break;
555	case BPF_LIST_HEAD:
556	case BPF_LIST_NODE:
557	case BPF_RB_ROOT:
558	case BPF_RB_NODE:
559	case BPF_SPIN_LOCK:
560	case BPF_TIMER:
561	case BPF_REFCOUNT:
562	/ Nothing to release /
563	break;
564	default:
565	WARN_ON_ONCE(`1`);
566	continue;
567	}
568	}
569	kfree(objp: rec);
570	}
571
572	void bpf_map_free_record(struct bpf_map *map)
573	{
574	btf_record_free(rec: map->record);
575	map->record = NULL;
576	}
577
578	struct btf_record btf_record_dup(const* struct btf_record *rec)
579	{
580	const struct btf_field *fields;
581	struct btf_record *new_rec;
582	int ret, size, i;
583
584	if (IS_ERR_OR_NULL(ptr: rec))
585	return NULL;
586	size = offsetof(struct btf_record, fields[rec->cnt]);
587	new_rec = kmemdup(p: rec, size, GFP_KERNEL \| __GFP_NOWARN);
588	if (!new_rec)
589	return ERR_PTR(error: -ENOMEM);
590	/ Do a deep copy of the btf_record /
591	fields = rec->fields;
592	new_rec->cnt = `0`;
593	for (i = `0`; i < rec->cnt; i++) {
594	switch (fields[i].type) {
595	case BPF_KPTR_UNREF:
596	case BPF_KPTR_REF:
597	case BPF_KPTR_PERCPU:
598	btf_get(btf: fields[i].kptr.btf);
599	if (fields[i].kptr.module && !try_module_get(module: fields[i].kptr.module)) {
600	ret = -ENXIO;
601	goto free;
602	}
603	break;
604	case BPF_LIST_HEAD:
605	case BPF_LIST_NODE:
606	case BPF_RB_ROOT:
607	case BPF_RB_NODE:
608	case BPF_SPIN_LOCK:
609	case BPF_TIMER:
610	case BPF_REFCOUNT:
611	/ Nothing to acquire /
612	break;
613	default:
614	ret = -EFAULT;
615	WARN_ON_ONCE(`1`);
616	goto free;
617	}
618	new_rec->cnt++;
619	}
620	return new_rec;
621	free:
622	btf_record_free(rec: new_rec);
623	return ERR_PTR(error: ret);
624	}
625
626	bool btf_record_equal(const struct btf_record rec_a, const* struct btf_record *rec_b)
627	{
628	bool a_has_fields = !IS_ERR_OR_NULL(ptr: rec_a), b_has_fields = !IS_ERR_OR_NULL(ptr: rec_b);
629	int size;
630
631	if (!a_has_fields && !b_has_fields)
632	return true;
633	if (a_has_fields != b_has_fields)
634	return false;
635	if (rec_a->cnt != rec_b->cnt)
636	return false;
637	size = offsetof(struct btf_record, fields[rec_a->cnt]);
638	/ btf_parse_fields uses kzalloc to allocate a btf_record, so unused*
639	* members are zeroed out. So memcmp is safe to do without worrying
640	* about padding/unused fields.
641	*
642	* While spin_lock, timer, and kptr have no relation to map BTF,
643	* list_head metadata is specific to map BTF, the btf and value_rec
644	* members in particular. btf is the map BTF, while value_rec points to
645	* btf_record in that map BTF.
646	*
647	* So while by default, we don't rely on the map BTF (which the records
648	* were parsed from) matching for both records, which is not backwards
649	* compatible, in case list_head is part of it, we implicitly rely on
650	* that by way of depending on memcmp succeeding for it.
651	*/
652	return !memcmp(p: rec_a, q: rec_b, size);
653	}
654
655	void bpf_obj_free_timer(const struct btf_record rec, void* *obj)
656	{
657	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
658	return;
659	bpf_timer_cancel_and_free(timer: obj + rec->timer_off);
660	}
661
662	void bpf_obj_free_fields(const struct btf_record rec, void* *obj)
663	{
664	const struct btf_field *fields;
665	int i;
666
667	if (IS_ERR_OR_NULL(ptr: rec))
668	return;
669	fields = rec->fields;
670	for (i = `0`; i < rec->cnt; i++) {
671	struct btf_struct_meta *pointee_struct_meta;
672	const struct btf_field *field = &fields[i];
673	void *field_ptr = obj + field->offset;
674	void *xchgd_field;
675
676	switch (fields[i].type) {
677	case BPF_SPIN_LOCK:
678	break;
679	case BPF_TIMER:
680	bpf_timer_cancel_and_free(timer: field_ptr);
681	break;
682	case BPF_KPTR_UNREF:
683	WRITE_ONCE((u64 )field_ptr, `0`);
684	break;
685	case BPF_KPTR_REF:
686	case BPF_KPTR_PERCPU:
687	xchgd_field = (void )xchg((unsigned* long *)field_ptr, `0`);
688	if (!xchgd_field)
689	break;
690
691	if (!btf_is_kernel(btf: field->kptr.btf)) {
692	pointee_struct_meta = btf_find_struct_meta(btf: field->kptr.btf,
693	btf_id: field->kptr.btf_id);
694	migrate_disable();
695	__bpf_obj_drop_impl(p: xchgd_field, rec: pointee_struct_meta ?
696	pointee_struct_meta->record : NULL,
697	percpu: fields[i].type == BPF_KPTR_PERCPU);
698	migrate_enable();
699	} else {
700	field->kptr.dtor(xchgd_field);
701	}
702	break;
703	case BPF_LIST_HEAD:
704	if (WARN_ON_ONCE(rec->spin_lock_off < `0`))
705	continue;
706	bpf_list_head_free(field, list_head: field_ptr, spin_lock: obj + rec->spin_lock_off);
707	break;
708	case BPF_RB_ROOT:
709	if (WARN_ON_ONCE(rec->spin_lock_off < `0`))
710	continue;
711	bpf_rb_root_free(field, rb_root: field_ptr, spin_lock: obj + rec->spin_lock_off);
712	break;
713	case BPF_LIST_NODE:
714	case BPF_RB_NODE:
715	case BPF_REFCOUNT:
716	break;
717	default:
718	WARN_ON_ONCE(`1`);
719	continue;
720	}
721	}
722	}
723
724	/ called from workqueue /
725	static void bpf_map_free_deferred(struct work_struct *work)
726	{
727	struct bpf_map map = container_of(work, struct* bpf_map, work);
728	struct btf_record *rec = map->record;
729	struct btf *btf = map->btf;
730
731	security_bpf_map_free(map);
732	bpf_map_release_memcg(map);
733	/ implementation dependent freeing /
734	map->ops->map_free(map);
735	/ Delay freeing of btf_record for maps, as map_free*
736	* callback usually needs access to them. It is better to do it here
737	* than require each callback to do the free itself manually.
738	*
739	* Note that the btf_record stashed in map->inner_map_meta->record was
740	* already freed using the map_free callback for map in map case which
741	* eventually calls bpf_map_free_meta, since inner_map_meta is only a
742	* template bpf_map struct used during verification.
743	*/
744	btf_record_free(rec);
745	/ Delay freeing of btf for maps, as map_free callback may need*
746	* struct_meta info which will be freed with btf_put().
747	*/
748	btf_put(btf);
749	}
750
751	static void bpf_map_put_uref(struct bpf_map *map)
752	{
753	if (atomic64_dec_and_test(v: &map->usercnt)) {
754	if (map->ops->map_release_uref)
755	map->ops->map_release_uref(map);
756	}
757	}
758
759	static void bpf_map_free_in_work(struct bpf_map *map)
760	{
761	INIT_WORK(&map->work, bpf_map_free_deferred);
762	/ Avoid spawning kworkers, since they all might contend*
763	* for the same mutex like slab_mutex.
764	*/
765	queue_work(wq: system_unbound_wq, work: &map->work);
766	}
767
768	static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
769	{
770	bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
771	}
772
773	static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
774	{
775	if (rcu_trace_implies_rcu_gp())
776	bpf_map_free_rcu_gp(rcu);
777	else
778	call_rcu(head: rcu, func: bpf_map_free_rcu_gp);
779	}
780
781	/ decrement map refcnt and schedule it for freeing via workqueue*
782	* (underlying map implementation ops->map_free() might sleep)
783	*/
784	void bpf_map_put(struct bpf_map *map)
785	{
786	if (atomic64_dec_and_test(v: &map->refcnt)) {
787	/ bpf_map_free_id() must be called first /
788	bpf_map_free_id(map);
789
790	WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
791	if (READ_ONCE(map->free_after_mult_rcu_gp))
792	call_rcu_tasks_trace(rhp: &map->rcu, func: bpf_map_free_mult_rcu_gp);
793	else if (READ_ONCE(map->free_after_rcu_gp))
794	call_rcu(head: &map->rcu, func: bpf_map_free_rcu_gp);
795	else
796	bpf_map_free_in_work(map);
797	}
798	}
799	EXPORT_SYMBOL_GPL(bpf_map_put);
800
/* Drop one usercnt reference and then one refcnt reference on @map. */
void bpf_map_put_with_uref(struct bpf_map *map)
{
	bpf_map_put_uref(map);
	bpf_map_put(map);
}
806
807	static int bpf_map_release(struct inode inode, struct* file *filp)
808	{
809	struct bpf_map *map = filp->private_data;
810
811	if (map->ops->map_release)
812	map->ops->map_release(map, filp);
813
814	bpf_map_put_with_uref(map);
815	return `0`;
816	}
817
818	static fmode_t map_get_sys_perms(struct bpf_map map, struct* fd f)
819	{
820	fmode_t mode = f.file->f_mode;
821
822	/ Our file permissions may have been overridden by global*
823	* map permissions facing syscall side.
824	*/
825	if (READ_ONCE(map->frozen))
826	mode &= ~FMODE_CAN_WRITE;
827	return mode;
828	}
829
830	#ifdef CONFIG_PROC_FS
831	/ Show the memory usage of a bpf map /
832	static u64 bpf_map_memory_usage(const struct bpf_map *map)
833	{
834	return map->ops->map_mem_usage(map);
835	}
836
837	static void bpf_map_show_fdinfo(struct seq_file m, struct* file *filp)
838	{
839	struct bpf_map *map = filp->private_data;
840	u32 type = `0`, jited = `0`;
841
842	if (map_type_contains_progs(map)) {
843	spin_lock(lock: &map->owner.lock);
844	type = map->owner.type;
845	jited = map->owner.jited;
846	spin_unlock(lock: &map->owner.lock);
847	}
848
849	seq_printf(m,
850	fmt: "map_type:\t%u\n"
851	"key_size:\t%u\n"
852	"value_size:\t%u\n"
853	"max_entries:\t%u\n"
854	"map_flags:\t%#x\n"
855	"map_extra:\t%#llx\n"
856	"memlock:\t%llu\n"
857	"map_id:\t%u\n"
858	"frozen:\t%u\n",
859	map->map_type,
860	map->key_size,
861	map->value_size,
862	map->max_entries,
863	map->map_flags,
864	(unsigned long long)map->map_extra,
865	bpf_map_memory_usage(map),
866	map->id,
867	READ_ONCE(map->frozen));
868	if (type) {
869	seq_printf(m, fmt: "owner_prog_type:\t%u\n", type);
870	seq_printf(m, fmt: "owner_jited:\t%u\n", jited);
871	}
872	}
873	#endif
874
875	static ssize_t bpf_dummy_read(struct file filp, char* __user *buf, size_t siz,
876	loff_t *ppos)
877	{
878	/ We need this handler such that alloc_file() enables*
879	* f_mode with FMODE_CAN_READ.
880	*/
881	return -EINVAL;
882	}
883
884	static ssize_t bpf_dummy_write(struct file filp, const* char __user *buf,
885	size_t siz, loff_t *ppos)
886	{
887	/ We need this handler such that alloc_file() enables*
888	* f_mode with FMODE_CAN_WRITE.
889	*/
890	return -EINVAL;
891	}
892
893	/ called for any extra memory-mapped regions (except initial) /
894	static void bpf_map_mmap_open(struct vm_area_struct *vma)
895	{
896	struct bpf_map *map = vma->vm_file->private_data;
897
898	if (vma->vm_flags & VM_MAYWRITE)
899	bpf_map_write_active_inc(map);
900	}
901
902	/ called for all unmapped memory region (including initial) /
903	static void bpf_map_mmap_close(struct vm_area_struct *vma)
904	{
905	struct bpf_map *map = vma->vm_file->private_data;
906
907	if (vma->vm_flags & VM_MAYWRITE)
908	bpf_map_write_active_dec(map);
909	}
910
911	static const struct vm_operations_struct bpf_map_default_vmops = {
912	.open = bpf_map_mmap_open,
913	.close = bpf_map_mmap_close,
914	};
915
916	static int bpf_map_mmap(struct file filp, struct* vm_area_struct *vma)
917	{
918	struct bpf_map *map = filp->private_data;
919	int err;
920
921	if (!map->ops->map_mmap \|\| !IS_ERR_OR_NULL(ptr: map->record))
922	return -ENOTSUPP;
923
924	if (!(vma->vm_flags & VM_SHARED))
925	return -EINVAL;
926
927	mutex_lock(&map->freeze_mutex);
928
929	if (vma->vm_flags & VM_WRITE) {
930	if (map->frozen) {
931	err = -EPERM;
932	goto out;
933	}
934	/ map is meant to be read-only, so do not allow mapping as*
935	* writable, because it's possible to leak a writable page
936	* reference and allows user-space to still modify it after
937	* freezing, while verifier will assume contents do not change
938	*/
939	if (map->map_flags & BPF_F_RDONLY_PROG) {
940	err = -EACCES;
941	goto out;
942	}
943	}
944
945	/ set default open/close callbacks /
946	vma->vm_ops = &bpf_map_default_vmops;
947	vma->vm_private_data = map;
948	vm_flags_clear(vma, VM_MAYEXEC);
949	if (!(vma->vm_flags & VM_WRITE))
950	/ disallow re-mapping with PROT_WRITE /
951	vm_flags_clear(vma, VM_MAYWRITE);
952
953	err = map->ops->map_mmap(map, vma);
954	if (err)
955	goto out;
956
957	if (vma->vm_flags & VM_MAYWRITE)
958	bpf_map_write_active_inc(map);
959	out:
960	mutex_unlock(lock: &map->freeze_mutex);
961	return err;
962	}
963
964	static __poll_t bpf_map_poll(struct file filp, struct* poll_table_struct *pts)
965	{
966	struct bpf_map *map = filp->private_data;
967
968	if (map->ops->map_poll)
969	return map->ops->map_poll(map, filp, pts);
970
971	return EPOLLERR;
972	}
973
974	static unsigned long bpf_get_unmapped_area(struct file filp, unsigned* long addr,
975	unsigned long len, unsigned long pgoff,
976	unsigned long flags)
977	{
978	struct bpf_map *map = filp->private_data;
979
980	if (map->ops->map_get_unmapped_area)
981	return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
982	#ifdef CONFIG_MMU
983	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
984	#else
985	return addr;
986	#endif
987	}
988
989	const struct file_operations bpf_map_fops = {
990	#ifdef CONFIG_PROC_FS
991	.show_fdinfo = bpf_map_show_fdinfo,
992	#endif
993	.release = bpf_map_release,
994	.read = bpf_dummy_read,
995	.write = bpf_dummy_write,
996	.mmap = bpf_map_mmap,
997	.poll = bpf_map_poll,
998	.get_unmapped_area = bpf_get_unmapped_area,
999	};
1000
1001	int bpf_map_new_fd(struct bpf_map map, int* flags)
1002	{
1003	int ret;
1004
1005	ret = security_bpf_map(map, OPEN_FMODE(flags));
1006	if (ret < `0`)
1007	return ret;
1008
1009	return anon_inode_getfd(name: "bpf-map", fops: &bpf_map_fops, priv: map,
1010	flags: flags \| O_CLOEXEC);
1011	}
1012
1013	int bpf_get_file_flag(int flags)
1014	{
1015	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
1016	return -EINVAL;
1017	if (flags & BPF_F_RDONLY)
1018	return O_RDONLY;
1019	if (flags & BPF_F_WRONLY)
1020	return O_WRONLY;
1021	return O_RDWR;
1022	}
1023
/* helper macro to check that unused fields 'union bpf_attr' are zero
 *
 * Expects a 'union bpf_attr *attr' in scope and a CMD##_LAST_FIELD macro
 * naming the last field used by CMD. Evaluates to true when any byte after
 * that field is non-zero.
 */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
1031
/* dst and src must have at least "size" number of bytes.
 * Return strlen on success and < 0 on error.
 *
 * dst is zero-filled first, so it is NUL-terminated on success. -EINVAL is
 * returned when src contains a character other than alphanumerics, '_' and
 * '.', or when no '\0' is found within the first "size" bytes of src.
 */
int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
{
	const char *end = src + size;
	const char *orig_src = src;

	memset(dst, 0, size);
	/* Copy all isalnum(), '_' and '.' chars. */
	while (src < end && *src) {
		if (!isalnum(*src) &&
		    *src != '_' && *src != '.')
			return -EINVAL;
		*dst++ = *src++;
	}

	/* No '\0' found in "size" number of bytes */
	if (src == end)
		return -EINVAL;

	return src - orig_src;
}
1055
1056	int map_check_no_btf(const struct bpf_map *map,
1057	const struct btf *btf,
1058	const struct btf_type *key_type,
1059	const struct btf_type *value_type)
1060	{
1061	return -ENOTSUPP;
1062	}
1063
1064	static int map_check_btf(struct bpf_map map, struct* bpf_token *token,
1065	const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
1066	{
1067	const struct btf_type key_type, value_type;
1068	u32 key_size, value_size;
1069	int ret = `0`;
1070
1071	/ Some maps allow key to be unspecified. /
1072	if (btf_key_id) {
1073	key_type = btf_type_id_size(btf, type_id: &btf_key_id, ret_size: &key_size);
1074	if (!key_type \|\| key_size != map->key_size)
1075	return -EINVAL;
1076	} else {
1077	key_type = btf_type_by_id(btf, type_id: `0`);
1078	if (!map->ops->map_check_btf)
1079	return -EINVAL;
1080	}
1081
1082	value_type = btf_type_id_size(btf, type_id: &btf_value_id, ret_size: &value_size);
1083	if (!value_type \|\| value_size != map->value_size)
1084	return -EINVAL;
1085
1086	map->record = btf_parse_fields(btf, t: value_type,
1087	field_mask: BPF_SPIN_LOCK \| BPF_TIMER \| BPF_KPTR \| BPF_LIST_HEAD \|
1088	BPF_RB_ROOT \| BPF_REFCOUNT,
1089	value_size: map->value_size);
1090	if (!IS_ERR_OR_NULL(ptr: map->record)) {
1091	int i;
1092
1093	if (!bpf_token_capable(token, CAP_BPF)) {
1094	ret = -EPERM;
1095	goto free_map_tab;
1096	}
1097	if (map->map_flags & (BPF_F_RDONLY_PROG \| BPF_F_WRONLY_PROG)) {
1098	ret = -EACCES;
1099	goto free_map_tab;
1100	}
1101	for (i = `0`; i < sizeof(map->record->field_mask) * `8`; i++) {
1102	switch (map->record->field_mask & (`1` << i)) {
1103	case `0`:
1104	continue;
1105	case BPF_SPIN_LOCK:
1106	if (map->map_type != BPF_MAP_TYPE_HASH &&
1107	map->map_type != BPF_MAP_TYPE_ARRAY &&
1108	map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
1109	map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1110	map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1111	map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1112	map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1113	ret = -EOPNOTSUPP;
1114	goto free_map_tab;
1115	}
1116	break;
1117	case BPF_TIMER:
1118	if (map->map_type != BPF_MAP_TYPE_HASH &&
1119	map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1120	map->map_type != BPF_MAP_TYPE_ARRAY) {
1121	ret = -EOPNOTSUPP;
1122	goto free_map_tab;
1123	}
1124	break;
1125	case BPF_KPTR_UNREF:
1126	case BPF_KPTR_REF:
1127	case BPF_KPTR_PERCPU:
1128	case BPF_REFCOUNT:
1129	if (map->map_type != BPF_MAP_TYPE_HASH &&
1130	map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
1131	map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1132	map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
1133	map->map_type != BPF_MAP_TYPE_ARRAY &&
1134	map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
1135	map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
1136	map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
1137	map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
1138	map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
1139	ret = -EOPNOTSUPP;
1140	goto free_map_tab;
1141	}
1142	break;
1143	case BPF_LIST_HEAD:
1144	case BPF_RB_ROOT:
1145	if (map->map_type != BPF_MAP_TYPE_HASH &&
1146	map->map_type != BPF_MAP_TYPE_LRU_HASH &&
1147	map->map_type != BPF_MAP_TYPE_ARRAY) {
1148	ret = -EOPNOTSUPP;
1149	goto free_map_tab;
1150	}
1151	break;
1152	default:
1153	/ Fail if map_type checks are missing for a field type /
1154	ret = -EOPNOTSUPP;
1155	goto free_map_tab;
1156	}
1157	}
1158	}
1159
1160	ret = btf_check_and_fixup_fields(btf, rec: map->record);
1161	if (ret < `0`)
1162	goto free_map_tab;
1163
1164	if (map->ops->map_check_btf) {
1165	ret = map->ops->map_check_btf(map, btf, key_type, value_type);
1166	if (ret < `0`)
1167	goto free_map_tab;
1168	}
1169
1170	return ret;
1171	free_map_tab:
1172	bpf_map_free_record(map);
1173	return ret;
1174	}
1175
1176	static bool bpf_net_capable(void)
1177	{
1178	return capable(CAP_NET_ADMIN) \|\| capable(CAP_SYS_ADMIN);
1179	}
1180
1181	#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
1182	/ called via syscall /
1183	static int map_create(union bpf_attr *attr)
1184	{
1185	const struct bpf_map_ops *ops;
1186	struct bpf_token *token = NULL;
1187	int numa_node = bpf_map_attr_numa_node(attr);
1188	u32 map_type = attr->map_type;
1189	struct bpf_map *map;
1190	bool token_flag;
1191	int f_flags;
1192	int err;
1193
1194	err = CHECK_ATTR(BPF_MAP_CREATE);
1195	if (err)
1196	return -EINVAL;
1197
1198	/ check BPF_F_TOKEN_FD flag, remember if it's set, and then clear it*
1199	* to avoid per-map type checks tripping on unknown flag
1200	*/
1201	token_flag = attr->map_flags & BPF_F_TOKEN_FD;
1202	attr->map_flags &= ~BPF_F_TOKEN_FD;
1203
1204	if (attr->btf_vmlinux_value_type_id) {
1205	if (attr->map_type != BPF_MAP_TYPE_STRUCT_OPS \|\|
1206	attr->btf_key_type_id \|\| attr->btf_value_type_id)
1207	return -EINVAL;
1208	} else if (attr->btf_key_type_id && !attr->btf_value_type_id) {
1209	return -EINVAL;
1210	}
1211
1212	if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
1213	attr->map_type != BPF_MAP_TYPE_ARENA &&
1214	attr->map_extra != `0`)
1215	return -EINVAL;
1216
1217	f_flags = bpf_get_file_flag(flags: attr->map_flags);
1218	if (f_flags < `0`)
1219	return f_flags;
1220
1221	if (numa_node != NUMA_NO_NODE &&
1222	((unsigned int)numa_node >= nr_node_ids \|\|
1223	!node_online(numa_node)))
1224	return -EINVAL;
1225
1226	/ find map type and init map: hashtable vs rbtree vs bloom vs ... /
1227	map_type = attr->map_type;
1228	if (map_type >= ARRAY_SIZE(bpf_map_types))
1229	return -EINVAL;
1230	map_type = array_index_nospec(map_type, ARRAY_SIZE(bpf_map_types));
1231	ops = bpf_map_types[map_type];
1232	if (!ops)
1233	return -EINVAL;
1234
1235	if (ops->map_alloc_check) {
1236	err = ops->map_alloc_check(attr);
1237	if (err)
1238	return err;
1239	}
1240	if (attr->map_ifindex)
1241	ops = &bpf_map_offload_ops;
1242	if (!ops->map_mem_usage)
1243	return -EINVAL;
1244
1245	if (token_flag) {
1246	token = bpf_token_get_from_fd(ufd: attr->map_token_fd);
1247	if (IS_ERR(ptr: token))
1248	return PTR_ERR(ptr: token);
1249
1250	/ if current token doesn't grant map creation permissions,*
1251	* then we can't use this token, so ignore it and rely on
1252	* system-wide capabilities checks
1253	*/
1254	if (!bpf_token_allow_cmd(token, cmd: BPF_MAP_CREATE) \|\|
1255	!bpf_token_allow_map_type(token, type: attr->map_type)) {
1256	bpf_token_put(token);
1257	token = NULL;
1258	}
1259	}
1260
1261	err = -EPERM;
1262
1263	/ Intent here is for unprivileged_bpf_disabled to block BPF map*
1264	* creation for unprivileged users; other actions depend
1265	* on fd availability and access to bpffs, so are dependent on
1266	* object creation success. Even with unprivileged BPF disabled,
1267	* capability checks are still carried out.
1268	*/
1269	if (sysctl_unprivileged_bpf_disabled && !bpf_token_capable(token, CAP_BPF))
1270	goto put_token;
1271
1272	/ check privileged map type permissions /
1273	switch (map_type) {
1274	case BPF_MAP_TYPE_ARRAY:
1275	case BPF_MAP_TYPE_PERCPU_ARRAY:
1276	case BPF_MAP_TYPE_PROG_ARRAY:
1277	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
1278	case BPF_MAP_TYPE_CGROUP_ARRAY:
1279	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
1280	case BPF_MAP_TYPE_HASH:
1281	case BPF_MAP_TYPE_PERCPU_HASH:
1282	case BPF_MAP_TYPE_HASH_OF_MAPS:
1283	case BPF_MAP_TYPE_RINGBUF:
1284	case BPF_MAP_TYPE_USER_RINGBUF:
1285	case BPF_MAP_TYPE_CGROUP_STORAGE:
1286	case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
1287	/ unprivileged /
1288	break;
1289	case BPF_MAP_TYPE_SK_STORAGE:
1290	case BPF_MAP_TYPE_INODE_STORAGE:
1291	case BPF_MAP_TYPE_TASK_STORAGE:
1292	case BPF_MAP_TYPE_CGRP_STORAGE:
1293	case BPF_MAP_TYPE_BLOOM_FILTER:
1294	case BPF_MAP_TYPE_LPM_TRIE:
1295	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
1296	case BPF_MAP_TYPE_STACK_TRACE:
1297	case BPF_MAP_TYPE_QUEUE:
1298	case BPF_MAP_TYPE_STACK:
1299	case BPF_MAP_TYPE_LRU_HASH:
1300	case BPF_MAP_TYPE_LRU_PERCPU_HASH:
1301	case BPF_MAP_TYPE_STRUCT_OPS:
1302	case BPF_MAP_TYPE_CPUMAP:
1303	case BPF_MAP_TYPE_ARENA:
1304	if (!bpf_token_capable(token, CAP_BPF))
1305	goto put_token;
1306	break;
1307	case BPF_MAP_TYPE_SOCKMAP:
1308	case BPF_MAP_TYPE_SOCKHASH:
1309	case BPF_MAP_TYPE_DEVMAP:
1310	case BPF_MAP_TYPE_DEVMAP_HASH:
1311	case BPF_MAP_TYPE_XSKMAP:
1312	if (!bpf_token_capable(token, CAP_NET_ADMIN))
1313	goto put_token;
1314	break;
1315	default:
1316	WARN(`1`, "unsupported map type %d", map_type);
1317	goto put_token;
1318	}
1319
1320	map = ops->map_alloc(attr);
1321	if (IS_ERR(ptr: map)) {
1322	err = PTR_ERR(ptr: map);
1323	goto put_token;
1324	}
1325	map->ops = ops;
1326	map->map_type = map_type;
1327
1328	err = bpf_obj_name_cpy(dst: map->name, src: attr->map_name,
1329	size: sizeof(attr->map_name));
1330	if (err < `0`)
1331	goto free_map;
1332
1333	atomic64_set(v: &map->refcnt, i: `1`);
1334	atomic64_set(v: &map->usercnt, i: `1`);
1335	mutex_init(&map->freeze_mutex);
1336	spin_lock_init(&map->owner.lock);
1337
1338	if (attr->btf_key_type_id \|\| attr->btf_value_type_id \|\|
1339	/ Even the map's value is a kernel's struct,*
1340	* the bpf_prog.o must have BTF to begin with
1341	* to figure out the corresponding kernel's
1342	* counter part. Thus, attr->btf_fd has
1343	* to be valid also.
1344	*/
1345	attr->btf_vmlinux_value_type_id) {
1346	struct btf *btf;
1347
1348	btf = btf_get_by_fd(fd: attr->btf_fd);
1349	if (IS_ERR(ptr: btf)) {
1350	err = PTR_ERR(ptr: btf);
1351	goto free_map;
1352	}
1353	if (btf_is_kernel(btf)) {
1354	btf_put(btf);
1355	err = -EACCES;
1356	goto free_map;
1357	}
1358	map->btf = btf;
1359
1360	if (attr->btf_value_type_id) {
1361	err = map_check_btf(map, token, btf, btf_key_id: attr->btf_key_type_id,
1362	btf_value_id: attr->btf_value_type_id);
1363	if (err)
1364	goto free_map;
1365	}
1366
1367	map->btf_key_type_id = attr->btf_key_type_id;
1368	map->btf_value_type_id = attr->btf_value_type_id;
1369	map->btf_vmlinux_value_type_id =
1370	attr->btf_vmlinux_value_type_id;
1371	}
1372
1373	err = security_bpf_map_create(map, attr, token);
1374	if (err)
1375	goto free_map_sec;
1376
1377	err = bpf_map_alloc_id(map);
1378	if (err)
1379	goto free_map_sec;
1380
1381	bpf_map_save_memcg(map);
1382	bpf_token_put(token);
1383
1384	err = bpf_map_new_fd(map, flags: f_flags);
1385	if (err < `0`) {
1386	/ failed to allocate fd.*
1387	* bpf_map_put_with_uref() is needed because the above
1388	* bpf_map_alloc_id() has published the map
1389	* to the userspace and the userspace may
1390	* have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
1391	*/
1392	bpf_map_put_with_uref(map);
1393	return err;
1394	}
1395
1396	return err;
1397
1398	free_map_sec:
1399	security_bpf_map_free(map);
1400	free_map:
1401	btf_put(btf: map->btf);
1402	map->ops->map_free(map);
1403	put_token:
1404	bpf_token_put(token);
1405	return err;
1406	}
1407
1408	/ if error is returned, fd is released.*
1409	* On success caller should complete fd access with matching fdput()
1410	*/
1411	struct bpf_map __bpf_map_get(struct* fd f)
1412	{
1413	if (!f.file)
1414	return ERR_PTR(error: -EBADF);
1415	if (f.file->f_op != &bpf_map_fops) {
1416	fdput(fd: f);
1417	return ERR_PTR(error: -EINVAL);
1418	}
1419
1420	return f.file->private_data;
1421	}
1422
1423	void bpf_map_inc(struct bpf_map *map)
1424	{
1425	atomic64_inc(v: &map->refcnt);
1426	}
1427	EXPORT_SYMBOL_GPL(bpf_map_inc);
1428
1429	void bpf_map_inc_with_uref(struct bpf_map *map)
1430	{
1431	atomic64_inc(v: &map->refcnt);
1432	atomic64_inc(v: &map->usercnt);
1433	}
1434	EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);
1435
1436	struct bpf_map *bpf_map_get(u32 ufd)
1437	{
1438	struct fd f = fdget(fd: ufd);
1439	struct bpf_map *map;
1440
1441	map = __bpf_map_get(f);
1442	if (IS_ERR(ptr: map))
1443	return map;
1444
1445	bpf_map_inc(map);
1446	fdput(fd: f);
1447
1448	return map;
1449	}
1450	EXPORT_SYMBOL(bpf_map_get);
1451
1452	struct bpf_map *bpf_map_get_with_uref(u32 ufd)
1453	{
1454	struct fd f = fdget(fd: ufd);
1455	struct bpf_map *map;
1456
1457	map = __bpf_map_get(f);
1458	if (IS_ERR(ptr: map))
1459	return map;
1460
1461	bpf_map_inc_with_uref(map);
1462	fdput(fd: f);
1463
1464	return map;
1465	}
1466
1467	/ map_idr_lock should have been held or the map should have been*
1468	* protected by rcu read lock.
1469	*/
1470	struct bpf_map __bpf_map_inc_not_zero(struct* bpf_map *map, bool uref)
1471	{
1472	int refold;
1473
1474	refold = atomic64_fetch_add_unless(v: &map->refcnt, a: `1`, u: `0`);
1475	if (!refold)
1476	return ERR_PTR(error: -ENOENT);
1477	if (uref)
1478	atomic64_inc(v: &map->usercnt);
1479
1480	return map;
1481	}
1482
1483	struct bpf_map bpf_map_inc_not_zero(struct* bpf_map *map)
1484	{
1485	spin_lock_bh(lock: &map_idr_lock);
1486	map = __bpf_map_inc_not_zero(map, uref: false);
1487	spin_unlock_bh(lock: &map_idr_lock);
1488
1489	return map;
1490	}
1491	EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero);
1492
1493	int __weak bpf_stackmap_copy(struct bpf_map map, void* key, void* *value)
1494	{
1495	return -ENOTSUPP;
1496	}
1497
1498	static void __bpf_copy_key(void* __user *ukey, u64 key_size)
1499	{
1500	if (key_size)
1501	return vmemdup_user(ukey, key_size);
1502
1503	if (ukey)
1504	return ERR_PTR(error: -EINVAL);
1505
1506	return NULL;
1507	}
1508
1509	static void *___bpf_copy_key(bpfptr_t ukey, u64 key_size)
1510	{
1511	if (key_size)
1512	return kvmemdup_bpfptr(src: ukey, len: key_size);
1513
1514	if (!bpfptr_is_null(bpfptr: ukey))
1515	return ERR_PTR(error: -EINVAL);
1516
1517	return NULL;
1518	}
1519
1520	/ last field in 'union bpf_attr' used by this command /
1521	#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags
1522
1523	static int map_lookup_elem(union bpf_attr *attr)
1524	{
1525	void __user *ukey = u64_to_user_ptr(attr->key);
1526	void __user *uvalue = u64_to_user_ptr(attr->value);
1527	int ufd = attr->map_fd;
1528	struct bpf_map *map;
1529	void key, value;
1530	u32 value_size;
1531	struct fd f;
1532	int err;
1533
1534	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
1535	return -EINVAL;
1536
1537	if (attr->flags & ~BPF_F_LOCK)
1538	return -EINVAL;
1539
1540	f = fdget(fd: ufd);
1541	map = __bpf_map_get(f);
1542	if (IS_ERR(ptr: map))
1543	return PTR_ERR(ptr: map);
1544	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1545	err = -EPERM;
1546	goto err_put;
1547	}
1548
1549	if ((attr->flags & BPF_F_LOCK) &&
1550	!btf_record_has_field(rec: map->record, type: BPF_SPIN_LOCK)) {
1551	err = -EINVAL;
1552	goto err_put;
1553	}
1554
1555	key = __bpf_copy_key(ukey, key_size: map->key_size);
1556	if (IS_ERR(ptr: key)) {
1557	err = PTR_ERR(ptr: key);
1558	goto err_put;
1559	}
1560
1561	value_size = bpf_map_value_size(map);
1562
1563	err = -ENOMEM;
1564	value = kvmalloc(size: value_size, GFP_USER \| __GFP_NOWARN);
1565	if (!value)
1566	goto free_key;
1567
1568	if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
1569	if (copy_from_user(to: value, from: uvalue, n: value_size))
1570	err = -EFAULT;
1571	else
1572	err = bpf_map_copy_value(map, key, value, flags: attr->flags);
1573	goto free_value;
1574	}
1575
1576	err = bpf_map_copy_value(map, key, value, flags: attr->flags);
1577	if (err)
1578	goto free_value;
1579
1580	err = -EFAULT;
1581	if (copy_to_user(to: uvalue, from: value, n: value_size) != `0`)
1582	goto free_value;
1583
1584	err = `0`;
1585
1586	free_value:
1587	kvfree(addr: value);
1588	free_key:
1589	kvfree(addr: key);
1590	err_put:
1591	fdput(fd: f);
1592	return err;
1593	}
1594
1595
1596	#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
1597
1598	static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
1599	{
1600	bpfptr_t ukey = make_bpfptr(addr: attr->key, is_kernel: uattr.is_kernel);
1601	bpfptr_t uvalue = make_bpfptr(addr: attr->value, is_kernel: uattr.is_kernel);
1602	int ufd = attr->map_fd;
1603	struct bpf_map *map;
1604	void key, value;
1605	u32 value_size;
1606	struct fd f;
1607	int err;
1608
1609	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
1610	return -EINVAL;
1611
1612	f = fdget(fd: ufd);
1613	map = __bpf_map_get(f);
1614	if (IS_ERR(ptr: map))
1615	return PTR_ERR(ptr: map);
1616	bpf_map_write_active_inc(map);
1617	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1618	err = -EPERM;
1619	goto err_put;
1620	}
1621
1622	if ((attr->flags & BPF_F_LOCK) &&
1623	!btf_record_has_field(rec: map->record, type: BPF_SPIN_LOCK)) {
1624	err = -EINVAL;
1625	goto err_put;
1626	}
1627
1628	key = ___bpf_copy_key(ukey, key_size: map->key_size);
1629	if (IS_ERR(ptr: key)) {
1630	err = PTR_ERR(ptr: key);
1631	goto err_put;
1632	}
1633
1634	value_size = bpf_map_value_size(map);
1635	value = kvmemdup_bpfptr(src: uvalue, len: value_size);
1636	if (IS_ERR(ptr: value)) {
1637	err = PTR_ERR(ptr: value);
1638	goto free_key;
1639	}
1640
1641	err = bpf_map_update_value(map, map_file: f.file, key, value, flags: attr->flags);
1642	if (!err)
1643	maybe_wait_bpf_programs(map);
1644
1645	kvfree(addr: value);
1646	free_key:
1647	kvfree(addr: key);
1648	err_put:
1649	bpf_map_write_active_dec(map);
1650	fdput(fd: f);
1651	return err;
1652	}
1653
1654	#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
1655
1656	static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
1657	{
1658	bpfptr_t ukey = make_bpfptr(addr: attr->key, is_kernel: uattr.is_kernel);
1659	int ufd = attr->map_fd;
1660	struct bpf_map *map;
1661	struct fd f;
1662	void *key;
1663	int err;
1664
1665	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
1666	return -EINVAL;
1667
1668	f = fdget(fd: ufd);
1669	map = __bpf_map_get(f);
1670	if (IS_ERR(ptr: map))
1671	return PTR_ERR(ptr: map);
1672	bpf_map_write_active_inc(map);
1673	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
1674	err = -EPERM;
1675	goto err_put;
1676	}
1677
1678	key = ___bpf_copy_key(ukey, key_size: map->key_size);
1679	if (IS_ERR(ptr: key)) {
1680	err = PTR_ERR(ptr: key);
1681	goto err_put;
1682	}
1683
1684	if (bpf_map_is_offloaded(map)) {
1685	err = bpf_map_offload_delete_elem(map, key);
1686	goto out;
1687	} else if (IS_FD_PROG_ARRAY(map) \|\|
1688	map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
1689	/ These maps require sleepable context /
1690	err = map->ops->map_delete_elem(map, key);
1691	goto out;
1692	}
1693
1694	bpf_disable_instrumentation();
1695	rcu_read_lock();
1696	err = map->ops->map_delete_elem(map, key);
1697	rcu_read_unlock();
1698	bpf_enable_instrumentation();
1699	if (!err)
1700	maybe_wait_bpf_programs(map);
1701	out:
1702	kvfree(addr: key);
1703	err_put:
1704	bpf_map_write_active_dec(map);
1705	fdput(fd: f);
1706	return err;
1707	}
1708
1709	/ last field in 'union bpf_attr' used by this command /
1710	#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
1711
1712	static int map_get_next_key(union bpf_attr *attr)
1713	{
1714	void __user *ukey = u64_to_user_ptr(attr->key);
1715	void __user *unext_key = u64_to_user_ptr(attr->next_key);
1716	int ufd = attr->map_fd;
1717	struct bpf_map *map;
1718	void key, next_key;
1719	struct fd f;
1720	int err;
1721
1722	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
1723	return -EINVAL;
1724
1725	f = fdget(fd: ufd);
1726	map = __bpf_map_get(f);
1727	if (IS_ERR(ptr: map))
1728	return PTR_ERR(ptr: map);
1729	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
1730	err = -EPERM;
1731	goto err_put;
1732	}
1733
1734	if (ukey) {
1735	key = __bpf_copy_key(ukey, key_size: map->key_size);
1736	if (IS_ERR(ptr: key)) {
1737	err = PTR_ERR(ptr: key);
1738	goto err_put;
1739	}
1740	} else {
1741	key = NULL;
1742	}
1743
1744	err = -ENOMEM;
1745	next_key = kvmalloc(size: map->key_size, GFP_USER);
1746	if (!next_key)
1747	goto free_key;
1748
1749	if (bpf_map_is_offloaded(map)) {
1750	err = bpf_map_offload_get_next_key(map, key, next_key);
1751	goto out;
1752	}
1753
1754	rcu_read_lock();
1755	err = map->ops->map_get_next_key(map, key, next_key);
1756	rcu_read_unlock();
1757	out:
1758	if (err)
1759	goto free_next_key;
1760
1761	err = -EFAULT;
1762	if (copy_to_user(to: unext_key, from: next_key, n: map->key_size) != `0`)
1763	goto free_next_key;
1764
1765	err = `0`;
1766
1767	free_next_key:
1768	kvfree(addr: next_key);
1769	free_key:
1770	kvfree(addr: key);
1771	err_put:
1772	fdput(fd: f);
1773	return err;
1774	}
1775
1776	int generic_map_delete_batch(struct bpf_map *map,
1777	const union bpf_attr *attr,
1778	union bpf_attr __user *uattr)
1779	{
1780	void __user *keys = u64_to_user_ptr(attr->batch.keys);
1781	u32 cp, max_count;
1782	int err = `0`;
1783	void *key;
1784
1785	if (attr->batch.elem_flags & ~BPF_F_LOCK)
1786	return -EINVAL;
1787
1788	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1789	!btf_record_has_field(rec: map->record, type: BPF_SPIN_LOCK)) {
1790	return -EINVAL;
1791	}
1792
1793	max_count = attr->batch.count;
1794	if (!max_count)
1795	return `0`;
1796
1797	if (put_user(`0`, &uattr->batch.count))
1798	return -EFAULT;
1799
1800	key = kvmalloc(size: map->key_size, GFP_USER \| __GFP_NOWARN);
1801	if (!key)
1802	return -ENOMEM;
1803
1804	for (cp = `0`; cp < max_count; cp++) {
1805	err = -EFAULT;
1806	if (copy_from_user(to: key, from: keys + cp * map->key_size,
1807	n: map->key_size))
1808	break;
1809
1810	if (bpf_map_is_offloaded(map)) {
1811	err = bpf_map_offload_delete_elem(map, key);
1812	break;
1813	}
1814
1815	bpf_disable_instrumentation();
1816	rcu_read_lock();
1817	err = map->ops->map_delete_elem(map, key);
1818	rcu_read_unlock();
1819	bpf_enable_instrumentation();
1820	if (err)
1821	break;
1822	cond_resched();
1823	}
1824	if (copy_to_user(to: &uattr->batch.count, from: &cp, n: sizeof(cp)))
1825	err = -EFAULT;
1826
1827	kvfree(addr: key);
1828
1829	return err;
1830	}
1831
1832	int generic_map_update_batch(struct bpf_map map, struct* file *map_file,
1833	const union bpf_attr *attr,
1834	union bpf_attr __user *uattr)
1835	{
1836	void __user *values = u64_to_user_ptr(attr->batch.values);
1837	void __user *keys = u64_to_user_ptr(attr->batch.keys);
1838	u32 value_size, cp, max_count;
1839	void key, value;
1840	int err = `0`;
1841
1842	if (attr->batch.elem_flags & ~BPF_F_LOCK)
1843	return -EINVAL;
1844
1845	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1846	!btf_record_has_field(rec: map->record, type: BPF_SPIN_LOCK)) {
1847	return -EINVAL;
1848	}
1849
1850	value_size = bpf_map_value_size(map);
1851
1852	max_count = attr->batch.count;
1853	if (!max_count)
1854	return `0`;
1855
1856	if (put_user(`0`, &uattr->batch.count))
1857	return -EFAULT;
1858
1859	key = kvmalloc(size: map->key_size, GFP_USER \| __GFP_NOWARN);
1860	if (!key)
1861	return -ENOMEM;
1862
1863	value = kvmalloc(size: value_size, GFP_USER \| __GFP_NOWARN);
1864	if (!value) {
1865	kvfree(addr: key);
1866	return -ENOMEM;
1867	}
1868
1869	for (cp = `0`; cp < max_count; cp++) {
1870	err = -EFAULT;
1871	if (copy_from_user(to: key, from: keys + cp * map->key_size,
1872	n: map->key_size) \|\|
1873	copy_from_user(to: value, from: values + cp * value_size, n: value_size))
1874	break;
1875
1876	err = bpf_map_update_value(map, map_file, key, value,
1877	flags: attr->batch.elem_flags);
1878
1879	if (err)
1880	break;
1881	cond_resched();
1882	}
1883
1884	if (copy_to_user(to: &uattr->batch.count, from: &cp, n: sizeof(cp)))
1885	err = -EFAULT;
1886
1887	kvfree(addr: value);
1888	kvfree(addr: key);
1889
1890	return err;
1891	}
1892
1893	#define MAP_LOOKUP_RETRIES 3
1894
1895	int generic_map_lookup_batch(struct bpf_map *map,
1896	const union bpf_attr *attr,
1897	union bpf_attr __user *uattr)
1898	{
1899	void __user *uobatch = u64_to_user_ptr(attr->batch.out_batch);
1900	void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
1901	void __user *values = u64_to_user_ptr(attr->batch.values);
1902	void __user *keys = u64_to_user_ptr(attr->batch.keys);
1903	void buf, buf_prevkey, prev_key, key, *value;
1904	int err, retry = MAP_LOOKUP_RETRIES;
1905	u32 value_size, cp, max_count;
1906
1907	if (attr->batch.elem_flags & ~BPF_F_LOCK)
1908	return -EINVAL;
1909
1910	if ((attr->batch.elem_flags & BPF_F_LOCK) &&
1911	!btf_record_has_field(rec: map->record, type: BPF_SPIN_LOCK))
1912	return -EINVAL;
1913
1914	value_size = bpf_map_value_size(map);
1915
1916	max_count = attr->batch.count;
1917	if (!max_count)
1918	return `0`;
1919
1920	if (put_user(`0`, &uattr->batch.count))
1921	return -EFAULT;
1922
1923	buf_prevkey = kvmalloc(size: map->key_size, GFP_USER \| __GFP_NOWARN);
1924	if (!buf_prevkey)
1925	return -ENOMEM;
1926
1927	buf = kvmalloc(size: map->key_size + value_size, GFP_USER \| __GFP_NOWARN);
1928	if (!buf) {
1929	kvfree(addr: buf_prevkey);
1930	return -ENOMEM;
1931	}
1932
1933	err = -EFAULT;
1934	prev_key = NULL;
1935	if (ubatch && copy_from_user(to: buf_prevkey, from: ubatch, n: map->key_size))
1936	goto free_buf;
1937	key = buf;
1938	value = key + map->key_size;
1939	if (ubatch)
1940	prev_key = buf_prevkey;
1941
1942	for (cp = `0`; cp < max_count;) {
1943	rcu_read_lock();
1944	err = map->ops->map_get_next_key(map, prev_key, key);
1945	rcu_read_unlock();
1946	if (err)
1947	break;
1948	err = bpf_map_copy_value(map, key, value,
1949	flags: attr->batch.elem_flags);
1950
1951	if (err == -ENOENT) {
1952	if (retry) {
1953	retry--;
1954	continue;
1955	}
1956	err = -EINTR;
1957	break;
1958	}
1959
1960	if (err)
1961	goto free_buf;
1962
1963	if (copy_to_user(to: keys + cp * map->key_size, from: key,
1964	n: map->key_size)) {
1965	err = -EFAULT;
1966	goto free_buf;
1967	}
1968	if (copy_to_user(to: values + cp * value_size, from: value, n: value_size)) {
1969	err = -EFAULT;
1970	goto free_buf;
1971	}
1972
1973	if (!prev_key)
1974	prev_key = buf_prevkey;
1975
1976	swap(prev_key, key);
1977	retry = MAP_LOOKUP_RETRIES;
1978	cp++;
1979	cond_resched();
1980	}
1981
1982	if (err == -EFAULT)
1983	goto free_buf;
1984
1985	if ((copy_to_user(to: &uattr->batch.count, from: &cp, n: sizeof(cp)) \|\|
1986	(cp && copy_to_user(to: uobatch, from: prev_key, n: map->key_size))))
1987	err = -EFAULT;
1988
1989	free_buf:
1990	kvfree(addr: buf_prevkey);
1991	kvfree(addr: buf);
1992	return err;
1993	}
1994
1995	#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD flags
1996
1997	static int map_lookup_and_delete_elem(union bpf_attr *attr)
1998	{
1999	void __user *ukey = u64_to_user_ptr(attr->key);
2000	void __user *uvalue = u64_to_user_ptr(attr->value);
2001	int ufd = attr->map_fd;
2002	struct bpf_map *map;
2003	void key, value;
2004	u32 value_size;
2005	struct fd f;
2006	int err;
2007
2008	if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM))
2009	return -EINVAL;
2010
2011	if (attr->flags & ~BPF_F_LOCK)
2012	return -EINVAL;
2013
2014	f = fdget(fd: ufd);
2015	map = __bpf_map_get(f);
2016	if (IS_ERR(ptr: map))
2017	return PTR_ERR(ptr: map);
2018	bpf_map_write_active_inc(map);
2019	if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) \|\|
2020	!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
2021	err = -EPERM;
2022	goto err_put;
2023	}
2024
2025	if (attr->flags &&
2026	(map->map_type == BPF_MAP_TYPE_QUEUE \|\|
2027	map->map_type == BPF_MAP_TYPE_STACK)) {
2028	err = -EINVAL;
2029	goto err_put;
2030	}
2031
2032	if ((attr->flags & BPF_F_LOCK) &&
2033	!btf_record_has_field(rec: map->record, type: BPF_SPIN_LOCK)) {
2034	err = -EINVAL;
2035	goto err_put;
2036	}
2037
2038	key = __bpf_copy_key(ukey, key_size: map->key_size);
2039	if (IS_ERR(ptr: key)) {
2040	err = PTR_ERR(ptr: key);
2041	goto err_put;
2042	}
2043
2044	value_size = bpf_map_value_size(map);
2045
2046	err = -ENOMEM;
2047	value = kvmalloc(size: value_size, GFP_USER \| __GFP_NOWARN);
2048	if (!value)
2049	goto free_key;
2050
2051	err = -ENOTSUPP;
2052	if (map->map_type == BPF_MAP_TYPE_QUEUE \|\|
2053	map->map_type == BPF_MAP_TYPE_STACK) {
2054	err = map->ops->map_pop_elem(map, value);
2055	} else if (map->map_type == BPF_MAP_TYPE_HASH \|\|
2056	map->map_type == BPF_MAP_TYPE_PERCPU_HASH \|\|
2057	map->map_type == BPF_MAP_TYPE_LRU_HASH \|\|
2058	map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
2059	if (!bpf_map_is_offloaded(map)) {
2060	bpf_disable_instrumentation();
2061	rcu_read_lock();
2062	err = map->ops->map_lookup_and_delete_elem(map, key, value, attr->flags);
2063	rcu_read_unlock();
2064	bpf_enable_instrumentation();
2065	}
2066	}
2067
2068	if (err)
2069	goto free_value;
2070
2071	if (copy_to_user(to: uvalue, from: value, n: value_size) != `0`) {
2072	err = -EFAULT;
2073	goto free_value;
2074	}
2075
2076	err = `0`;
2077
2078	free_value:
2079	kvfree(addr: value);
2080	free_key:
2081	kvfree(addr: key);
2082	err_put:
2083	bpf_map_write_active_dec(map);
2084	fdput(fd: f);
2085	return err;
2086	}
2087
2088	#define BPF_MAP_FREEZE_LAST_FIELD map_fd
2089
2090	static int map_freeze(const union bpf_attr *attr)
2091	{
2092	int err = `0`, ufd = attr->map_fd;
2093	struct bpf_map *map;
2094	struct fd f;
2095
2096	if (CHECK_ATTR(BPF_MAP_FREEZE))
2097	return -EINVAL;
2098
2099	f = fdget(fd: ufd);
2100	map = __bpf_map_get(f);
2101	if (IS_ERR(ptr: map))
2102	return PTR_ERR(ptr: map);
2103
2104	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS \|\| !IS_ERR_OR_NULL(ptr: map->record)) {
2105	fdput(fd: f);
2106	return -ENOTSUPP;
2107	}
2108
2109	if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
2110	fdput(fd: f);
2111	return -EPERM;
2112	}
2113
2114	mutex_lock(&map->freeze_mutex);
2115	if (bpf_map_write_active(map)) {
2116	err = -EBUSY;
2117	goto err_put;
2118	}
2119	if (READ_ONCE(map->frozen)) {
2120	err = -EBUSY;
2121	goto err_put;
2122	}
2123
2124	WRITE_ONCE(map->frozen, true);
2125	err_put:
2126	mutex_unlock(lock: &map->freeze_mutex);
2127	fdput(fd: f);
2128	return err;
2129	}
2130
2131	static const struct bpf_prog_ops * const bpf_prog_types[] = {
2132	#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
2133	[_id] = & _name ## _prog_ops,
2134	#define BPF_MAP_TYPE(_id, _ops)
2135	#define BPF_LINK_TYPE(_id, _name)
2136	#include <linux/bpf_types.h>
2137	#undef BPF_PROG_TYPE
2138	#undef BPF_MAP_TYPE
2139	#undef BPF_LINK_TYPE
2140	};
2141
2142	static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
2143	{
2144	const struct bpf_prog_ops *ops;
2145
2146	if (type >= ARRAY_SIZE(bpf_prog_types))
2147	return -EINVAL;
2148	type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
2149	ops = bpf_prog_types[type];
2150	if (!ops)
2151	return -EINVAL;
2152
2153	if (!bpf_prog_is_offloaded(aux: prog->aux))
2154	prog->aux->ops = ops;
2155	else
2156	prog->aux->ops = &bpf_offload_prog_ops;
2157	prog->type = type;
2158	return `0`;
2159	}
2160
/* Operations reported by bpf_audit_prog(); index into bpf_audit_str[]. */
enum bpf_audit {
	BPF_AUDIT_LOAD,
	BPF_AUDIT_UNLOAD,
	BPF_AUDIT_MAX,		/* number of operations, not an operation */
};
2166
2167	static const char * const bpf_audit_str[BPF_AUDIT_MAX] = {
2168	[BPF_AUDIT_LOAD] = "LOAD",
2169	[BPF_AUDIT_UNLOAD] = "UNLOAD",
2170	};
2171
2172	static void bpf_audit_prog(const struct bpf_prog prog, unsigned* int op)
2173	{
2174	struct audit_context *ctx = NULL;
2175	struct audit_buffer *ab;
2176
2177	if (WARN_ON_ONCE(op >= BPF_AUDIT_MAX))
2178	return;
2179	if (audit_enabled == AUDIT_OFF)
2180	return;
2181	if (!in_irq() && !irqs_disabled())
2182	ctx = audit_context();
2183	ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
2184	if (unlikely(!ab))
2185	return;
2186	audit_log_format(ab, fmt: "prog-id=%u op=%s",
2187	prog->aux->id, bpf_audit_str[op]);
2188	audit_log_end(ab);
2189	}
2190
2191	static int bpf_prog_alloc_id(struct bpf_prog *prog)
2192	{
2193	int id;
2194
2195	idr_preload(GFP_KERNEL);
2196	spin_lock_bh(lock: &prog_idr_lock);
2197	id = idr_alloc_cyclic(&prog_idr, ptr: prog, start: `1`, INT_MAX, GFP_ATOMIC);
2198	if (id > `0`)
2199	prog->aux->id = id;
2200	spin_unlock_bh(lock: &prog_idr_lock);
2201	idr_preload_end();
2202
2203	/ id is in [1, INT_MAX) /
2204	if (WARN_ON_ONCE(!id))
2205	return -ENOSPC;
2206
2207	return id > `0` ? `0` : id;
2208	}
2209
2210	void bpf_prog_free_id(struct bpf_prog *prog)
2211	{
2212	unsigned long flags;
2213
2214	/ cBPF to eBPF migrations are currently not in the idr store.*
2215	* Offloaded programs are removed from the store when their device
2216	* disappears - even if someone grabs an fd to them they are unusable,
2217	* simply waiting for refcnt to drop to be freed.
2218	*/
2219	if (!prog->aux->id)
2220	return;
2221
2222	spin_lock_irqsave(&prog_idr_lock, flags);
2223	idr_remove(&prog_idr, id: prog->aux->id);
2224	prog->aux->id = `0`;
2225	spin_unlock_irqrestore(lock: &prog_idr_lock, flags);
2226	}
2227
2228	static void __bpf_prog_put_rcu(struct rcu_head *rcu)
2229	{
2230	struct bpf_prog_aux aux = container_of(rcu, struct* bpf_prog_aux, rcu);
2231
2232	kvfree(addr: aux->func_info);
2233	kfree(objp: aux->func_info_aux);
2234	free_uid(aux->user);
2235	security_bpf_prog_free(prog: aux->prog);
2236	bpf_prog_free(fp: aux->prog);
2237	}
2238
2239	static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
2240	{
2241	bpf_prog_kallsyms_del_all(fp: prog);
2242	btf_put(btf: prog->aux->btf);
2243	module_put(module: prog->aux->mod);
2244	kvfree(addr: prog->aux->jited_linfo);
2245	kvfree(addr: prog->aux->linfo);
2246	kfree(objp: prog->aux->kfunc_tab);
2247	if (prog->aux->attach_btf)
2248	btf_put(btf: prog->aux->attach_btf);
2249
2250	if (deferred) {
2251	if (prog->sleepable)
2252	call_rcu_tasks_trace(rhp: &prog->aux->rcu, func: __bpf_prog_put_rcu);
2253	else
2254	call_rcu(head: &prog->aux->rcu, func: __bpf_prog_put_rcu);
2255	} else {
2256	__bpf_prog_put_rcu(rcu: &prog->aux->rcu);
2257	}
2258	}
2259
2260	static void bpf_prog_put_deferred(struct work_struct *work)
2261	{
2262	struct bpf_prog_aux *aux;
2263	struct bpf_prog *prog;
2264
2265	aux = container_of(work, struct bpf_prog_aux, work);
2266	prog = aux->prog;
2267	perf_event_bpf_event(prog, type: PERF_BPF_EVENT_PROG_UNLOAD, flags: `0`);
2268	bpf_audit_prog(prog, op: BPF_AUDIT_UNLOAD);
2269	bpf_prog_free_id(prog);
2270	__bpf_prog_put_noref(prog, deferred: true);
2271	}
2272
2273	static void __bpf_prog_put(struct bpf_prog *prog)
2274	{
2275	struct bpf_prog_aux *aux = prog->aux;
2276
2277	if (atomic64_dec_and_test(v: &aux->refcnt)) {
2278	if (in_irq() \|\| irqs_disabled()) {
2279	INIT_WORK(&aux->work, bpf_prog_put_deferred);
2280	schedule_work(work: &aux->work);
2281	} else {
2282	bpf_prog_put_deferred(work: &aux->work);
2283	}
2284	}
2285	}
2286
/* Exported entry point for dropping a program reference; see __bpf_prog_put(). */
void bpf_prog_put(struct bpf_prog *prog)
{
	__bpf_prog_put(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
2292
2293	static int bpf_prog_release(struct inode inode, struct* file *filp)
2294	{
2295	struct bpf_prog *prog = filp->private_data;
2296
2297	bpf_prog_put(prog);
2298	return `0`;
2299	}
2300
2301	struct bpf_prog_kstats {
2302	u64 nsecs;
2303	u64 cnt;
2304	u64 misses;
2305	};
2306
2307	void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog)
2308	{
2309	struct bpf_prog_stats *stats;
2310	unsigned int flags;
2311
2312	stats = this_cpu_ptr(prog->stats);
2313	flags = u64_stats_update_begin_irqsave(syncp: &stats->syncp);
2314	u64_stats_inc(p: &stats->misses);
2315	u64_stats_update_end_irqrestore(syncp: &stats->syncp, flags);
2316	}
2317
2318	static void bpf_prog_get_stats(const struct bpf_prog *prog,
2319	struct bpf_prog_kstats *stats)
2320	{
2321	u64 nsecs = `0`, cnt = `0`, misses = `0`;
2322	int cpu;
2323
2324	for_each_possible_cpu(cpu) {
2325	const struct bpf_prog_stats *st;
2326	unsigned int start;
2327	u64 tnsecs, tcnt, tmisses;
2328
2329	st = per_cpu_ptr(prog->stats, cpu);
2330	do {
2331	start = u64_stats_fetch_begin(syncp: &st->syncp);
2332	tnsecs = u64_stats_read(p: &st->nsecs);
2333	tcnt = u64_stats_read(p: &st->cnt);
2334	tmisses = u64_stats_read(p: &st->misses);
2335	} while (u64_stats_fetch_retry(syncp: &st->syncp, start));
2336	nsecs += tnsecs;
2337	cnt += tcnt;
2338	misses += tmisses;
2339	}
2340	stats->nsecs = nsecs;
2341	stats->cnt = cnt;
2342	stats->misses = misses;
2343	}
2344
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler of bpf_prog_fops: prints type, JIT state,
 * hex-encoded tag, memlock size, ID, aggregated run statistics and
 * verified instruction count of the program behind @filp into @m.
 */
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_prog *prog = filp->private_data;
	/* two hex digits per tag byte plus the terminating NUL */
	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
	struct bpf_prog_kstats stats;

	bpf_prog_get_stats(prog, &stats);
	bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
	seq_printf(m,
		   "prog_type:\t%u\n"
		   "prog_jited:\t%u\n"
		   "prog_tag:\t%s\n"
		   "memlock:\t%llu\n"
		   "prog_id:\t%u\n"
		   "run_time_ns:\t%llu\n"
		   "run_cnt:\t%llu\n"
		   "recursion_misses:\t%llu\n"
		   "verified_insns:\t%u\n",
		   prog->type,
		   prog->jited,
		   prog_tag,
		   prog->pages * 1ULL << PAGE_SHIFT,
		   prog->aux->id,
		   stats.nsecs,
		   stats.cnt,
		   stats.misses,
		   prog->aux->verified_insns);
}
#endif
2375
2376	const struct file_operations bpf_prog_fops = {
2377	#ifdef CONFIG_PROC_FS
2378	.show_fdinfo = bpf_prog_show_fdinfo,
2379	#endif
2380	.release = bpf_prog_release,
2381	.read = bpf_dummy_read,
2382	.write = bpf_dummy_write,
2383	};
2384
2385	int bpf_prog_new_fd(struct bpf_prog *prog)
2386	{
2387	int ret;
2388
2389	ret = security_bpf_prog(prog);
2390	if (ret < `0`)
2391	return ret;
2392
2393	return anon_inode_getfd(name: "bpf-prog", fops: &bpf_prog_fops, priv: prog,
2394	O_RDWR \| O_CLOEXEC);
2395	}
2396
2397	static struct bpf_prog ____bpf_prog_get(struct* fd f)
2398	{
2399	if (!f.file)
2400	return ERR_PTR(error: -EBADF);
2401	if (f.file->f_op != &bpf_prog_fops) {
2402	fdput(fd: f);
2403	return ERR_PTR(error: -EINVAL);
2404	}
2405
2406	return f.file->private_data;
2407	}
2408
2409	void bpf_prog_add(struct bpf_prog prog, int* i)
2410	{
2411	atomic64_add(i, v: &prog->aux->refcnt);
2412	}
2413	EXPORT_SYMBOL_GPL(bpf_prog_add);
2414
2415	void bpf_prog_sub(struct bpf_prog prog, int* i)
2416	{
2417	/ Only to be used for undoing previous bpf_prog_add() in some*
2418	* error path. We still know that another entity in our call
2419	* path holds a reference to the program, thus atomic_sub() can
2420	* be safely used in such cases!
2421	*/
2422	WARN_ON(atomic64_sub_return(i, &prog->aux->refcnt) == `0`);
2423	}
2424	EXPORT_SYMBOL_GPL(bpf_prog_sub);
2425
2426	void bpf_prog_inc(struct bpf_prog *prog)
2427	{
2428	atomic64_inc(v: &prog->aux->refcnt);
2429	}
2430	EXPORT_SYMBOL_GPL(bpf_prog_inc);
2431
2432	/ prog_idr_lock should have been held /
2433	struct bpf_prog bpf_prog_inc_not_zero(struct* bpf_prog *prog)
2434	{
2435	int refold;
2436
2437	refold = atomic64_fetch_add_unless(v: &prog->aux->refcnt, a: `1`, u: `0`);
2438
2439	if (!refold)
2440	return ERR_PTR(error: -ENOENT);
2441
2442	return prog;
2443	}
2444	EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
2445
2446	bool bpf_prog_get_ok(struct bpf_prog *prog,
2447	enum bpf_prog_type *attach_type, bool attach_drv)
2448	{
2449	/ not an attachment, just a refcount inc, always allow /
2450	if (!attach_type)
2451	return true;
2452
2453	if (prog->type != *attach_type)
2454	return false;
2455	if (bpf_prog_is_offloaded(aux: prog->aux) && !attach_drv)
2456	return false;
2457
2458	return true;
2459	}
2460
2461	static struct bpf_prog __bpf_prog_get(u32 ufd, enum* bpf_prog_type *attach_type,
2462	bool attach_drv)
2463	{
2464	struct fd f = fdget(fd: ufd);
2465	struct bpf_prog *prog;
2466
2467	prog = ____bpf_prog_get(f);
2468	if (IS_ERR(ptr: prog))
2469	return prog;
2470	if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
2471	prog = ERR_PTR(error: -EINVAL);
2472	goto out;
2473	}
2474
2475	bpf_prog_inc(prog);
2476	out:
2477	fdput(fd: f);
2478	return prog;
2479	}
2480
2481	struct bpf_prog *bpf_prog_get(u32 ufd)
2482	{
2483	return __bpf_prog_get(ufd, NULL, attach_drv: false);
2484	}
2485
2486	struct bpf_prog bpf_prog_get_type_dev(u32 ufd, enum* bpf_prog_type type,
2487	bool attach_drv)
2488	{
2489	return __bpf_prog_get(ufd, attach_type: &type, attach_drv);
2490	}
2491	EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
2492
2493	/ Initially all BPF programs could be loaded w/o specifying*
2494	* expected_attach_type. Later for some of them specifying expected_attach_type
2495	* at load time became required so that program could be validated properly.
2496	* Programs of types that are allowed to be loaded both w/ and w/o (for
2497	* backward compatibility) expected_attach_type, should have the default attach
2498	* type assigned to expected_attach_type for the latter case, so that it can be
2499	* validated later at attach time.
2500	*
2501	* bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if
2502	* prog type requires it but has some attach types that have to be backward
2503	* compatible.
2504	*/
2505	static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr)
2506	{
2507	switch (attr->prog_type) {
2508	case BPF_PROG_TYPE_CGROUP_SOCK:
2509	/ Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't*
2510	* exist so checking for non-zero is the way to go here.
2511	*/
2512	if (!attr->expected_attach_type)
2513	attr->expected_attach_type =
2514	BPF_CGROUP_INET_SOCK_CREATE;
2515	break;
2516	case BPF_PROG_TYPE_SK_REUSEPORT:
2517	if (!attr->expected_attach_type)
2518	attr->expected_attach_type =
2519	BPF_SK_REUSEPORT_SELECT;
2520	break;
2521	}
2522	}
2523
2524	static int
2525	bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
2526	enum bpf_attach_type expected_attach_type,
2527	struct btf *attach_btf, u32 btf_id,
2528	struct bpf_prog *dst_prog)
2529	{
2530	if (btf_id) {
2531	if (btf_id > BTF_MAX_TYPE)
2532	return -EINVAL;
2533
2534	if (!attach_btf && !dst_prog)
2535	return -EINVAL;
2536
2537	switch (prog_type) {
2538	case BPF_PROG_TYPE_TRACING:
2539	case BPF_PROG_TYPE_LSM:
2540	case BPF_PROG_TYPE_STRUCT_OPS:
2541	case BPF_PROG_TYPE_EXT:
2542	break;
2543	default:
2544	return -EINVAL;
2545	}
2546	}
2547
2548	if (attach_btf && (!btf_id \|\| dst_prog))
2549	return -EINVAL;
2550
2551	if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING &&
2552	prog_type != BPF_PROG_TYPE_EXT)
2553	return -EINVAL;
2554
2555	switch (prog_type) {
2556	case BPF_PROG_TYPE_CGROUP_SOCK:
2557	switch (expected_attach_type) {
2558	case BPF_CGROUP_INET_SOCK_CREATE:
2559	case BPF_CGROUP_INET_SOCK_RELEASE:
2560	case BPF_CGROUP_INET4_POST_BIND:
2561	case BPF_CGROUP_INET6_POST_BIND:
2562	return `0`;
2563	default:
2564	return -EINVAL;
2565	}
2566	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2567	switch (expected_attach_type) {
2568	case BPF_CGROUP_INET4_BIND:
2569	case BPF_CGROUP_INET6_BIND:
2570	case BPF_CGROUP_INET4_CONNECT:
2571	case BPF_CGROUP_INET6_CONNECT:
2572	case BPF_CGROUP_UNIX_CONNECT:
2573	case BPF_CGROUP_INET4_GETPEERNAME:
2574	case BPF_CGROUP_INET6_GETPEERNAME:
2575	case BPF_CGROUP_UNIX_GETPEERNAME:
2576	case BPF_CGROUP_INET4_GETSOCKNAME:
2577	case BPF_CGROUP_INET6_GETSOCKNAME:
2578	case BPF_CGROUP_UNIX_GETSOCKNAME:
2579	case BPF_CGROUP_UDP4_SENDMSG:
2580	case BPF_CGROUP_UDP6_SENDMSG:
2581	case BPF_CGROUP_UNIX_SENDMSG:
2582	case BPF_CGROUP_UDP4_RECVMSG:
2583	case BPF_CGROUP_UDP6_RECVMSG:
2584	case BPF_CGROUP_UNIX_RECVMSG:
2585	return `0`;
2586	default:
2587	return -EINVAL;
2588	}
2589	case BPF_PROG_TYPE_CGROUP_SKB:
2590	switch (expected_attach_type) {
2591	case BPF_CGROUP_INET_INGRESS:
2592	case BPF_CGROUP_INET_EGRESS:
2593	return `0`;
2594	default:
2595	return -EINVAL;
2596	}
2597	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2598	switch (expected_attach_type) {
2599	case BPF_CGROUP_SETSOCKOPT:
2600	case BPF_CGROUP_GETSOCKOPT:
2601	return `0`;
2602	default:
2603	return -EINVAL;
2604	}
2605	case BPF_PROG_TYPE_SK_LOOKUP:
2606	if (expected_attach_type == BPF_SK_LOOKUP)
2607	return `0`;
2608	return -EINVAL;
2609	case BPF_PROG_TYPE_SK_REUSEPORT:
2610	switch (expected_attach_type) {
2611	case BPF_SK_REUSEPORT_SELECT:
2612	case BPF_SK_REUSEPORT_SELECT_OR_MIGRATE:
2613	return `0`;
2614	default:
2615	return -EINVAL;
2616	}
2617	case BPF_PROG_TYPE_NETFILTER:
2618	if (expected_attach_type == BPF_NETFILTER)
2619	return `0`;
2620	return -EINVAL;
2621	case BPF_PROG_TYPE_SYSCALL:
2622	case BPF_PROG_TYPE_EXT:
2623	if (expected_attach_type)
2624	return -EINVAL;
2625	fallthrough;
2626	default:
2627	return `0`;
2628	}
2629	}
2630
2631	static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
2632	{
2633	switch (prog_type) {
2634	case BPF_PROG_TYPE_SCHED_CLS:
2635	case BPF_PROG_TYPE_SCHED_ACT:
2636	case BPF_PROG_TYPE_XDP:
2637	case BPF_PROG_TYPE_LWT_IN:
2638	case BPF_PROG_TYPE_LWT_OUT:
2639	case BPF_PROG_TYPE_LWT_XMIT:
2640	case BPF_PROG_TYPE_LWT_SEG6LOCAL:
2641	case BPF_PROG_TYPE_SK_SKB:
2642	case BPF_PROG_TYPE_SK_MSG:
2643	case BPF_PROG_TYPE_FLOW_DISSECTOR:
2644	case BPF_PROG_TYPE_CGROUP_DEVICE:
2645	case BPF_PROG_TYPE_CGROUP_SOCK:
2646	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
2647	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
2648	case BPF_PROG_TYPE_CGROUP_SYSCTL:
2649	case BPF_PROG_TYPE_SOCK_OPS:
2650	case BPF_PROG_TYPE_EXT: / extends any prog /
2651	case BPF_PROG_TYPE_NETFILTER:
2652	return true;
2653	case BPF_PROG_TYPE_CGROUP_SKB:
2654	/ always unpriv /
2655	case BPF_PROG_TYPE_SK_REUSEPORT:
2656	/ equivalent to SOCKET_FILTER. need CAP_BPF only /
2657	default:
2658	return false;
2659	}
2660	}
2661
2662	static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
2663	{
2664	switch (prog_type) {
2665	case BPF_PROG_TYPE_KPROBE:
2666	case BPF_PROG_TYPE_TRACEPOINT:
2667	case BPF_PROG_TYPE_PERF_EVENT:
2668	case BPF_PROG_TYPE_RAW_TRACEPOINT:
2669	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
2670	case BPF_PROG_TYPE_TRACING:
2671	case BPF_PROG_TYPE_LSM:
2672	case BPF_PROG_TYPE_STRUCT_OPS: / has access to struct sock /
2673	case BPF_PROG_TYPE_EXT: / extends any prog /
2674	return true;
2675	default:
2676	return false;
2677	}
2678	}
2679
2680	/ last field in 'union bpf_attr' used by this command /
2681	#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
2682
2683	static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
2684	{
2685	enum bpf_prog_type type = attr->prog_type;
2686	struct bpf_prog prog, dst_prog = NULL;
2687	struct btf *attach_btf = NULL;
2688	struct bpf_token *token = NULL;
2689	bool bpf_cap;
2690	int err;
2691	char license[`128`];
2692
2693	if (CHECK_ATTR(BPF_PROG_LOAD))
2694	return -EINVAL;
2695
2696	if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT \|
2697	BPF_F_ANY_ALIGNMENT \|
2698	BPF_F_TEST_STATE_FREQ \|
2699	BPF_F_SLEEPABLE \|
2700	BPF_F_TEST_RND_HI32 \|
2701	BPF_F_XDP_HAS_FRAGS \|
2702	BPF_F_XDP_DEV_BOUND_ONLY \|
2703	BPF_F_TEST_REG_INVARIANTS \|
2704	BPF_F_TOKEN_FD))
2705	return -EINVAL;
2706
2707	bpf_prog_load_fixup_attach_type(attr);
2708
2709	if (attr->prog_flags & BPF_F_TOKEN_FD) {
2710	token = bpf_token_get_from_fd(ufd: attr->prog_token_fd);
2711	if (IS_ERR(ptr: token))
2712	return PTR_ERR(ptr: token);
2713	/ if current token doesn't grant prog loading permissions,*
2714	* then we can't use this token, so ignore it and rely on
2715	* system-wide capabilities checks
2716	*/
2717	if (!bpf_token_allow_cmd(token, cmd: BPF_PROG_LOAD) \|\|
2718	!bpf_token_allow_prog_type(token, prog_type: attr->prog_type,
2719	attach_type: attr->expected_attach_type)) {
2720	bpf_token_put(token);
2721	token = NULL;
2722	}
2723	}
2724
2725	bpf_cap = bpf_token_capable(token, CAP_BPF);
2726	err = -EPERM;
2727
2728	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
2729	(attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
2730	!bpf_cap)
2731	goto put_token;
2732
2733	/ Intent here is for unprivileged_bpf_disabled to block BPF program*
2734	* creation for unprivileged users; other actions depend
2735	* on fd availability and access to bpffs, so are dependent on
2736	* object creation success. Even with unprivileged BPF disabled,
2737	* capability checks are still carried out for these
2738	* and other operations.
2739	*/
2740	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
2741	goto put_token;
2742
2743	if (attr->insn_cnt == `0` \|\|
2744	attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
2745	err = -E2BIG;
2746	goto put_token;
2747	}
2748	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
2749	type != BPF_PROG_TYPE_CGROUP_SKB &&
2750	!bpf_cap)
2751	goto put_token;
2752
2753	if (is_net_admin_prog_type(prog_type: type) && !bpf_token_capable(token, CAP_NET_ADMIN))
2754	goto put_token;
2755	if (is_perfmon_prog_type(prog_type: type) && !bpf_token_capable(token, CAP_PERFMON))
2756	goto put_token;
2757
2758	/ attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog*
2759	* or btf, we need to check which one it is
2760	*/
2761	if (attr->attach_prog_fd) {
2762	dst_prog = bpf_prog_get(ufd: attr->attach_prog_fd);
2763	if (IS_ERR(ptr: dst_prog)) {
2764	dst_prog = NULL;
2765	attach_btf = btf_get_by_fd(fd: attr->attach_btf_obj_fd);
2766	if (IS_ERR(ptr: attach_btf)) {
2767	err = -EINVAL;
2768	goto put_token;
2769	}
2770	if (!btf_is_kernel(btf: attach_btf)) {
2771	/ attaching through specifying bpf_prog's BTF*
2772	* objects directly might be supported eventually
2773	*/
2774	btf_put(btf: attach_btf);
2775	err = -ENOTSUPP;
2776	goto put_token;
2777	}
2778	}
2779	} else if (attr->attach_btf_id) {
2780	/ fall back to vmlinux BTF, if BTF type ID is specified /
2781	attach_btf = bpf_get_btf_vmlinux();
2782	if (IS_ERR(ptr: attach_btf)) {
2783	err = PTR_ERR(ptr: attach_btf);
2784	goto put_token;
2785	}
2786	if (!attach_btf) {
2787	err = -EINVAL;
2788	goto put_token;
2789	}
2790	btf_get(btf: attach_btf);
2791	}
2792
2793	if (bpf_prog_load_check_attach(prog_type: type, expected_attach_type: attr->expected_attach_type,
2794	attach_btf, btf_id: attr->attach_btf_id,
2795	dst_prog)) {
2796	if (dst_prog)
2797	bpf_prog_put(dst_prog);
2798	if (attach_btf)
2799	btf_put(btf: attach_btf);
2800	err = -EINVAL;
2801	goto put_token;
2802	}
2803
2804	/ plain bpf_prog allocation /
2805	prog = bpf_prog_alloc(size: bpf_prog_size(proglen: attr->insn_cnt), GFP_USER);
2806	if (!prog) {
2807	if (dst_prog)
2808	bpf_prog_put(dst_prog);
2809	if (attach_btf)
2810	btf_put(btf: attach_btf);
2811	err = -EINVAL;
2812	goto put_token;
2813	}
2814
2815	prog->expected_attach_type = attr->expected_attach_type;
2816	prog->sleepable = !!(attr->prog_flags & BPF_F_SLEEPABLE);
2817	prog->aux->attach_btf = attach_btf;
2818	prog->aux->attach_btf_id = attr->attach_btf_id;
2819	prog->aux->dst_prog = dst_prog;
2820	prog->aux->dev_bound = !!attr->prog_ifindex;
2821	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
2822
2823	/ move token into prog->aux, reuse taken refcnt /
2824	prog->aux->token = token;
2825	token = NULL;
2826
2827	prog->aux->user = get_current_user();
2828	prog->len = attr->insn_cnt;
2829
2830	err = -EFAULT;
2831	if (copy_from_bpfptr(dst: prog->insns,
2832	src: make_bpfptr(addr: attr->insns, is_kernel: uattr.is_kernel),
2833	size: bpf_prog_insn_size(prog)) != `0`)
2834	goto free_prog;
2835	/ copy eBPF program license from user space /
2836	if (strncpy_from_bpfptr(dst: license,
2837	src: make_bpfptr(addr: attr->license, is_kernel: uattr.is_kernel),
2838	count: sizeof(license) - `1`) < `0`)
2839	goto free_prog;
2840	license[sizeof(license) - `1`] = `0`;
2841
2842	/ eBPF programs must be GPL compatible to use GPL-ed functions /
2843	prog->gpl_compatible = license_is_gpl_compatible(license) ? `1` : `0`;
2844
2845	prog->orig_prog = NULL;
2846	prog->jited = `0`;
2847
2848	atomic64_set(v: &prog->aux->refcnt, i: `1`);
2849
2850	if (bpf_prog_is_dev_bound(aux: prog->aux)) {
2851	err = bpf_prog_dev_bound_init(prog, attr);
2852	if (err)
2853	goto free_prog;
2854	}
2855
2856	if (type == BPF_PROG_TYPE_EXT && dst_prog &&
2857	bpf_prog_is_dev_bound(aux: dst_prog->aux)) {
2858	err = bpf_prog_dev_bound_inherit(new_prog: prog, old_prog: dst_prog);
2859	if (err)
2860	goto free_prog;
2861	}
2862
2863	/*
2864	* Bookkeeping for managing the program attachment chain.
2865	*
2866	* It might be tempting to set attach_tracing_prog flag at the attachment
2867	* time, but this will not prevent from loading bunch of tracing prog
2868	* first, then attach them one to another.
2869	*
2870	* The flag attach_tracing_prog is set for the whole program lifecycle, and
2871	* doesn't have to be cleared in bpf_tracing_link_release, since tracing
2872	* programs cannot change attachment target.
2873	*/
2874	if (type == BPF_PROG_TYPE_TRACING && dst_prog &&
2875	dst_prog->type == BPF_PROG_TYPE_TRACING) {
2876	prog->aux->attach_tracing_prog = true;
2877	}
2878
2879	/ find program type: socket_filter vs tracing_filter /
2880	err = find_prog_type(type, prog);
2881	if (err < `0`)
2882	goto free_prog;
2883
2884	prog->aux->load_time = ktime_get_boottime_ns();
2885	err = bpf_obj_name_cpy(dst: prog->aux->name, src: attr->prog_name,
2886	size: sizeof(attr->prog_name));
2887	if (err < `0`)
2888	goto free_prog;
2889
2890	err = security_bpf_prog_load(prog, attr, token);
2891	if (err)
2892	goto free_prog_sec;
2893
2894	/ run eBPF verifier /
2895	err = bpf_check(fp: &prog, attr, uattr, uattr_size);
2896	if (err < `0`)
2897	goto free_used_maps;
2898
2899	prog = bpf_prog_select_runtime(fp: prog, err: &err);
2900	if (err < `0`)
2901	goto free_used_maps;
2902
2903	err = bpf_prog_alloc_id(prog);
2904	if (err)
2905	goto free_used_maps;
2906
2907	/ Upon success of bpf_prog_alloc_id(), the BPF prog is*
2908	* effectively publicly exposed. However, retrieving via
2909	* bpf_prog_get_fd_by_id() will take another reference,
2910	* therefore it cannot be gone underneath us.
2911	*
2912	* Only for the time /after/ successful bpf_prog_new_fd()
2913	* and before returning to userspace, we might just hold
2914	* one reference and any parallel close on that fd could
2915	* rip everything out. Hence, below notifications must
2916	* happen before bpf_prog_new_fd().
2917	*
2918	* Also, any failure handling from this point onwards must
2919	* be using bpf_prog_put() given the program is exposed.
2920	*/
2921	bpf_prog_kallsyms_add(fp: prog);
2922	perf_event_bpf_event(prog, type: PERF_BPF_EVENT_PROG_LOAD, flags: `0`);
2923	bpf_audit_prog(prog, op: BPF_AUDIT_LOAD);
2924
2925	err = bpf_prog_new_fd(prog);
2926	if (err < `0`)
2927	bpf_prog_put(prog);
2928	return err;
2929
2930	free_used_maps:
2931	/ In case we have subprogs, we need to wait for a grace*
2932	* period before we can tear down JIT memory since symbols
2933	* are already exposed under kallsyms.
2934	*/
2935	__bpf_prog_put_noref(prog, deferred: prog->aux->real_func_cnt);
2936	return err;
2937
2938	free_prog_sec:
2939	security_bpf_prog_free(prog);
2940	free_prog:
2941	free_uid(prog->aux->user);
2942	if (prog->aux->attach_btf)
2943	btf_put(btf: prog->aux->attach_btf);
2944	bpf_prog_free(fp: prog);
2945	put_token:
2946	bpf_token_put(token);
2947	return err;
2948	}
2949
2950	#define BPF_OBJ_LAST_FIELD path_fd
2951
2952	static int bpf_obj_pin(const union bpf_attr *attr)
2953	{
2954	int path_fd;
2955
2956	if (CHECK_ATTR(BPF_OBJ) \|\| attr->file_flags & ~BPF_F_PATH_FD)
2957	return -EINVAL;
2958
2959	/ path_fd has to be accompanied by BPF_F_PATH_FD flag /
2960	if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2961	return -EINVAL;
2962
2963	path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2964	return bpf_obj_pin_user(ufd: attr->bpf_fd, path_fd,
2965	u64_to_user_ptr(attr->pathname));
2966	}
2967
2968	static int bpf_obj_get(const union bpf_attr *attr)
2969	{
2970	int path_fd;
2971
2972	if (CHECK_ATTR(BPF_OBJ) \|\| attr->bpf_fd != `0` \|\|
2973	attr->file_flags & ~(BPF_OBJ_FLAG_MASK \| BPF_F_PATH_FD))
2974	return -EINVAL;
2975
2976	/ path_fd has to be accompanied by BPF_F_PATH_FD flag /
2977	if (!(attr->file_flags & BPF_F_PATH_FD) && attr->path_fd)
2978	return -EINVAL;
2979
2980	path_fd = attr->file_flags & BPF_F_PATH_FD ? attr->path_fd : AT_FDCWD;
2981	return bpf_obj_get_user(path_fd, u64_to_user_ptr(attr->pathname),
2982	flags: attr->file_flags);
2983	}
2984
2985	void bpf_link_init(struct bpf_link link, enum* bpf_link_type type,
2986	const struct bpf_link_ops ops, struct* bpf_prog *prog)
2987	{
2988	atomic64_set(v: &link->refcnt, i: `1`);
2989	link->type = type;
2990	link->id = `0`;
2991	link->ops = ops;
2992	link->prog = prog;
2993	}
2994
2995	static void bpf_link_free_id(int id)
2996	{
2997	if (!id)
2998	return;
2999
3000	spin_lock_bh(lock: &link_idr_lock);
3001	idr_remove(&link_idr, id);
3002	spin_unlock_bh(lock: &link_idr_lock);
3003	}
3004
3005	/ Clean up bpf_link and corresponding anon_inode file and FD. After*
3006	* anon_inode is created, bpf_link can't be just kfree()'d due to deferred
3007	* anon_inode's release() call. This helper marks bpf_link as
3008	* defunct, releases anon_inode file and puts reserved FD. bpf_prog's refcnt
3009	* is not decremented, it's the responsibility of a calling code that failed
3010	* to complete bpf_link initialization.
3011	* This helper eventually calls link's dealloc callback, but does not call
3012	* link's release callback.
3013	*/
3014	void bpf_link_cleanup(struct bpf_link_primer *primer)
3015	{
3016	primer->link->prog = NULL;
3017	bpf_link_free_id(id: primer->id);
3018	fput(primer->file);
3019	put_unused_fd(fd: primer->fd);
3020	}
3021
3022	void bpf_link_inc(struct bpf_link *link)
3023	{
3024	atomic64_inc(v: &link->refcnt);
3025	}
3026
3027	static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
3028	{
3029	struct bpf_link link = container_of(rcu, struct* bpf_link, rcu);
3030
3031	/ free bpf_link and its containing memory /
3032	link->ops->dealloc_deferred(link);
3033	}
3034
3035	static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
3036	{
3037	if (rcu_trace_implies_rcu_gp())
3038	bpf_link_defer_dealloc_rcu_gp(rcu);
3039	else
3040	call_rcu(head: rcu, func: bpf_link_defer_dealloc_rcu_gp);
3041	}
3042
3043	/ bpf_link_free is guaranteed to be called from process context /
3044	static void bpf_link_free(struct bpf_link *link)
3045	{
3046	bool sleepable = false;
3047
3048	bpf_link_free_id(id: link->id);
3049	if (link->prog) {
3050	sleepable = link->prog->sleepable;
3051	/ detach BPF program, clean up used resources /
3052	link->ops->release(link);
3053	bpf_prog_put(link->prog);
3054	}
3055	if (link->ops->dealloc_deferred) {
3056	/ schedule BPF link deallocation; if underlying BPF program*
3057	* is sleepable, we need to first wait for RCU tasks trace
3058	* sync, then go through "classic" RCU grace period
3059	*/
3060	if (sleepable)
3061	call_rcu_tasks_trace(rhp: &link->rcu, func: bpf_link_defer_dealloc_mult_rcu_gp);
3062	else
3063	call_rcu(head: &link->rcu, func: bpf_link_defer_dealloc_rcu_gp);
3064	}
3065	if (link->ops->dealloc)
3066	link->ops->dealloc(link);
3067	}
3068
3069	static void bpf_link_put_deferred(struct work_struct *work)
3070	{
3071	struct bpf_link link = container_of(work, struct* bpf_link, work);
3072
3073	bpf_link_free(link);
3074	}
3075
3076	/ bpf_link_put might be called from atomic context. It needs to be called*
3077	* from sleepable context in order to acquire sleeping locks during the process.
3078	*/
3079	void bpf_link_put(struct bpf_link *link)
3080	{
3081	if (!atomic64_dec_and_test(v: &link->refcnt))
3082	return;
3083
3084	INIT_WORK(&link->work, bpf_link_put_deferred);
3085	schedule_work(work: &link->work);
3086	}
3087	EXPORT_SYMBOL(bpf_link_put);
3088
3089	static void bpf_link_put_direct(struct bpf_link *link)
3090	{
3091	if (!atomic64_dec_and_test(v: &link->refcnt))
3092	return;
3093	bpf_link_free(link);
3094	}
3095
3096	static int bpf_link_release(struct inode inode, struct* file *filp)
3097	{
3098	struct bpf_link *link = filp->private_data;
3099
3100	bpf_link_put_direct(link);
3101	return `0`;
3102	}
3103
#ifdef CONFIG_PROC_FS
/* Build a link-type -> name table from the BPF_LINK_TYPE() entries of
 * <linux/bpf_types.h>; the program and map entries expand to nothing.
 */
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
#define BPF_MAP_TYPE(_id, _ops)
#define BPF_LINK_TYPE(_id, _name) [_id] = #_name,
static const char *bpf_link_type_strs[] = {
	[BPF_LINK_TYPE_UNSPEC] = "<invalid>",
#include <linux/bpf_types.h>
};
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE

/*
 * ->show_fdinfo() handler of bpf_link_fops: prints link type and ID, the
 * attached program's tag and ID (if a program is set), then any
 * link-specific details via ops->show_fdinfo().
 */
static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
{
	const struct bpf_link *link = filp->private_data;
	const struct bpf_prog *prog = link->prog;
	enum bpf_link_type type = link->type;
	/* two hex digits per tag byte plus the terminating NUL */
	char prog_tag[sizeof(prog->tag) * 2 + 1] = { };

	/* A type with no table entry must not index past the array or print
	 * a NULL string; fall back to the numeric value instead.
	 */
	if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) {
		seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]);
	} else {
		WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type);
		seq_printf(m, "link_type:\t<%u>\n", type);
	}
	seq_printf(m, "link_id:\t%u\n", link->id);

	if (prog) {
		bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
		seq_printf(m,
			   "prog_tag:\t%s\n"
			   "prog_id:\t%u\n",
			   prog_tag,
			   prog->aux->id);
	}
	if (link->ops->show_fdinfo)
		link->ops->show_fdinfo(link, m);
}
#endif
3139
3140	static const struct file_operations bpf_link_fops = {
3141	#ifdef CONFIG_PROC_FS
3142	.show_fdinfo = bpf_link_show_fdinfo,
3143	#endif
3144	.release = bpf_link_release,
3145	.read = bpf_dummy_read,
3146	.write = bpf_dummy_write,
3147	};
3148
3149	static int bpf_link_alloc_id(struct bpf_link *link)
3150	{
3151	int id;
3152
3153	idr_preload(GFP_KERNEL);
3154	spin_lock_bh(lock: &link_idr_lock);
3155	id = idr_alloc_cyclic(&link_idr, ptr: link, start: `1`, INT_MAX, GFP_ATOMIC);
3156	spin_unlock_bh(lock: &link_idr_lock);
3157	idr_preload_end();
3158
3159	return id;
3160	}
3161
3162	/ Prepare bpf_link to be exposed to user-space by allocating anon_inode file,*
3163	* reserving unused FD and allocating ID from link_idr. This is to be paired
3164	* with bpf_link_settle() to install FD and ID and expose bpf_link to
3165	* user-space, if bpf_link is successfully attached. If not, bpf_link and
3166	* pre-allocated resources are to be freed with bpf_cleanup() call. All the
3167	* transient state is passed around in struct bpf_link_primer.
3168	* This is preferred way to create and initialize bpf_link, especially when
3169	* there are complicated and expensive operations in between creating bpf_link
3170	* itself and attaching it to BPF hook. By using bpf_link_prime() and
3171	* bpf_link_settle() kernel code using bpf_link doesn't have to perform
3172	* expensive (and potentially failing) roll back operations in a rare case
3173	* that file, FD, or ID can't be allocated.
3174	*/
3175	int bpf_link_prime(struct bpf_link link, struct* bpf_link_primer *primer)
3176	{
3177	struct file *file;
3178	int fd, id;
3179
3180	fd = get_unused_fd_flags(O_CLOEXEC);
3181	if (fd < `0`)
3182	return fd;
3183
3184
3185	id = bpf_link_alloc_id(link);
3186	if (id < `0`) {
3187	put_unused_fd(fd);
3188	return id;
3189	}
3190
3191	file = anon_inode_getfile(name: "bpf_link", fops: &bpf_link_fops, priv: link, O_CLOEXEC);
3192	if (IS_ERR(ptr: file)) {
3193	bpf_link_free_id(id);
3194	put_unused_fd(fd);
3195	return PTR_ERR(ptr: file);
3196	}
3197
3198	primer->link = link;
3199	primer->file = file;
3200	primer->fd = fd;
3201	primer->id = id;
3202	return `0`;
3203	}
3204
3205	int bpf_link_settle(struct bpf_link_primer *primer)
3206	{
3207	/ make bpf_link fetchable by ID /
3208	spin_lock_bh(lock: &link_idr_lock);
3209	primer->link->id = primer->id;
3210	spin_unlock_bh(lock: &link_idr_lock);
3211	/ make bpf_link fetchable by FD /
3212	fd_install(fd: primer->fd, file: primer->file);
3213	/ pass through installed FD /
3214	return primer->fd;
3215	}
3216
3217	int bpf_link_new_fd(struct bpf_link *link)
3218	{
3219	return anon_inode_getfd(name: "bpf-link", fops: &bpf_link_fops, priv: link, O_CLOEXEC);
3220	}
3221
3222	struct bpf_link *bpf_link_get_from_fd(u32 ufd)
3223	{
3224	struct fd f = fdget(fd: ufd);
3225	struct bpf_link *link;
3226
3227	if (!f.file)
3228	return ERR_PTR(error: -EBADF);
3229	if (f.file->f_op != &bpf_link_fops) {
3230	fdput(fd: f);
3231	return ERR_PTR(error: -EINVAL);
3232	}
3233
3234	link = f.file->private_data;
3235	bpf_link_inc(link);
3236	fdput(fd: f);
3237
3238	return link;
3239	}
3240	EXPORT_SYMBOL(bpf_link_get_from_fd);
3241
3242	static void bpf_tracing_link_release(struct bpf_link *link)
3243	{
3244	struct bpf_tracing_link *tr_link =
3245	container_of(link, struct bpf_tracing_link, link.link);
3246
3247	WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
3248	tr_link->trampoline));
3249
3250	bpf_trampoline_put(tr: tr_link->trampoline);
3251
3252	/ tgt_prog is NULL if target is a kernel function /
3253	if (tr_link->tgt_prog)
3254	bpf_prog_put(tr_link->tgt_prog);
3255	}
3256
3257	static void bpf_tracing_link_dealloc(struct bpf_link *link)
3258	{
3259	struct bpf_tracing_link *tr_link =
3260	container_of(link, struct bpf_tracing_link, link.link);
3261
3262	kfree(objp: tr_link);
3263	}
3264
3265	static void bpf_tracing_link_show_fdinfo(const struct bpf_link *link,
3266	struct seq_file *seq)
3267	{
3268	struct bpf_tracing_link *tr_link =
3269	container_of(link, struct bpf_tracing_link, link.link);
3270	u32 target_btf_id, target_obj_id;
3271
3272	bpf_trampoline_unpack_key(key: tr_link->trampoline->key,
3273	obj_id: &target_obj_id, btf_id: &target_btf_id);
3274	seq_printf(m: seq,
3275	fmt: "attach_type:\t%d\n"
3276	"target_obj_id:\t%u\n"
3277	"target_btf_id:\t%u\n",
3278	tr_link->attach_type,
3279	target_obj_id,
3280	target_btf_id);
3281	}
3282
3283	static int bpf_tracing_link_fill_link_info(const struct bpf_link *link,
3284	struct bpf_link_info *info)
3285	{
3286	struct bpf_tracing_link *tr_link =
3287	container_of(link, struct bpf_tracing_link, link.link);
3288
3289	info->tracing.attach_type = tr_link->attach_type;
3290	bpf_trampoline_unpack_key(key: tr_link->trampoline->key,
3291	obj_id: &info->tracing.target_obj_id,
3292	btf_id: &info->tracing.target_btf_id);
3293
3294	return `0`;
3295	}
3296
3297	static const struct bpf_link_ops bpf_tracing_link_lops = {
3298	.release = bpf_tracing_link_release,
3299	.dealloc = bpf_tracing_link_dealloc,
3300	.show_fdinfo = bpf_tracing_link_show_fdinfo,
3301	.fill_link_info = bpf_tracing_link_fill_link_info,
3302	};
3303
3304	static int bpf_tracing_prog_attach(struct bpf_prog *prog,
3305	int tgt_prog_fd,
3306	u32 btf_id,
3307	u64 bpf_cookie)
3308	{
3309	struct bpf_link_primer link_primer;
3310	struct bpf_prog *tgt_prog = NULL;
3311	struct bpf_trampoline *tr = NULL;
3312	struct bpf_tracing_link *link;
3313	u64 key = `0`;
3314	int err;
3315
3316	switch (prog->type) {
3317	case BPF_PROG_TYPE_TRACING:
3318	if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
3319	prog->expected_attach_type != BPF_TRACE_FEXIT &&
3320	prog->expected_attach_type != BPF_MODIFY_RETURN) {
3321	err = -EINVAL;
3322	goto out_put_prog;
3323	}
3324	break;
3325	case BPF_PROG_TYPE_EXT:
3326	if (prog->expected_attach_type != `0`) {
3327	err = -EINVAL;
3328	goto out_put_prog;
3329	}
3330	break;
3331	case BPF_PROG_TYPE_LSM:
3332	if (prog->expected_attach_type != BPF_LSM_MAC) {
3333	err = -EINVAL;
3334	goto out_put_prog;
3335	}
3336	break;
3337	default:
3338	err = -EINVAL;
3339	goto out_put_prog;
3340	}
3341
3342	if (!!tgt_prog_fd != !!btf_id) {
3343	err = -EINVAL;
3344	goto out_put_prog;
3345	}
3346
3347	if (tgt_prog_fd) {
3348	/*
3349	* For now we only allow new targets for BPF_PROG_TYPE_EXT. If this
3350	* part would be changed to implement the same for
3351	* BPF_PROG_TYPE_TRACING, do not forget to update the way how
3352	* attach_tracing_prog flag is set.
3353	*/
3354	if (prog->type != BPF_PROG_TYPE_EXT) {
3355	err = -EINVAL;
3356	goto out_put_prog;
3357	}
3358
3359	tgt_prog = bpf_prog_get(ufd: tgt_prog_fd);
3360	if (IS_ERR(ptr: tgt_prog)) {
3361	err = PTR_ERR(ptr: tgt_prog);
3362	tgt_prog = NULL;
3363	goto out_put_prog;
3364	}
3365
3366	key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
3367	}
3368
3369	link = kzalloc(size: sizeof(*link), GFP_USER);
3370	if (!link) {
3371	err = -ENOMEM;
3372	goto out_put_prog;
3373	}
3374	bpf_link_init(link: &link->link.link, type: BPF_LINK_TYPE_TRACING,
3375	ops: &bpf_tracing_link_lops, prog);
3376	link->attach_type = prog->expected_attach_type;
3377	link->link.cookie = bpf_cookie;
3378
3379	mutex_lock(&prog->aux->dst_mutex);
3380
3381	/ There are a few possible cases here:*
3382	*
3383	* - if prog->aux->dst_trampoline is set, the program was just loaded
3384	* and not yet attached to anything, so we can use the values stored
3385	* in prog->aux
3386	*
3387	* - if prog->aux->dst_trampoline is NULL, the program has already been
3388	* attached to a target and its initial target was cleared (below)
3389	*
3390	* - if tgt_prog != NULL, the caller specified tgt_prog_fd +
3391	* target_btf_id using the link_create API.
3392	*
3393	* - if tgt_prog == NULL when this function was called using the old
3394	* raw_tracepoint_open API, and we need a target from prog->aux
3395	*
3396	* - if prog->aux->dst_trampoline and tgt_prog is NULL, the program
3397	* was detached and is going for re-attachment.
3398	*
3399	* - if prog->aux->dst_trampoline is NULL and tgt_prog and prog->aux->attach_btf
3400	* are NULL, then program was already attached and user did not provide
3401	* tgt_prog_fd so we have no way to find out or create trampoline
3402	*/
3403	if (!prog->aux->dst_trampoline && !tgt_prog) {
3404	/*
3405	* Allow re-attach for TRACING and LSM programs. If it's
3406	* currently linked, bpf_trampoline_link_prog will fail.
3407	* EXT programs need to specify tgt_prog_fd, so they
3408	* re-attach in separate code path.
3409	*/
3410	if (prog->type != BPF_PROG_TYPE_TRACING &&
3411	prog->type != BPF_PROG_TYPE_LSM) {
3412	err = -EINVAL;
3413	goto out_unlock;
3414	}
3415	/ We can allow re-attach only if we have valid attach_btf. /
3416	if (!prog->aux->attach_btf) {
3417	err = -EINVAL;
3418	goto out_unlock;
3419	}
3420	btf_id = prog->aux->attach_btf_id;
3421	key = bpf_trampoline_compute_key(NULL, btf: prog->aux->attach_btf, btf_id);
3422	}
3423
3424	if (!prog->aux->dst_trampoline \|\|
3425	(key && key != prog->aux->dst_trampoline->key)) {
3426	/ If there is no saved target, or the specified target is*
3427	* different from the destination specified at load time, we
3428	* need a new trampoline and a check for compatibility
3429	*/
3430	struct bpf_attach_target_info tgt_info = {};
3431
3432	err = bpf_check_attach_target(NULL, prog, tgt_prog, btf_id,
3433	tgt_info: &tgt_info);
3434	if (err)
3435	goto out_unlock;
3436
3437	if (tgt_info.tgt_mod) {
3438	module_put(module: prog->aux->mod);
3439	prog->aux->mod = tgt_info.tgt_mod;
3440	}
3441
3442	tr = bpf_trampoline_get(key, tgt_info: &tgt_info);
3443	if (!tr) {
3444	err = -ENOMEM;
3445	goto out_unlock;
3446	}
3447	} else {
3448	/ The caller didn't specify a target, or the target was the*
3449	* same as the destination supplied during program load. This
3450	* means we can reuse the trampoline and reference from program
3451	* load time, and there is no need to allocate a new one. This
3452	* can only happen once for any program, as the saved values in
3453	* prog->aux are cleared below.
3454	*/
3455	tr = prog->aux->dst_trampoline;
3456	tgt_prog = prog->aux->dst_prog;
3457	}
3458
3459	err = bpf_link_prime(link: &link->link.link, primer: &link_primer);
3460	if (err)
3461	goto out_unlock;
3462
3463	err = bpf_trampoline_link_prog(link: &link->link, tr);
3464	if (err) {
3465	bpf_link_cleanup(primer: &link_primer);
3466	link = NULL;
3467	goto out_unlock;
3468	}
3469
3470	link->tgt_prog = tgt_prog;
3471	link->trampoline = tr;
3472
3473	/ Always clear the trampoline and target prog from prog->aux to make*
3474	* sure the original attach destination is not kept alive after a
3475	* program is (re-)attached to another target.
3476	*/
3477	if (prog->aux->dst_prog &&
3478	(tgt_prog_fd \|\| tr != prog->aux->dst_trampoline))
3479	/ got extra prog ref from syscall, or attaching to different prog /
3480	bpf_prog_put(prog->aux->dst_prog);
3481	if (prog->aux->dst_trampoline && tr != prog->aux->dst_trampoline)
3482	/ we allocated a new trampoline, so free the old one /
3483	bpf_trampoline_put(tr: prog->aux->dst_trampoline);
3484
3485	prog->aux->dst_prog = NULL;
3486	prog->aux->dst_trampoline = NULL;
3487	mutex_unlock(lock: &prog->aux->dst_mutex);
3488
3489	return bpf_link_settle(primer: &link_primer);
3490	out_unlock:
3491	if (tr && tr != prog->aux->dst_trampoline)
3492	bpf_trampoline_put(tr);
3493	mutex_unlock(lock: &prog->aux->dst_mutex);
3494	kfree(objp: link);
3495	out_put_prog:
3496	if (tgt_prog_fd && tgt_prog)
3497	bpf_prog_put(tgt_prog);
3498	return err;
3499	}
3500
3501	struct bpf_raw_tp_link {
3502	struct bpf_link link;
3503	struct bpf_raw_event_map *btp;
3504	};
3505
3506	static void bpf_raw_tp_link_release(struct bpf_link *link)
3507	{
3508	struct bpf_raw_tp_link *raw_tp =
3509	container_of(link, struct bpf_raw_tp_link, link);
3510
3511	bpf_probe_unregister(btp: raw_tp->btp, prog: raw_tp->link.prog);
3512	bpf_put_raw_tracepoint(btp: raw_tp->btp);
3513	}
3514
3515	static void bpf_raw_tp_link_dealloc(struct bpf_link *link)
3516	{
3517	struct bpf_raw_tp_link *raw_tp =
3518	container_of(link, struct bpf_raw_tp_link, link);
3519
3520	kfree(objp: raw_tp);
3521	}
3522
3523	static void bpf_raw_tp_link_show_fdinfo(const struct bpf_link *link,
3524	struct seq_file *seq)
3525	{
3526	struct bpf_raw_tp_link *raw_tp_link =
3527	container_of(link, struct bpf_raw_tp_link, link);
3528
3529	seq_printf(m: seq,
3530	fmt: "tp_name:\t%s\n",
3531	raw_tp_link->btp->tp->name);
3532	}
3533
3534	static int bpf_copy_to_user(char __user ubuf, const* char *buf, u32 ulen,
3535	u32 len)
3536	{
3537	if (ulen >= len + `1`) {
3538	if (copy_to_user(to: ubuf, from: buf, n: len + `1`))
3539	return -EFAULT;
3540	} else {
3541	char zero = `'\0'`;
3542
3543	if (copy_to_user(to: ubuf, from: buf, n: ulen - `1`))
3544	return -EFAULT;
3545	if (put_user(zero, ubuf + ulen - `1`))
3546	return -EFAULT;
3547	return -ENOSPC;
3548	}
3549
3550	return `0`;
3551	}
3552
3553	static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
3554	struct bpf_link_info *info)
3555	{
3556	struct bpf_raw_tp_link *raw_tp_link =
3557	container_of(link, struct bpf_raw_tp_link, link);
3558	char __user *ubuf = u64_to_user_ptr(info->raw_tracepoint.tp_name);
3559	const char *tp_name = raw_tp_link->btp->tp->name;
3560	u32 ulen = info->raw_tracepoint.tp_name_len;
3561	size_t tp_len = strlen(tp_name);
3562
3563	if (!ulen ^ !ubuf)
3564	return -EINVAL;
3565
3566	info->raw_tracepoint.tp_name_len = tp_len + `1`;
3567
3568	if (!ubuf)
3569	return `0`;
3570
3571	return bpf_copy_to_user(ubuf, buf: tp_name, ulen, len: tp_len);
3572	}
3573
3574	static const struct bpf_link_ops bpf_raw_tp_link_lops = {
3575	.release = bpf_raw_tp_link_release,
3576	.dealloc_deferred = bpf_raw_tp_link_dealloc,
3577	.show_fdinfo = bpf_raw_tp_link_show_fdinfo,
3578	.fill_link_info = bpf_raw_tp_link_fill_link_info,
3579	};
3580
3581	#ifdef CONFIG_PERF_EVENTS
3582	struct bpf_perf_link {
3583	struct bpf_link link;
3584	struct file *perf_file;
3585	};
3586
3587	static void bpf_perf_link_release(struct bpf_link *link)
3588	{
3589	struct bpf_perf_link perf_link = container_of(link, struct* bpf_perf_link, link);
3590	struct perf_event *event = perf_link->perf_file->private_data;
3591
3592	perf_event_free_bpf_prog(event);
3593	fput(perf_link->perf_file);
3594	}
3595
3596	static void bpf_perf_link_dealloc(struct bpf_link *link)
3597	{
3598	struct bpf_perf_link perf_link = container_of(link, struct* bpf_perf_link, link);
3599
3600	kfree(objp: perf_link);
3601	}
3602
3603	static int bpf_perf_link_fill_common(const struct perf_event *event,
3604	char __user *uname, u32 ulen,
3605	u64 probe_offset, u64 probe_addr,
3606	u32 fd_type, unsigned* long *missed)
3607	{
3608	const char *buf;
3609	u32 prog_id;
3610	size_t len;
3611	int err;
3612
3613	if (!ulen ^ !uname)
3614	return -EINVAL;
3615
3616	err = bpf_get_perf_event_info(event, prog_id: &prog_id, fd_type, buf: &buf,
3617	probe_offset, probe_addr, missed);
3618	if (err)
3619	return err;
3620	if (!uname)
3621	return `0`;
3622	if (buf) {
3623	len = strlen(buf);
3624	err = bpf_copy_to_user(ubuf: uname, buf, ulen, len);
3625	if (err)
3626	return err;
3627	} else {
3628	char zero = `'\0'`;
3629
3630	if (put_user(zero, uname))
3631	return -EFAULT;
3632	}
3633	return `0`;
3634	}
3635
#ifdef CONFIG_KPROBE_EVENTS
/*
 * Fill info->perf_event for a kprobe/kretprobe perf link.
 *
 * Copies the function name to the user buffer named in @info (via
 * bpf_perf_link_fill_common()) and records probe type, offset, missed count,
 * address and cookie. The address is reported as 0 when
 * kallsyms_show_value() denies it for the current credentials.
 * Returns 0 or the error from bpf_perf_link_fill_common().
 */
static int bpf_perf_link_fill_kprobe(const struct perf_event *event,
				     struct bpf_link_info *info)
{
	unsigned long missed;
	char __user *uname;
	u64 addr, offset;
	u32 ulen, type;
	int err;

	uname = u64_to_user_ptr(info->perf_event.kprobe.func_name);
	ulen = info->perf_event.kprobe.name_len;
	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
					&type, &missed);
	if (err)
		return err;
	if (type == BPF_FD_TYPE_KRETPROBE)
		info->perf_event.type = BPF_PERF_EVENT_KRETPROBE;
	else
		info->perf_event.type = BPF_PERF_EVENT_KPROBE;

	info->perf_event.kprobe.offset = offset;
	info->perf_event.kprobe.missed = missed;
	/* Do not expose the kernel address unless the caller may see it. */
	if (!kallsyms_show_value(current_cred()))
		addr = 0;
	info->perf_event.kprobe.addr = addr;
	info->perf_event.kprobe.cookie = event->bpf_cookie;
	return 0;
}
#endif
3666
#ifdef CONFIG_UPROBE_EVENTS
/*
 * Fill info->perf_event for a uprobe/uretprobe perf link.
 *
 * Copies the file name to the user buffer named in @info (via
 * bpf_perf_link_fill_common()) and records probe type, offset and cookie.
 * Returns 0 or the error from bpf_perf_link_fill_common().
 */
static int bpf_perf_link_fill_uprobe(const struct perf_event *event,
				     struct bpf_link_info *info)
{
	char __user *uname;
	u64 addr, offset;
	u32 ulen, type;
	int err;

	uname = u64_to_user_ptr(info->perf_event.uprobe.file_name);
	ulen = info->perf_event.uprobe.name_len;
	/* addr is fetched by the common helper but not reported for uprobes. */
	err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr,
					&type, NULL);
	if (err)
		return err;

	if (type == BPF_FD_TYPE_URETPROBE)
		info->perf_event.type = BPF_PERF_EVENT_URETPROBE;
	else
		info->perf_event.type = BPF_PERF_EVENT_UPROBE;
	info->perf_event.uprobe.offset = offset;
	info->perf_event.uprobe.cookie = event->bpf_cookie;
	return 0;
}
#endif
3692
3693	static int bpf_perf_link_fill_probe(const struct perf_event *event,
3694	struct bpf_link_info *info)
3695	{
3696	#ifdef CONFIG_KPROBE_EVENTS
3697	if (event->tp_event->flags & TRACE_EVENT_FL_KPROBE)
3698	return bpf_perf_link_fill_kprobe(event, info);
3699	#endif
3700	#ifdef CONFIG_UPROBE_EVENTS
3701	if (event->tp_event->flags & TRACE_EVENT_FL_UPROBE)
3702	return bpf_perf_link_fill_uprobe(event, info);
3703	#endif
3704	return -EOPNOTSUPP;
3705	}
3706
3707	static int bpf_perf_link_fill_tracepoint(const struct perf_event *event,
3708	struct bpf_link_info *info)
3709	{
3710	char __user *uname;
3711	u32 ulen;
3712
3713	uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name);
3714	ulen = info->perf_event.tracepoint.name_len;
3715	info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT;
3716	info->perf_event.tracepoint.cookie = event->bpf_cookie;
3717	return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL);
3718	}
3719
3720	static int bpf_perf_link_fill_perf_event(const struct perf_event *event,
3721	struct bpf_link_info *info)
3722	{
3723	info->perf_event.event.type = event->attr.type;
3724	info->perf_event.event.config = event->attr.config;
3725	info->perf_event.event.cookie = event->bpf_cookie;
3726	info->perf_event.type = BPF_PERF_EVENT_EVENT;
3727	return `0`;
3728	}
3729
3730	static int bpf_perf_link_fill_link_info(const struct bpf_link *link,
3731	struct bpf_link_info *info)
3732	{
3733	struct bpf_perf_link *perf_link;
3734	const struct perf_event *event;
3735
3736	perf_link = container_of(link, struct bpf_perf_link, link);
3737	event = perf_get_event(file: perf_link->perf_file);
3738	if (IS_ERR(ptr: event))
3739	return PTR_ERR(ptr: event);
3740
3741	switch (event->prog->type) {
3742	case BPF_PROG_TYPE_PERF_EVENT:
3743	return bpf_perf_link_fill_perf_event(event, info);
3744	case BPF_PROG_TYPE_TRACEPOINT:
3745	return bpf_perf_link_fill_tracepoint(event, info);
3746	case BPF_PROG_TYPE_KPROBE:
3747	return bpf_perf_link_fill_probe(event, info);
3748	default:
3749	return -EOPNOTSUPP;
3750	}
3751	}
3752
3753	static const struct bpf_link_ops bpf_perf_link_lops = {
3754	.release = bpf_perf_link_release,
3755	.dealloc = bpf_perf_link_dealloc,
3756	.fill_link_info = bpf_perf_link_fill_link_info,
3757	};
3758
3759	static int bpf_perf_link_attach(const union bpf_attr attr, struct* bpf_prog *prog)
3760	{
3761	struct bpf_link_primer link_primer;
3762	struct bpf_perf_link *link;
3763	struct perf_event *event;
3764	struct file *perf_file;
3765	int err;
3766
3767	if (attr->link_create.flags)
3768	return -EINVAL;
3769
3770	perf_file = perf_event_get(fd: attr->link_create.target_fd);
3771	if (IS_ERR(ptr: perf_file))
3772	return PTR_ERR(ptr: perf_file);
3773
3774	link = kzalloc(size: sizeof(*link), GFP_USER);
3775	if (!link) {
3776	err = -ENOMEM;
3777	goto out_put_file;
3778	}
3779	bpf_link_init(link: &link->link, type: BPF_LINK_TYPE_PERF_EVENT, ops: &bpf_perf_link_lops, prog);
3780	link->perf_file = perf_file;
3781
3782	err = bpf_link_prime(link: &link->link, primer: &link_primer);
3783	if (err) {
3784	kfree(objp: link);
3785	goto out_put_file;
3786	}
3787
3788	event = perf_file->private_data;
3789	err = perf_event_set_bpf_prog(event, prog, bpf_cookie: attr->link_create.perf_event.bpf_cookie);
3790	if (err) {
3791	bpf_link_cleanup(primer: &link_primer);
3792	goto out_put_file;
3793	}
3794	/ perf_event_set_bpf_prog() doesn't take its own refcnt on prog /
3795	bpf_prog_inc(prog);
3796
3797	return bpf_link_settle(primer: &link_primer);
3798
3799	out_put_file:
3800	fput(perf_file);
3801	return err;
3802	}
3803	#else
3804	static int bpf_perf_link_attach(const union bpf_attr attr, struct* bpf_prog *prog)
3805	{
3806	return -EOPNOTSUPP;
3807	}
3808	#endif /* CONFIG_PERF_EVENTS */
3809
3810	static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
3811	const char __user *user_tp_name)
3812	{
3813	struct bpf_link_primer link_primer;
3814	struct bpf_raw_tp_link *link;
3815	struct bpf_raw_event_map *btp;
3816	const char *tp_name;
3817	char buf[`128`];
3818	int err;
3819
3820	switch (prog->type) {
3821	case BPF_PROG_TYPE_TRACING:
3822	case BPF_PROG_TYPE_EXT:
3823	case BPF_PROG_TYPE_LSM:
3824	if (user_tp_name)
3825	/ The attach point for this category of programs*
3826	* should be specified via btf_id during program load.
3827	*/
3828	return -EINVAL;
3829	if (prog->type == BPF_PROG_TYPE_TRACING &&
3830	prog->expected_attach_type == BPF_TRACE_RAW_TP) {
3831	tp_name = prog->aux->attach_func_name;
3832	break;
3833	}
3834	return bpf_tracing_prog_attach(prog, tgt_prog_fd: `0`, btf_id: `0`, bpf_cookie: `0`);
3835	case BPF_PROG_TYPE_RAW_TRACEPOINT:
3836	case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
3837	if (strncpy_from_user(dst: buf, src: user_tp_name, count: sizeof(buf) - `1`) < `0`)
3838	return -EFAULT;
3839	buf[sizeof(buf) - `1`] = `0`;
3840	tp_name = buf;
3841	break;
3842	default:
3843	return -EINVAL;
3844	}
3845
3846	btp = bpf_get_raw_tracepoint(name: tp_name);
3847	if (!btp)
3848	return -ENOENT;
3849
3850	link = kzalloc(size: sizeof(*link), GFP_USER);
3851	if (!link) {
3852	err = -ENOMEM;
3853	goto out_put_btp;
3854	}
3855	bpf_link_init(link: &link->link, type: BPF_LINK_TYPE_RAW_TRACEPOINT,
3856	ops: &bpf_raw_tp_link_lops, prog);
3857	link->btp = btp;
3858
3859	err = bpf_link_prime(link: &link->link, primer: &link_primer);
3860	if (err) {
3861	kfree(objp: link);
3862	goto out_put_btp;
3863	}
3864
3865	err = bpf_probe_register(btp: link->btp, prog);
3866	if (err) {
3867	bpf_link_cleanup(primer: &link_primer);
3868	goto out_put_btp;
3869	}
3870
3871	return bpf_link_settle(primer: &link_primer);
3872
3873	out_put_btp:
3874	bpf_put_raw_tracepoint(btp);
3875	return err;
3876	}
3877
3878	#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
3879
3880	static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
3881	{
3882	struct bpf_prog *prog;
3883	int fd;
3884
3885	if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
3886	return -EINVAL;
3887
3888	prog = bpf_prog_get(ufd: attr->raw_tracepoint.prog_fd);
3889	if (IS_ERR(ptr: prog))
3890	return PTR_ERR(ptr: prog);
3891
3892	fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
3893	if (fd < `0`)
3894	bpf_prog_put(prog);
3895	return fd;
3896	}
3897
3898	static enum bpf_prog_type
3899	attach_type_to_prog_type(enum bpf_attach_type attach_type)
3900	{
3901	switch (attach_type) {
3902	case BPF_CGROUP_INET_INGRESS:
3903	case BPF_CGROUP_INET_EGRESS:
3904	return BPF_PROG_TYPE_CGROUP_SKB;
3905	case BPF_CGROUP_INET_SOCK_CREATE:
3906	case BPF_CGROUP_INET_SOCK_RELEASE:
3907	case BPF_CGROUP_INET4_POST_BIND:
3908	case BPF_CGROUP_INET6_POST_BIND:
3909	return BPF_PROG_TYPE_CGROUP_SOCK;
3910	case BPF_CGROUP_INET4_BIND:
3911	case BPF_CGROUP_INET6_BIND:
3912	case BPF_CGROUP_INET4_CONNECT:
3913	case BPF_CGROUP_INET6_CONNECT:
3914	case BPF_CGROUP_UNIX_CONNECT:
3915	case BPF_CGROUP_INET4_GETPEERNAME:
3916	case BPF_CGROUP_INET6_GETPEERNAME:
3917	case BPF_CGROUP_UNIX_GETPEERNAME:
3918	case BPF_CGROUP_INET4_GETSOCKNAME:
3919	case BPF_CGROUP_INET6_GETSOCKNAME:
3920	case BPF_CGROUP_UNIX_GETSOCKNAME:
3921	case BPF_CGROUP_UDP4_SENDMSG:
3922	case BPF_CGROUP_UDP6_SENDMSG:
3923	case BPF_CGROUP_UNIX_SENDMSG:
3924	case BPF_CGROUP_UDP4_RECVMSG:
3925	case BPF_CGROUP_UDP6_RECVMSG:
3926	case BPF_CGROUP_UNIX_RECVMSG:
3927	return BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
3928	case BPF_CGROUP_SOCK_OPS:
3929	return BPF_PROG_TYPE_SOCK_OPS;
3930	case BPF_CGROUP_DEVICE:
3931	return BPF_PROG_TYPE_CGROUP_DEVICE;
3932	case BPF_SK_MSG_VERDICT:
3933	return BPF_PROG_TYPE_SK_MSG;
3934	case BPF_SK_SKB_STREAM_PARSER:
3935	case BPF_SK_SKB_STREAM_VERDICT:
3936	case BPF_SK_SKB_VERDICT:
3937	return BPF_PROG_TYPE_SK_SKB;
3938	case BPF_LIRC_MODE2:
3939	return BPF_PROG_TYPE_LIRC_MODE2;
3940	case BPF_FLOW_DISSECTOR:
3941	return BPF_PROG_TYPE_FLOW_DISSECTOR;
3942	case BPF_CGROUP_SYSCTL:
3943	return BPF_PROG_TYPE_CGROUP_SYSCTL;
3944	case BPF_CGROUP_GETSOCKOPT:
3945	case BPF_CGROUP_SETSOCKOPT:
3946	return BPF_PROG_TYPE_CGROUP_SOCKOPT;
3947	case BPF_TRACE_ITER:
3948	case BPF_TRACE_RAW_TP:
3949	case BPF_TRACE_FENTRY:
3950	case BPF_TRACE_FEXIT:
3951	case BPF_MODIFY_RETURN:
3952	return BPF_PROG_TYPE_TRACING;
3953	case BPF_LSM_MAC:
3954	return BPF_PROG_TYPE_LSM;
3955	case BPF_SK_LOOKUP:
3956	return BPF_PROG_TYPE_SK_LOOKUP;
3957	case BPF_XDP:
3958	return BPF_PROG_TYPE_XDP;
3959	case BPF_LSM_CGROUP:
3960	return BPF_PROG_TYPE_LSM;
3961	case BPF_TCX_INGRESS:
3962	case BPF_TCX_EGRESS:
3963	case BPF_NETKIT_PRIMARY:
3964	case BPF_NETKIT_PEER:
3965	return BPF_PROG_TYPE_SCHED_CLS;
3966	default:
3967	return BPF_PROG_TYPE_UNSPEC;
3968	}
3969	}
3970
3971	static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
3972	enum bpf_attach_type attach_type)
3973	{
3974	enum bpf_prog_type ptype;
3975
3976	switch (prog->type) {
3977	case BPF_PROG_TYPE_CGROUP_SOCK:
3978	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
3979	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
3980	case BPF_PROG_TYPE_SK_LOOKUP:
3981	return attach_type == prog->expected_attach_type ? `0` : -EINVAL;
3982	case BPF_PROG_TYPE_CGROUP_SKB:
3983	if (!bpf_token_capable(token: prog->aux->token, CAP_NET_ADMIN))
3984	/ cg-skb progs can be loaded by unpriv user.*
3985	* check permissions at attach time.
3986	*/
3987	return -EPERM;
3988	return prog->enforce_expected_attach_type &&
3989	prog->expected_attach_type != attach_type ?
3990	-EINVAL : `0`;
3991	case BPF_PROG_TYPE_EXT:
3992	return `0`;
3993	case BPF_PROG_TYPE_NETFILTER:
3994	if (attach_type != BPF_NETFILTER)
3995	return -EINVAL;
3996	return `0`;
3997	case BPF_PROG_TYPE_PERF_EVENT:
3998	case BPF_PROG_TYPE_TRACEPOINT:
3999	if (attach_type != BPF_PERF_EVENT)
4000	return -EINVAL;
4001	return `0`;
4002	case BPF_PROG_TYPE_KPROBE:
4003	if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
4004	attach_type != BPF_TRACE_KPROBE_MULTI)
4005	return -EINVAL;
4006	if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
4007	attach_type != BPF_TRACE_UPROBE_MULTI)
4008	return -EINVAL;
4009	if (attach_type != BPF_PERF_EVENT &&
4010	attach_type != BPF_TRACE_KPROBE_MULTI &&
4011	attach_type != BPF_TRACE_UPROBE_MULTI)
4012	return -EINVAL;
4013	return `0`;
4014	case BPF_PROG_TYPE_SCHED_CLS:
4015	if (attach_type != BPF_TCX_INGRESS &&
4016	attach_type != BPF_TCX_EGRESS &&
4017	attach_type != BPF_NETKIT_PRIMARY &&
4018	attach_type != BPF_NETKIT_PEER)
4019	return -EINVAL;
4020	return `0`;
4021	default:
4022	ptype = attach_type_to_prog_type(attach_type);
4023	if (ptype == BPF_PROG_TYPE_UNSPEC \|\| ptype != prog->type)
4024	return -EINVAL;
4025	return `0`;
4026	}
4027	}
4028
4029	#define BPF_PROG_ATTACH_LAST_FIELD expected_revision
4030
4031	#define BPF_F_ATTACH_MASK_BASE \
4032	(BPF_F_ALLOW_OVERRIDE \| \
4033	BPF_F_ALLOW_MULTI \| \
4034	BPF_F_REPLACE)
4035
4036	#define BPF_F_ATTACH_MASK_MPROG \
4037	(BPF_F_REPLACE \| \
4038	BPF_F_BEFORE \| \
4039	BPF_F_AFTER \| \
4040	BPF_F_ID \| \
4041	BPF_F_LINK)
4042
4043	static int bpf_prog_attach(const union bpf_attr *attr)
4044	{
4045	enum bpf_prog_type ptype;
4046	struct bpf_prog *prog;
4047	int ret;
4048
4049	if (CHECK_ATTR(BPF_PROG_ATTACH))
4050	return -EINVAL;
4051
4052	ptype = attach_type_to_prog_type(attach_type: attr->attach_type);
4053	if (ptype == BPF_PROG_TYPE_UNSPEC)
4054	return -EINVAL;
4055	if (bpf_mprog_supported(type: ptype)) {
4056	if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
4057	return -EINVAL;
4058	} else {
4059	if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
4060	return -EINVAL;
4061	if (attr->relative_fd \|\|
4062	attr->expected_revision)
4063	return -EINVAL;
4064	}
4065
4066	prog = bpf_prog_get_type(ufd: attr->attach_bpf_fd, type: ptype);
4067	if (IS_ERR(ptr: prog))
4068	return PTR_ERR(ptr: prog);
4069
4070	if (bpf_prog_attach_check_attach_type(prog, attach_type: attr->attach_type)) {
4071	bpf_prog_put(prog);
4072	return -EINVAL;
4073	}
4074
4075	switch (ptype) {
4076	case BPF_PROG_TYPE_SK_SKB:
4077	case BPF_PROG_TYPE_SK_MSG:
4078	ret = sock_map_get_from_fd(attr, prog);
4079	break;
4080	case BPF_PROG_TYPE_LIRC_MODE2:
4081	ret = lirc_prog_attach(attr, prog);
4082	break;
4083	case BPF_PROG_TYPE_FLOW_DISSECTOR:
4084	ret = netns_bpf_prog_attach(attr, prog);
4085	break;
4086	case BPF_PROG_TYPE_CGROUP_DEVICE:
4087	case BPF_PROG_TYPE_CGROUP_SKB:
4088	case BPF_PROG_TYPE_CGROUP_SOCK:
4089	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4090	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4091	case BPF_PROG_TYPE_CGROUP_SYSCTL:
4092	case BPF_PROG_TYPE_SOCK_OPS:
4093	case BPF_PROG_TYPE_LSM:
4094	if (ptype == BPF_PROG_TYPE_LSM &&
4095	prog->expected_attach_type != BPF_LSM_CGROUP)
4096	ret = -EINVAL;
4097	else
4098	ret = cgroup_bpf_prog_attach(attr, ptype, prog);
4099	break;
4100	case BPF_PROG_TYPE_SCHED_CLS:
4101	if (attr->attach_type == BPF_TCX_INGRESS \|\|
4102	attr->attach_type == BPF_TCX_EGRESS)
4103	ret = tcx_prog_attach(attr, prog);
4104	else
4105	ret = netkit_prog_attach(attr, prog);
4106	break;
4107	default:
4108	ret = -EINVAL;
4109	}
4110
4111	if (ret)
4112	bpf_prog_put(prog);
4113	return ret;
4114	}
4115
4116	#define BPF_PROG_DETACH_LAST_FIELD expected_revision
4117
4118	static int bpf_prog_detach(const union bpf_attr *attr)
4119	{
4120	struct bpf_prog *prog = NULL;
4121	enum bpf_prog_type ptype;
4122	int ret;
4123
4124	if (CHECK_ATTR(BPF_PROG_DETACH))
4125	return -EINVAL;
4126
4127	ptype = attach_type_to_prog_type(attach_type: attr->attach_type);
4128	if (bpf_mprog_supported(type: ptype)) {
4129	if (ptype == BPF_PROG_TYPE_UNSPEC)
4130	return -EINVAL;
4131	if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
4132	return -EINVAL;
4133	if (attr->attach_bpf_fd) {
4134	prog = bpf_prog_get_type(ufd: attr->attach_bpf_fd, type: ptype);
4135	if (IS_ERR(ptr: prog))
4136	return PTR_ERR(ptr: prog);
4137	}
4138	} else if (attr->attach_flags \|\|
4139	attr->relative_fd \|\|
4140	attr->expected_revision) {
4141	return -EINVAL;
4142	}
4143
4144	switch (ptype) {
4145	case BPF_PROG_TYPE_SK_MSG:
4146	case BPF_PROG_TYPE_SK_SKB:
4147	ret = sock_map_prog_detach(attr, ptype);
4148	break;
4149	case BPF_PROG_TYPE_LIRC_MODE2:
4150	ret = lirc_prog_detach(attr);
4151	break;
4152	case BPF_PROG_TYPE_FLOW_DISSECTOR:
4153	ret = netns_bpf_prog_detach(attr, ptype);
4154	break;
4155	case BPF_PROG_TYPE_CGROUP_DEVICE:
4156	case BPF_PROG_TYPE_CGROUP_SKB:
4157	case BPF_PROG_TYPE_CGROUP_SOCK:
4158	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
4159	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
4160	case BPF_PROG_TYPE_CGROUP_SYSCTL:
4161	case BPF_PROG_TYPE_SOCK_OPS:
4162	case BPF_PROG_TYPE_LSM:
4163	ret = cgroup_bpf_prog_detach(attr, ptype);
4164	break;
4165	case BPF_PROG_TYPE_SCHED_CLS:
4166	if (attr->attach_type == BPF_TCX_INGRESS \|\|
4167	attr->attach_type == BPF_TCX_EGRESS)
4168	ret = tcx_prog_detach(attr, prog);
4169	else
4170	ret = netkit_prog_detach(attr, prog);
4171	break;
4172	default:
4173	ret = -EINVAL;
4174	}
4175
4176	if (prog)
4177	bpf_prog_put(prog);
4178	return ret;
4179	}
4180
4181	#define BPF_PROG_QUERY_LAST_FIELD query.revision
4182
4183	static int bpf_prog_query(const union bpf_attr *attr,
4184	union bpf_attr __user *uattr)
4185	{
4186	if (!bpf_net_capable())
4187	return -EPERM;
4188	if (CHECK_ATTR(BPF_PROG_QUERY))
4189	return -EINVAL;
4190	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
4191	return -EINVAL;
4192
4193	switch (attr->query.attach_type) {
4194	case BPF_CGROUP_INET_INGRESS:
4195	case BPF_CGROUP_INET_EGRESS:
4196	case BPF_CGROUP_INET_SOCK_CREATE:
4197	case BPF_CGROUP_INET_SOCK_RELEASE:
4198	case BPF_CGROUP_INET4_BIND:
4199	case BPF_CGROUP_INET6_BIND:
4200	case BPF_CGROUP_INET4_POST_BIND:
4201	case BPF_CGROUP_INET6_POST_BIND:
4202	case BPF_CGROUP_INET4_CONNECT:
4203	case BPF_CGROUP_INET6_CONNECT:
4204	case BPF_CGROUP_UNIX_CONNECT:
4205	case BPF_CGROUP_INET4_GETPEERNAME:
4206	case BPF_CGROUP_INET6_GETPEERNAME:
4207	case BPF_CGROUP_UNIX_GETPEERNAME:
4208	case BPF_CGROUP_INET4_GETSOCKNAME:
4209	case BPF_CGROUP_INET6_GETSOCKNAME:
4210	case BPF_CGROUP_UNIX_GETSOCKNAME:
4211	case BPF_CGROUP_UDP4_SENDMSG:
4212	case BPF_CGROUP_UDP6_SENDMSG:
4213	case BPF_CGROUP_UNIX_SENDMSG:
4214	case BPF_CGROUP_UDP4_RECVMSG:
4215	case BPF_CGROUP_UDP6_RECVMSG:
4216	case BPF_CGROUP_UNIX_RECVMSG:
4217	case BPF_CGROUP_SOCK_OPS:
4218	case BPF_CGROUP_DEVICE:
4219	case BPF_CGROUP_SYSCTL:
4220	case BPF_CGROUP_GETSOCKOPT:
4221	case BPF_CGROUP_SETSOCKOPT:
4222	case BPF_LSM_CGROUP:
4223	return cgroup_bpf_prog_query(attr, uattr);
4224	case BPF_LIRC_MODE2:
4225	return lirc_prog_query(attr, uattr);
4226	case BPF_FLOW_DISSECTOR:
4227	case BPF_SK_LOOKUP:
4228	return netns_bpf_prog_query(attr, uattr);
4229	case BPF_SK_SKB_STREAM_PARSER:
4230	case BPF_SK_SKB_STREAM_VERDICT:
4231	case BPF_SK_MSG_VERDICT:
4232	case BPF_SK_SKB_VERDICT:
4233	return sock_map_bpf_prog_query(attr, uattr);
4234	case BPF_TCX_INGRESS:
4235	case BPF_TCX_EGRESS:
4236	return tcx_prog_query(attr, uattr);
4237	case BPF_NETKIT_PRIMARY:
4238	case BPF_NETKIT_PEER:
4239	return netkit_prog_query(attr, uattr);
4240	default:
4241	return -EINVAL;
4242	}
4243	}
4244
4245	#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
4246
4247	static int bpf_prog_test_run(const union bpf_attr *attr,
4248	union bpf_attr __user *uattr)
4249	{
4250	struct bpf_prog *prog;
4251	int ret = -ENOTSUPP;
4252
4253	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
4254	return -EINVAL;
4255
4256	if ((attr->test.ctx_size_in && !attr->test.ctx_in) \|\|
4257	(!attr->test.ctx_size_in && attr->test.ctx_in))
4258	return -EINVAL;
4259
4260	if ((attr->test.ctx_size_out && !attr->test.ctx_out) \|\|
4261	(!attr->test.ctx_size_out && attr->test.ctx_out))
4262	return -EINVAL;
4263
4264	prog = bpf_prog_get(ufd: attr->test.prog_fd);
4265	if (IS_ERR(ptr: prog))
4266	return PTR_ERR(ptr: prog);
4267
4268	if (prog->aux->ops->test_run)
4269	ret = prog->aux->ops->test_run(prog, attr, uattr);
4270
4271	bpf_prog_put(prog);
4272	return ret;
4273	}
4274
4275	#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
4276
4277	static int bpf_obj_get_next_id(const union bpf_attr *attr,
4278	union bpf_attr __user *uattr,
4279	struct idr *idr,
4280	spinlock_t *lock)
4281	{
4282	u32 next_id = attr->start_id;
4283	int err = `0`;
4284
4285	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) \|\| next_id >= INT_MAX)
4286	return -EINVAL;
4287
4288	if (!capable(CAP_SYS_ADMIN))
4289	return -EPERM;
4290
4291	next_id++;
4292	spin_lock_bh(lock);
4293	if (!idr_get_next(idr, nextid: &next_id))
4294	err = -ENOENT;
4295	spin_unlock_bh(lock);
4296
4297	if (!err)
4298	err = put_user(next_id, &uattr->next_id);
4299
4300	return err;
4301	}
4302
4303	struct bpf_map bpf_map_get_curr_or_next(u32 id)
4304	{
4305	struct bpf_map *map;
4306
4307	spin_lock_bh(lock: &map_idr_lock);
4308	again:
4309	map = idr_get_next(&map_idr, nextid: id);
4310	if (map) {
4311	map = __bpf_map_inc_not_zero(map, uref: false);
4312	if (IS_ERR(ptr: map)) {
4313	(*id)++;
4314	goto again;
4315	}
4316	}
4317	spin_unlock_bh(lock: &map_idr_lock);
4318
4319	return map;
4320	}
4321
4322	struct bpf_prog bpf_prog_get_curr_or_next(u32 id)
4323	{
4324	struct bpf_prog *prog;
4325
4326	spin_lock_bh(lock: &prog_idr_lock);
4327	again:
4328	prog = idr_get_next(&prog_idr, nextid: id);
4329	if (prog) {
4330	prog = bpf_prog_inc_not_zero(prog);
4331	if (IS_ERR(ptr: prog)) {
4332	(*id)++;
4333	goto again;
4334	}
4335	}
4336	spin_unlock_bh(lock: &prog_idr_lock);
4337
4338	return prog;
4339	}
4340
4341	#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
4342
4343	struct bpf_prog *bpf_prog_by_id(u32 id)
4344	{
4345	struct bpf_prog *prog;
4346
4347	if (!id)
4348	return ERR_PTR(error: -ENOENT);
4349
4350	spin_lock_bh(lock: &prog_idr_lock);
4351	prog = idr_find(&prog_idr, id);
4352	if (prog)
4353	prog = bpf_prog_inc_not_zero(prog);
4354	else
4355	prog = ERR_PTR(error: -ENOENT);
4356	spin_unlock_bh(lock: &prog_idr_lock);
4357	return prog;
4358	}
4359
4360	static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
4361	{
4362	struct bpf_prog *prog;
4363	u32 id = attr->prog_id;
4364	int fd;
4365
4366	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
4367	return -EINVAL;
4368
4369	if (!capable(CAP_SYS_ADMIN))
4370	return -EPERM;
4371
4372	prog = bpf_prog_by_id(id);
4373	if (IS_ERR(ptr: prog))
4374	return PTR_ERR(ptr: prog);
4375
4376	fd = bpf_prog_new_fd(prog);
4377	if (fd < `0`)
4378	bpf_prog_put(prog);
4379
4380	return fd;
4381	}
4382
4383	#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
4384
4385	static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
4386	{
4387	struct bpf_map *map;
4388	u32 id = attr->map_id;
4389	int f_flags;
4390	int fd;
4391
4392	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) \|\|
4393	attr->open_flags & ~BPF_OBJ_FLAG_MASK)
4394	return -EINVAL;
4395
4396	if (!capable(CAP_SYS_ADMIN))
4397	return -EPERM;
4398
4399	f_flags = bpf_get_file_flag(flags: attr->open_flags);
4400	if (f_flags < `0`)
4401	return f_flags;
4402
4403	spin_lock_bh(lock: &map_idr_lock);
4404	map = idr_find(&map_idr, id);
4405	if (map)
4406	map = __bpf_map_inc_not_zero(map, uref: true);
4407	else
4408	map = ERR_PTR(error: -ENOENT);
4409	spin_unlock_bh(lock: &map_idr_lock);
4410
4411	if (IS_ERR(ptr: map))
4412	return PTR_ERR(ptr: map);
4413
4414	fd = bpf_map_new_fd(map, flags: f_flags);
4415	if (fd < `0`)
4416	bpf_map_put_with_uref(map);
4417
4418	return fd;
4419	}
4420
4421	static const struct bpf_map bpf_map_from_imm(const* struct bpf_prog *prog,
4422	unsigned long addr, u32 *off,
4423	u32 *type)
4424	{
4425	const struct bpf_map *map;
4426	int i;
4427
4428	mutex_lock(&prog->aux->used_maps_mutex);
4429	for (i = `0`, *off = `0`; i < prog->aux->used_map_cnt; i++) {
4430	map = prog->aux->used_maps[i];
4431	if (map == (void *)addr) {
4432	*type = BPF_PSEUDO_MAP_FD;
4433	goto out;
4434	}
4435	if (!map->ops->map_direct_value_meta)
4436	continue;
4437	if (!map->ops->map_direct_value_meta(map, addr, off)) {
4438	*type = BPF_PSEUDO_MAP_VALUE;
4439	goto out;
4440	}
4441	}
4442	map = NULL;
4443
4444	out:
4445	mutex_unlock(lock: &prog->aux->used_maps_mutex);
4446	return map;
4447	}
4448
4449	static struct bpf_insn bpf_insn_prepare_dump(const* struct bpf_prog *prog,
4450	const struct cred *f_cred)
4451	{
4452	const struct bpf_map *map;
4453	struct bpf_insn *insns;
4454	u32 off, type;
4455	u64 imm;
4456	u8 code;
4457	int i;
4458
4459	insns = kmemdup(p: prog->insnsi, size: bpf_prog_insn_size(prog),
4460	GFP_USER);
4461	if (!insns)
4462	return insns;
4463
4464	for (i = `0`; i < prog->len; i++) {
4465	code = insns[i].code;
4466
4467	if (code == (BPF_JMP \| BPF_TAIL_CALL)) {
4468	insns[i].code = BPF_JMP \| BPF_CALL;
4469	insns[i].imm = BPF_FUNC_tail_call;
4470	/ fall-through /
4471	}
4472	if (code == (BPF_JMP \| BPF_CALL) \|\|
4473	code == (BPF_JMP \| BPF_CALL_ARGS)) {
4474	if (code == (BPF_JMP \| BPF_CALL_ARGS))
4475	insns[i].code = BPF_JMP \| BPF_CALL;
4476	if (!bpf_dump_raw_ok(cred: f_cred))
4477	insns[i].imm = `0`;
4478	continue;
4479	}
4480	if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
4481	insns[i].code = BPF_LDX \| BPF_SIZE(code) \| BPF_MEM;
4482	continue;
4483	}
4484
4485	if ((BPF_CLASS(code) == BPF_LDX \|\| BPF_CLASS(code) == BPF_STX \|\|
4486	BPF_CLASS(code) == BPF_ST) && BPF_MODE(code) == BPF_PROBE_MEM32) {
4487	insns[i].code = BPF_CLASS(code) \| BPF_SIZE(code) \| BPF_MEM;
4488	continue;
4489	}
4490
4491	if (code != (BPF_LD \| BPF_IMM \| BPF_DW))
4492	continue;
4493
4494	imm = ((u64)insns[i + `1`].imm << `32`) \| (u32)insns[i].imm;
4495	map = bpf_map_from_imm(prog, addr: imm, off: &off, type: &type);
4496	if (map) {
4497	insns[i].src_reg = type;
4498	insns[i].imm = map->id;
4499	insns[i + `1`].imm = off;
4500	continue;
4501	}
4502	}
4503
4504	return insns;
4505	}
4506
4507	static int set_info_rec_size(struct bpf_prog_info *info)
4508	{
4509	/*
4510	* Ensure info.*_rec_size is the same as kernel expected size
4511	*
4512	* or
4513	*
4514	* Only allow zero *_rec_size if both _rec_size and _cnt are
4515	* zero. In this case, the kernel will set the expected
4516	* _rec_size back to the info.
4517	*/
4518
4519	if ((info->nr_func_info \|\| info->func_info_rec_size) &&
4520	info->func_info_rec_size != sizeof(struct bpf_func_info))
4521	return -EINVAL;
4522
4523	if ((info->nr_line_info \|\| info->line_info_rec_size) &&
4524	info->line_info_rec_size != sizeof(struct bpf_line_info))
4525	return -EINVAL;
4526
4527	if ((info->nr_jited_line_info \|\| info->jited_line_info_rec_size) &&
4528	info->jited_line_info_rec_size != sizeof(__u64))
4529	return -EINVAL;
4530
4531	info->func_info_rec_size = sizeof(struct bpf_func_info);
4532	info->line_info_rec_size = sizeof(struct bpf_line_info);
4533	info->jited_line_info_rec_size = sizeof(__u64);
4534
4535	return `0`;
4536	}
4537
4538	static int bpf_prog_get_info_by_fd(struct file *file,
4539	struct bpf_prog *prog,
4540	const union bpf_attr *attr,
4541	union bpf_attr __user *uattr)
4542	{
4543	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4544	struct btf *attach_btf = bpf_prog_get_target_btf(prog);
4545	struct bpf_prog_info info;
4546	u32 info_len = attr->info.info_len;
4547	struct bpf_prog_kstats stats;
4548	char __user *uinsns;
4549	u32 ulen;
4550	int err;
4551
4552	err = bpf_check_uarg_tail_zero(uaddr: USER_BPFPTR(p: uinfo), expected_size: sizeof(info), actual_size: info_len);
4553	if (err)
4554	return err;
4555	info_len = min_t(u32, sizeof(info), info_len);
4556
4557	memset(&info, `0`, sizeof(info));
4558	if (copy_from_user(to: &info, from: uinfo, n: info_len))
4559	return -EFAULT;
4560
4561	info.type = prog->type;
4562	info.id = prog->aux->id;
4563	info.load_time = prog->aux->load_time;
4564	info.created_by_uid = from_kuid_munged(current_user_ns(),
4565	uid: prog->aux->user->uid);
4566	info.gpl_compatible = prog->gpl_compatible;
4567
4568	memcpy(info.tag, prog->tag, sizeof(prog->tag));
4569	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
4570
4571	mutex_lock(&prog->aux->used_maps_mutex);
4572	ulen = info.nr_map_ids;
4573	info.nr_map_ids = prog->aux->used_map_cnt;
4574	ulen = min_t(u32, info.nr_map_ids, ulen);
4575	if (ulen) {
4576	u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
4577	u32 i;
4578
4579	for (i = `0`; i < ulen; i++)
4580	if (put_user(prog->aux->used_maps[i]->id,
4581	&user_map_ids[i])) {
4582	mutex_unlock(lock: &prog->aux->used_maps_mutex);
4583	return -EFAULT;
4584	}
4585	}
4586	mutex_unlock(lock: &prog->aux->used_maps_mutex);
4587
4588	err = set_info_rec_size(&info);
4589	if (err)
4590	return err;
4591
4592	bpf_prog_get_stats(prog, stats: &stats);
4593	info.run_time_ns = stats.nsecs;
4594	info.run_cnt = stats.cnt;
4595	info.recursion_misses = stats.misses;
4596
4597	info.verified_insns = prog->aux->verified_insns;
4598
4599	if (!bpf_capable()) {
4600	info.jited_prog_len = `0`;
4601	info.xlated_prog_len = `0`;
4602	info.nr_jited_ksyms = `0`;
4603	info.nr_jited_func_lens = `0`;
4604	info.nr_func_info = `0`;
4605	info.nr_line_info = `0`;
4606	info.nr_jited_line_info = `0`;
4607	goto done;
4608	}
4609
4610	ulen = info.xlated_prog_len;
4611	info.xlated_prog_len = bpf_prog_insn_size(prog);
4612	if (info.xlated_prog_len && ulen) {
4613	struct bpf_insn *insns_sanitized;
4614	bool fault;
4615
4616	if (prog->blinded && !bpf_dump_raw_ok(cred: file->f_cred)) {
4617	info.xlated_prog_insns = `0`;
4618	goto done;
4619	}
4620	insns_sanitized = bpf_insn_prepare_dump(prog, f_cred: file->f_cred);
4621	if (!insns_sanitized)
4622	return -ENOMEM;
4623	uinsns = u64_to_user_ptr(info.xlated_prog_insns);
4624	ulen = min_t(u32, info.xlated_prog_len, ulen);
4625	fault = copy_to_user(to: uinsns, from: insns_sanitized, n: ulen);
4626	kfree(objp: insns_sanitized);
4627	if (fault)
4628	return -EFAULT;
4629	}
4630
4631	if (bpf_prog_is_offloaded(aux: prog->aux)) {
4632	err = bpf_prog_offload_info_fill(info: &info, prog);
4633	if (err)
4634	return err;
4635	goto done;
4636	}
4637
4638	/ NOTE: the following code is supposed to be skipped for offload.*
4639	* bpf_prog_offload_info_fill() is the place to fill similar fields
4640	* for offload.
4641	*/
4642	ulen = info.jited_prog_len;
4643	if (prog->aux->func_cnt) {
4644	u32 i;
4645
4646	info.jited_prog_len = `0`;
4647	for (i = `0`; i < prog->aux->func_cnt; i++)
4648	info.jited_prog_len += prog->aux->func[i]->jited_len;
4649	} else {
4650	info.jited_prog_len = prog->jited_len;
4651	}
4652
4653	if (info.jited_prog_len && ulen) {
4654	if (bpf_dump_raw_ok(cred: file->f_cred)) {
4655	uinsns = u64_to_user_ptr(info.jited_prog_insns);
4656	ulen = min_t(u32, info.jited_prog_len, ulen);
4657
4658	/ for multi-function programs, copy the JITed*
4659	* instructions for all the functions
4660	*/
4661	if (prog->aux->func_cnt) {
4662	u32 len, free, i;
4663	u8 *img;
4664
4665	free = ulen;
4666	for (i = `0`; i < prog->aux->func_cnt; i++) {
4667	len = prog->aux->func[i]->jited_len;
4668	len = min_t(u32, len, free);
4669	img = (u8 *) prog->aux->func[i]->bpf_func;
4670	if (copy_to_user(to: uinsns, from: img, n: len))
4671	return -EFAULT;
4672	uinsns += len;
4673	free -= len;
4674	if (!free)
4675	break;
4676	}
4677	} else {
4678	if (copy_to_user(to: uinsns, from: prog->bpf_func, n: ulen))
4679	return -EFAULT;
4680	}
4681	} else {
4682	info.jited_prog_insns = `0`;
4683	}
4684	}
4685
4686	ulen = info.nr_jited_ksyms;
4687	info.nr_jited_ksyms = prog->aux->func_cnt ? : `1`;
4688	if (ulen) {
4689	if (bpf_dump_raw_ok(cred: file->f_cred)) {
4690	unsigned long ksym_addr;
4691	u64 __user *user_ksyms;
4692	u32 i;
4693
4694	/ copy the address of the kernel symbol*
4695	* corresponding to each function
4696	*/
4697	ulen = min_t(u32, info.nr_jited_ksyms, ulen);
4698	user_ksyms = u64_to_user_ptr(info.jited_ksyms);
4699	if (prog->aux->func_cnt) {
4700	for (i = `0`; i < ulen; i++) {
4701	ksym_addr = (unsigned long)
4702	prog->aux->func[i]->bpf_func;
4703	if (put_user((u64) ksym_addr,
4704	&user_ksyms[i]))
4705	return -EFAULT;
4706	}
4707	} else {
4708	ksym_addr = (unsigned long) prog->bpf_func;
4709	if (put_user((u64) ksym_addr, &user_ksyms[`0`]))
4710	return -EFAULT;
4711	}
4712	} else {
4713	info.jited_ksyms = `0`;
4714	}
4715	}
4716
4717	ulen = info.nr_jited_func_lens;
4718	info.nr_jited_func_lens = prog->aux->func_cnt ? : `1`;
4719	if (ulen) {
4720	if (bpf_dump_raw_ok(cred: file->f_cred)) {
4721	u32 __user *user_lens;
4722	u32 func_len, i;
4723
4724	/ copy the JITed image lengths for each function /
4725	ulen = min_t(u32, info.nr_jited_func_lens, ulen);
4726	user_lens = u64_to_user_ptr(info.jited_func_lens);
4727	if (prog->aux->func_cnt) {
4728	for (i = `0`; i < ulen; i++) {
4729	func_len =
4730	prog->aux->func[i]->jited_len;
4731	if (put_user(func_len, &user_lens[i]))
4732	return -EFAULT;
4733	}
4734	} else {
4735	func_len = prog->jited_len;
4736	if (put_user(func_len, &user_lens[`0`]))
4737	return -EFAULT;
4738	}
4739	} else {
4740	info.jited_func_lens = `0`;
4741	}
4742	}
4743
4744	if (prog->aux->btf)
4745	info.btf_id = btf_obj_id(btf: prog->aux->btf);
4746	info.attach_btf_id = prog->aux->attach_btf_id;
4747	if (attach_btf)
4748	info.attach_btf_obj_id = btf_obj_id(btf: attach_btf);
4749
4750	ulen = info.nr_func_info;
4751	info.nr_func_info = prog->aux->func_info_cnt;
4752	if (info.nr_func_info && ulen) {
4753	char __user *user_finfo;
4754
4755	user_finfo = u64_to_user_ptr(info.func_info);
4756	ulen = min_t(u32, info.nr_func_info, ulen);
4757	if (copy_to_user(to: user_finfo, from: prog->aux->func_info,
4758	n: info.func_info_rec_size * ulen))
4759	return -EFAULT;
4760	}
4761
4762	ulen = info.nr_line_info;
4763	info.nr_line_info = prog->aux->nr_linfo;
4764	if (info.nr_line_info && ulen) {
4765	__u8 __user *user_linfo;
4766
4767	user_linfo = u64_to_user_ptr(info.line_info);
4768	ulen = min_t(u32, info.nr_line_info, ulen);
4769	if (copy_to_user(to: user_linfo, from: prog->aux->linfo,
4770	n: info.line_info_rec_size * ulen))
4771	return -EFAULT;
4772	}
4773
4774	ulen = info.nr_jited_line_info;
4775	if (prog->aux->jited_linfo)
4776	info.nr_jited_line_info = prog->aux->nr_linfo;
4777	else
4778	info.nr_jited_line_info = `0`;
4779	if (info.nr_jited_line_info && ulen) {
4780	if (bpf_dump_raw_ok(cred: file->f_cred)) {
4781	unsigned long line_addr;
4782	__u64 __user *user_linfo;
4783	u32 i;
4784
4785	user_linfo = u64_to_user_ptr(info.jited_line_info);
4786	ulen = min_t(u32, info.nr_jited_line_info, ulen);
4787	for (i = `0`; i < ulen; i++) {
4788	line_addr = (unsigned long)prog->aux->jited_linfo[i];
4789	if (put_user((__u64)line_addr, &user_linfo[i]))
4790	return -EFAULT;
4791	}
4792	} else {
4793	info.jited_line_info = `0`;
4794	}
4795	}
4796
4797	ulen = info.nr_prog_tags;
4798	info.nr_prog_tags = prog->aux->func_cnt ? : `1`;
4799	if (ulen) {
4800	__u8 __user (*user_prog_tags)[BPF_TAG_SIZE];
4801	u32 i;
4802
4803	user_prog_tags = u64_to_user_ptr(info.prog_tags);
4804	ulen = min_t(u32, info.nr_prog_tags, ulen);
4805	if (prog->aux->func_cnt) {
4806	for (i = `0`; i < ulen; i++) {
4807	if (copy_to_user(to: user_prog_tags[i],
4808	from: prog->aux->func[i]->tag,
4809	BPF_TAG_SIZE))
4810	return -EFAULT;
4811	}
4812	} else {
4813	if (copy_to_user(to: user_prog_tags[`0`],
4814	from: prog->tag, BPF_TAG_SIZE))
4815	return -EFAULT;
4816	}
4817	}
4818
4819	done:
4820	if (copy_to_user(to: uinfo, from: &info, n: info_len) \|\|
4821	put_user(info_len, &uattr->info.info_len))
4822	return -EFAULT;
4823
4824	return `0`;
4825	}
4826
4827	static int bpf_map_get_info_by_fd(struct file *file,
4828	struct bpf_map *map,
4829	const union bpf_attr *attr,
4830	union bpf_attr __user *uattr)
4831	{
4832	struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4833	struct bpf_map_info info;
4834	u32 info_len = attr->info.info_len;
4835	int err;
4836
4837	err = bpf_check_uarg_tail_zero(uaddr: USER_BPFPTR(p: uinfo), expected_size: sizeof(info), actual_size: info_len);
4838	if (err)
4839	return err;
4840	info_len = min_t(u32, sizeof(info), info_len);
4841
4842	memset(&info, `0`, sizeof(info));
4843	info.type = map->map_type;
4844	info.id = map->id;
4845	info.key_size = map->key_size;
4846	info.value_size = map->value_size;
4847	info.max_entries = map->max_entries;
4848	info.map_flags = map->map_flags;
4849	info.map_extra = map->map_extra;
4850	memcpy(info.name, map->name, sizeof(map->name));
4851
4852	if (map->btf) {
4853	info.btf_id = btf_obj_id(btf: map->btf);
4854	info.btf_key_type_id = map->btf_key_type_id;
4855	info.btf_value_type_id = map->btf_value_type_id;
4856	}
4857	info.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id;
4858	if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS)
4859	bpf_map_struct_ops_info_fill(info: &info, map);
4860
4861	if (bpf_map_is_offloaded(map)) {
4862	err = bpf_map_offload_info_fill(info: &info, map);
4863	if (err)
4864	return err;
4865	}
4866
4867	if (copy_to_user(to: uinfo, from: &info, n: info_len) \|\|
4868	put_user(info_len, &uattr->info.info_len))
4869	return -EFAULT;
4870
4871	return `0`;
4872	}
4873
4874	static int bpf_btf_get_info_by_fd(struct file *file,
4875	struct btf *btf,
4876	const union bpf_attr *attr,
4877	union bpf_attr __user *uattr)
4878	{
4879	struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4880	u32 info_len = attr->info.info_len;
4881	int err;
4882
4883	err = bpf_check_uarg_tail_zero(uaddr: USER_BPFPTR(p: uinfo), expected_size: sizeof(*uinfo), actual_size: info_len);
4884	if (err)
4885	return err;
4886
4887	return btf_get_info_by_fd(btf, attr, uattr);
4888	}
4889
4890	static int bpf_link_get_info_by_fd(struct file *file,
4891	struct bpf_link *link,
4892	const union bpf_attr *attr,
4893	union bpf_attr __user *uattr)
4894	{
4895	struct bpf_link_info __user *uinfo = u64_to_user_ptr(attr->info.info);
4896	struct bpf_link_info info;
4897	u32 info_len = attr->info.info_len;
4898	int err;
4899
4900	err = bpf_check_uarg_tail_zero(uaddr: USER_BPFPTR(p: uinfo), expected_size: sizeof(info), actual_size: info_len);
4901	if (err)
4902	return err;
4903	info_len = min_t(u32, sizeof(info), info_len);
4904
4905	memset(&info, `0`, sizeof(info));
4906	if (copy_from_user(to: &info, from: uinfo, n: info_len))
4907	return -EFAULT;
4908
4909	info.type = link->type;
4910	info.id = link->id;
4911	if (link->prog)
4912	info.prog_id = link->prog->aux->id;
4913
4914	if (link->ops->fill_link_info) {
4915	err = link->ops->fill_link_info(link, &info);
4916	if (err)
4917	return err;
4918	}
4919
4920	if (copy_to_user(to: uinfo, from: &info, n: info_len) \|\|
4921	put_user(info_len, &uattr->info.info_len))
4922	return -EFAULT;
4923
4924	return `0`;
4925	}
4926
4927
4928	#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
4929
4930	static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
4931	union bpf_attr __user *uattr)
4932	{
4933	int ufd = attr->info.bpf_fd;
4934	struct fd f;
4935	int err;
4936
4937	if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
4938	return -EINVAL;
4939
4940	f = fdget(fd: ufd);
4941	if (!f.file)
4942	return -EBADFD;
4943
4944	if (f.file->f_op == &bpf_prog_fops)
4945	err = bpf_prog_get_info_by_fd(file: f.file, prog: f.file->private_data, attr,
4946	uattr);
4947	else if (f.file->f_op == &bpf_map_fops)
4948	err = bpf_map_get_info_by_fd(file: f.file, map: f.file->private_data, attr,
4949	uattr);
4950	else if (f.file->f_op == &btf_fops)
4951	err = bpf_btf_get_info_by_fd(file: f.file, btf: f.file->private_data, attr, uattr);
4952	else if (f.file->f_op == &bpf_link_fops)
4953	err = bpf_link_get_info_by_fd(file: f.file, link: f.file->private_data,
4954	attr, uattr);
4955	else
4956	err = -EINVAL;
4957
4958	fdput(fd: f);
4959	return err;
4960	}
4961
4962	#define BPF_BTF_LOAD_LAST_FIELD btf_token_fd
4963
4964	static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
4965	{
4966	struct bpf_token *token = NULL;
4967
4968	if (CHECK_ATTR(BPF_BTF_LOAD))
4969	return -EINVAL;
4970
4971	if (attr->btf_flags & ~BPF_F_TOKEN_FD)
4972	return -EINVAL;
4973
4974	if (attr->btf_flags & BPF_F_TOKEN_FD) {
4975	token = bpf_token_get_from_fd(ufd: attr->btf_token_fd);
4976	if (IS_ERR(ptr: token))
4977	return PTR_ERR(ptr: token);
4978	if (!bpf_token_allow_cmd(token, cmd: BPF_BTF_LOAD)) {
4979	bpf_token_put(token);
4980	token = NULL;
4981	}
4982	}
4983
4984	if (!bpf_token_capable(token, CAP_BPF)) {
4985	bpf_token_put(token);
4986	return -EPERM;
4987	}
4988
4989	bpf_token_put(token);
4990
4991	return btf_new_fd(attr, uattr, uattr_sz: uattr_size);
4992	}
4993
4994	#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
4995
4996	static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
4997	{
4998	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
4999	return -EINVAL;
5000
5001	if (!capable(CAP_SYS_ADMIN))
5002	return -EPERM;
5003
5004	return btf_get_fd_by_id(id: attr->btf_id);
5005	}
5006
5007	static int bpf_task_fd_query_copy(const union bpf_attr *attr,
5008	union bpf_attr __user *uattr,
5009	u32 prog_id, u32 fd_type,
5010	const char *buf, u64 probe_offset,
5011	u64 probe_addr)
5012	{
5013	char __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
5014	u32 len = buf ? strlen(buf) : `0`, input_len;
5015	int err = `0`;
5016
5017	if (put_user(len, &uattr->task_fd_query.buf_len))
5018	return -EFAULT;
5019	input_len = attr->task_fd_query.buf_len;
5020	if (input_len && ubuf) {
5021	if (!len) {
5022	/ nothing to copy, just make ubuf NULL terminated /
5023	char zero = `'\0'`;
5024
5025	if (put_user(zero, ubuf))
5026	return -EFAULT;
5027	} else if (input_len >= len + `1`) {
5028	/ ubuf can hold the string with NULL terminator /
5029	if (copy_to_user(to: ubuf, from: buf, n: len + `1`))
5030	return -EFAULT;
5031	} else {
5032	/ ubuf cannot hold the string with NULL terminator,*
5033	* do a partial copy with NULL terminator.
5034	*/
5035	char zero = `'\0'`;
5036
5037	err = -ENOSPC;
5038	if (copy_to_user(to: ubuf, from: buf, n: input_len - `1`))
5039	return -EFAULT;
5040	if (put_user(zero, ubuf + input_len - `1`))
5041	return -EFAULT;
5042	}
5043	}
5044
5045	if (put_user(prog_id, &uattr->task_fd_query.prog_id) \|\|
5046	put_user(fd_type, &uattr->task_fd_query.fd_type) \|\|
5047	put_user(probe_offset, &uattr->task_fd_query.probe_offset) \|\|
5048	put_user(probe_addr, &uattr->task_fd_query.probe_addr))
5049	return -EFAULT;
5050
5051	return err;
5052	}
5053
5054	#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
5055
5056	static int bpf_task_fd_query(const union bpf_attr *attr,
5057	union bpf_attr __user *uattr)
5058	{
5059	pid_t pid = attr->task_fd_query.pid;
5060	u32 fd = attr->task_fd_query.fd;
5061	const struct perf_event *event;
5062	struct task_struct *task;
5063	struct file *file;
5064	int err;
5065
5066	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
5067	return -EINVAL;
5068
5069	if (!capable(CAP_SYS_ADMIN))
5070	return -EPERM;
5071
5072	if (attr->task_fd_query.flags != `0`)
5073	return -EINVAL;
5074
5075	rcu_read_lock();
5076	task = get_pid_task(pid: find_vpid(nr: pid), PIDTYPE_PID);
5077	rcu_read_unlock();
5078	if (!task)
5079	return -ENOENT;
5080
5081	err = `0`;
5082	file = fget_task(task, fd);
5083	put_task_struct(t: task);
5084	if (!file)
5085	return -EBADF;
5086
5087	if (file->f_op == &bpf_link_fops) {
5088	struct bpf_link *link = file->private_data;
5089
5090	if (link->ops == &bpf_raw_tp_link_lops) {
5091	struct bpf_raw_tp_link *raw_tp =
5092	container_of(link, struct bpf_raw_tp_link, link);
5093	struct bpf_raw_event_map *btp = raw_tp->btp;
5094
5095	err = bpf_task_fd_query_copy(attr, uattr,
5096	prog_id: raw_tp->link.prog->aux->id,
5097	fd_type: BPF_FD_TYPE_RAW_TRACEPOINT,
5098	buf: btp->tp->name, probe_offset: `0`, probe_addr: `0`);
5099	goto put_file;
5100	}
5101	goto out_not_supp;
5102	}
5103
5104	event = perf_get_event(file);
5105	if (!IS_ERR(ptr: event)) {
5106	u64 probe_offset, probe_addr;
5107	u32 prog_id, fd_type;
5108	const char *buf;
5109
5110	err = bpf_get_perf_event_info(event, prog_id: &prog_id, fd_type: &fd_type,
5111	buf: &buf, probe_offset: &probe_offset,
5112	probe_addr: &probe_addr, NULL);
5113	if (!err)
5114	err = bpf_task_fd_query_copy(attr, uattr, prog_id,
5115	fd_type, buf,
5116	probe_offset,
5117	probe_addr);
5118	goto put_file;
5119	}
5120
5121	out_not_supp:
5122	err = -ENOTSUPP;
5123	put_file:
5124	fput(file);
5125	return err;
5126	}
5127
5128	#define BPF_MAP_BATCH_LAST_FIELD batch.flags
5129
5130	#define BPF_DO_BATCH(fn, ...) \
5131	do { \
5132	if (!fn) { \
5133	err = -ENOTSUPP; \
5134	goto err_put; \
5135	} \
5136	err = fn(__VA_ARGS__); \
5137	} while (0)
5138
5139	static int bpf_map_do_batch(const union bpf_attr *attr,
5140	union bpf_attr __user *uattr,
5141	int cmd)
5142	{
5143	bool has_read = cmd == BPF_MAP_LOOKUP_BATCH \|\|
5144	cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
5145	bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
5146	struct bpf_map *map;
5147	int err, ufd;
5148	struct fd f;
5149
5150	if (CHECK_ATTR(BPF_MAP_BATCH))
5151	return -EINVAL;
5152
5153	ufd = attr->batch.map_fd;
5154	f = fdget(fd: ufd);
5155	map = __bpf_map_get(f);
5156	if (IS_ERR(ptr: map))
5157	return PTR_ERR(ptr: map);
5158	if (has_write)
5159	bpf_map_write_active_inc(map);
5160	if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
5161	err = -EPERM;
5162	goto err_put;
5163	}
5164	if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
5165	err = -EPERM;
5166	goto err_put;
5167	}
5168
5169	if (cmd == BPF_MAP_LOOKUP_BATCH)
5170	BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
5171	else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
5172	BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
5173	else if (cmd == BPF_MAP_UPDATE_BATCH)
5174	BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
5175	else
5176	BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
5177	err_put:
5178	if (has_write) {
5179	maybe_wait_bpf_programs(map);
5180	bpf_map_write_active_dec(map);
5181	}
5182	fdput(fd: f);
5183	return err;
5184	}
5185
5186	#define BPF_LINK_CREATE_LAST_FIELD link_create.uprobe_multi.pid
5187	static int link_create(union bpf_attr *attr, bpfptr_t uattr)
5188	{
5189	struct bpf_prog *prog;
5190	int ret;
5191
5192	if (CHECK_ATTR(BPF_LINK_CREATE))
5193	return -EINVAL;
5194
5195	if (attr->link_create.attach_type == BPF_STRUCT_OPS)
5196	return bpf_struct_ops_link_create(attr);
5197
5198	prog = bpf_prog_get(ufd: attr->link_create.prog_fd);
5199	if (IS_ERR(ptr: prog))
5200	return PTR_ERR(ptr: prog);
5201
5202	ret = bpf_prog_attach_check_attach_type(prog,
5203	attach_type: attr->link_create.attach_type);
5204	if (ret)
5205	goto out;
5206
5207	switch (prog->type) {
5208	case BPF_PROG_TYPE_CGROUP_SKB:
5209	case BPF_PROG_TYPE_CGROUP_SOCK:
5210	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
5211	case BPF_PROG_TYPE_SOCK_OPS:
5212	case BPF_PROG_TYPE_CGROUP_DEVICE:
5213	case BPF_PROG_TYPE_CGROUP_SYSCTL:
5214	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
5215	ret = cgroup_bpf_link_attach(attr, prog);
5216	break;
5217	case BPF_PROG_TYPE_EXT:
5218	ret = bpf_tracing_prog_attach(prog,
5219	tgt_prog_fd: attr->link_create.target_fd,
5220	btf_id: attr->link_create.target_btf_id,
5221	bpf_cookie: attr->link_create.tracing.cookie);
5222	break;
5223	case BPF_PROG_TYPE_LSM:
5224	case BPF_PROG_TYPE_TRACING:
5225	if (attr->link_create.attach_type != prog->expected_attach_type) {
5226	ret = -EINVAL;
5227	goto out;
5228	}
5229	if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
5230	ret = bpf_raw_tp_link_attach(prog, NULL);
5231	else if (prog->expected_attach_type == BPF_TRACE_ITER)
5232	ret = bpf_iter_link_attach(attr, uattr, prog);
5233	else if (prog->expected_attach_type == BPF_LSM_CGROUP)
5234	ret = cgroup_bpf_link_attach(attr, prog);
5235	else
5236	ret = bpf_tracing_prog_attach(prog,
5237	tgt_prog_fd: attr->link_create.target_fd,
5238	btf_id: attr->link_create.target_btf_id,
5239	bpf_cookie: attr->link_create.tracing.cookie);
5240	break;
5241	case BPF_PROG_TYPE_FLOW_DISSECTOR:
5242	case BPF_PROG_TYPE_SK_LOOKUP:
5243	ret = netns_bpf_link_create(attr, prog);
5244	break;
5245	#ifdef CONFIG_NET
5246	case BPF_PROG_TYPE_XDP:
5247	ret = bpf_xdp_link_attach(attr, prog);
5248	break;
5249	case BPF_PROG_TYPE_SCHED_CLS:
5250	if (attr->link_create.attach_type == BPF_TCX_INGRESS \|\|
5251	attr->link_create.attach_type == BPF_TCX_EGRESS)
5252	ret = tcx_link_attach(attr, prog);
5253	else
5254	ret = netkit_link_attach(attr, prog);
5255	break;
5256	case BPF_PROG_TYPE_NETFILTER:
5257	ret = bpf_nf_link_attach(attr, prog);
5258	break;
5259	#endif
5260	case BPF_PROG_TYPE_PERF_EVENT:
5261	case BPF_PROG_TYPE_TRACEPOINT:
5262	ret = bpf_perf_link_attach(attr, prog);
5263	break;
5264	case BPF_PROG_TYPE_KPROBE:
5265	if (attr->link_create.attach_type == BPF_PERF_EVENT)
5266	ret = bpf_perf_link_attach(attr, prog);
5267	else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
5268	ret = bpf_kprobe_multi_link_attach(attr, prog);
5269	else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
5270	ret = bpf_uprobe_multi_link_attach(attr, prog);
5271	break;
5272	default:
5273	ret = -EINVAL;
5274	}
5275
5276	out:
5277	if (ret < `0`)
5278	bpf_prog_put(prog);
5279	return ret;
5280	}
5281
5282	static int link_update_map(struct bpf_link link, union* bpf_attr *attr)
5283	{
5284	struct bpf_map new_map, old_map = NULL;
5285	int ret;
5286
5287	new_map = bpf_map_get(attr->link_update.new_map_fd);
5288	if (IS_ERR(ptr: new_map))
5289	return PTR_ERR(ptr: new_map);
5290
5291	if (attr->link_update.flags & BPF_F_REPLACE) {
5292	old_map = bpf_map_get(attr->link_update.old_map_fd);
5293	if (IS_ERR(ptr: old_map)) {
5294	ret = PTR_ERR(ptr: old_map);
5295	goto out_put;
5296	}
5297	} else if (attr->link_update.old_map_fd) {
5298	ret = -EINVAL;
5299	goto out_put;
5300	}
5301
5302	ret = link->ops->update_map(link, new_map, old_map);
5303
5304	if (old_map)
5305	bpf_map_put(old_map);
5306	out_put:
5307	bpf_map_put(new_map);
5308	return ret;
5309	}
5310
5311	#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
5312
5313	static int link_update(union bpf_attr *attr)
5314	{
5315	struct bpf_prog old_prog = NULL, new_prog;
5316	struct bpf_link *link;
5317	u32 flags;
5318	int ret;
5319
5320	if (CHECK_ATTR(BPF_LINK_UPDATE))
5321	return -EINVAL;
5322
5323	flags = attr->link_update.flags;
5324	if (flags & ~BPF_F_REPLACE)
5325	return -EINVAL;
5326
5327	link = bpf_link_get_from_fd(attr->link_update.link_fd);
5328	if (IS_ERR(ptr: link))
5329	return PTR_ERR(ptr: link);
5330
5331	if (link->ops->update_map) {
5332	ret = link_update_map(link, attr);
5333	goto out_put_link;
5334	}
5335
5336	new_prog = bpf_prog_get(ufd: attr->link_update.new_prog_fd);
5337	if (IS_ERR(ptr: new_prog)) {
5338	ret = PTR_ERR(ptr: new_prog);
5339	goto out_put_link;
5340	}
5341
5342	if (flags & BPF_F_REPLACE) {
5343	old_prog = bpf_prog_get(ufd: attr->link_update.old_prog_fd);
5344	if (IS_ERR(ptr: old_prog)) {
5345	ret = PTR_ERR(ptr: old_prog);
5346	old_prog = NULL;
5347	goto out_put_progs;
5348	}
5349	} else if (attr->link_update.old_prog_fd) {
5350	ret = -EINVAL;
5351	goto out_put_progs;
5352	}
5353
5354	if (link->ops->update_prog)
5355	ret = link->ops->update_prog(link, new_prog, old_prog);
5356	else
5357	ret = -EINVAL;
5358
5359	out_put_progs:
5360	if (old_prog)
5361	bpf_prog_put(old_prog);
5362	if (ret)
5363	bpf_prog_put(new_prog);
5364	out_put_link:
5365	bpf_link_put_direct(link);
5366	return ret;
5367	}
5368
5369	#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd
5370
5371	static int link_detach(union bpf_attr *attr)
5372	{
5373	struct bpf_link *link;
5374	int ret;
5375
5376	if (CHECK_ATTR(BPF_LINK_DETACH))
5377	return -EINVAL;
5378
5379	link = bpf_link_get_from_fd(attr->link_detach.link_fd);
5380	if (IS_ERR(ptr: link))
5381	return PTR_ERR(ptr: link);
5382
5383	if (link->ops->detach)
5384	ret = link->ops->detach(link);
5385	else
5386	ret = -EOPNOTSUPP;
5387
5388	bpf_link_put_direct(link);
5389	return ret;
5390	}
5391
5392	static struct bpf_link bpf_link_inc_not_zero(struct* bpf_link *link)
5393	{
5394	return atomic64_fetch_add_unless(v: &link->refcnt, a: `1`, u: `0`) ? link : ERR_PTR(error: -ENOENT);
5395	}
5396
5397	struct bpf_link *bpf_link_by_id(u32 id)
5398	{
5399	struct bpf_link *link;
5400
5401	if (!id)
5402	return ERR_PTR(error: -ENOENT);
5403
5404	spin_lock_bh(lock: &link_idr_lock);
5405	/ before link is "settled", ID is 0, pretend it doesn't exist yet /
5406	link = idr_find(&link_idr, id);
5407	if (link) {
5408	if (link->id)
5409	link = bpf_link_inc_not_zero(link);
5410	else
5411	link = ERR_PTR(error: -EAGAIN);
5412	} else {
5413	link = ERR_PTR(error: -ENOENT);
5414	}
5415	spin_unlock_bh(lock: &link_idr_lock);
5416	return link;
5417	}
5418
5419	struct bpf_link bpf_link_get_curr_or_next(u32 id)
5420	{
5421	struct bpf_link *link;
5422
5423	spin_lock_bh(lock: &link_idr_lock);
5424	again:
5425	link = idr_get_next(&link_idr, nextid: id);
5426	if (link) {
5427	link = bpf_link_inc_not_zero(link);
5428	if (IS_ERR(ptr: link)) {
5429	(*id)++;
5430	goto again;
5431	}
5432	}
5433	spin_unlock_bh(lock: &link_idr_lock);
5434
5435	return link;
5436	}
5437
5438	#define BPF_LINK_GET_FD_BY_ID_LAST_FIELD link_id
5439
5440	static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
5441	{
5442	struct bpf_link *link;
5443	u32 id = attr->link_id;
5444	int fd;
5445
5446	if (CHECK_ATTR(BPF_LINK_GET_FD_BY_ID))
5447	return -EINVAL;
5448
5449	if (!capable(CAP_SYS_ADMIN))
5450	return -EPERM;
5451
5452	link = bpf_link_by_id(id);
5453	if (IS_ERR(ptr: link))
5454	return PTR_ERR(ptr: link);
5455
5456	fd = bpf_link_new_fd(link);
5457	if (fd < `0`)
5458	bpf_link_put_direct(link);
5459
5460	return fd;
5461	}
5462
5463	DEFINE_MUTEX(bpf_stats_enabled_mutex);
5464
5465	static int bpf_stats_release(struct inode inode, struct* file *file)
5466	{
5467	mutex_lock(&bpf_stats_enabled_mutex);
5468	static_key_slow_dec(key: &bpf_stats_enabled_key.key);
5469	mutex_unlock(lock: &bpf_stats_enabled_mutex);
5470	return `0`;
5471	}
5472
5473	static const struct file_operations bpf_stats_fops = {
5474	.release = bpf_stats_release,
5475	};
5476
5477	static int bpf_enable_runtime_stats(void)
5478	{
5479	int fd;
5480
5481	mutex_lock(&bpf_stats_enabled_mutex);
5482
5483	/ Set a very high limit to avoid overflow /
5484	if (static_key_count(key: &bpf_stats_enabled_key.key) > INT_MAX / `2`) {
5485	mutex_unlock(lock: &bpf_stats_enabled_mutex);
5486	return -EBUSY;
5487	}
5488
5489	fd = anon_inode_getfd(name: "bpf-stats", fops: &bpf_stats_fops, NULL, O_CLOEXEC);
5490	if (fd >= `0`)
5491	static_key_slow_inc(key: &bpf_stats_enabled_key.key);
5492
5493	mutex_unlock(lock: &bpf_stats_enabled_mutex);
5494	return fd;
5495	}
5496
5497	#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
5498
5499	static int bpf_enable_stats(union bpf_attr *attr)
5500	{
5501
5502	if (CHECK_ATTR(BPF_ENABLE_STATS))
5503	return -EINVAL;
5504
5505	if (!capable(CAP_SYS_ADMIN))
5506	return -EPERM;
5507
5508	switch (attr->enable_stats.type) {
5509	case BPF_STATS_RUN_TIME:
5510	return bpf_enable_runtime_stats();
5511	default:
5512	break;
5513	}
5514	return -EINVAL;
5515	}
5516
5517	#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags
5518
5519	static int bpf_iter_create(union bpf_attr *attr)
5520	{
5521	struct bpf_link *link;
5522	int err;
5523
5524	if (CHECK_ATTR(BPF_ITER_CREATE))
5525	return -EINVAL;
5526
5527	if (attr->iter_create.flags)
5528	return -EINVAL;
5529
5530	link = bpf_link_get_from_fd(attr->iter_create.link_fd);
5531	if (IS_ERR(ptr: link))
5532	return PTR_ERR(ptr: link);
5533
5534	err = bpf_iter_new_fd(link);
5535	bpf_link_put_direct(link);
5536
5537	return err;
5538	}
5539
5540	#define BPF_PROG_BIND_MAP_LAST_FIELD prog_bind_map.flags
5541
5542	static int bpf_prog_bind_map(union bpf_attr *attr)
5543	{
5544	struct bpf_prog *prog;
5545	struct bpf_map *map;
5546	struct bpf_map used_maps_old, used_maps_new;
5547	int i, ret = `0`;
5548
5549	if (CHECK_ATTR(BPF_PROG_BIND_MAP))
5550	return -EINVAL;
5551
5552	if (attr->prog_bind_map.flags)
5553	return -EINVAL;
5554
5555	prog = bpf_prog_get(ufd: attr->prog_bind_map.prog_fd);
5556	if (IS_ERR(ptr: prog))
5557	return PTR_ERR(ptr: prog);
5558
5559	map = bpf_map_get(attr->prog_bind_map.map_fd);
5560	if (IS_ERR(ptr: map)) {
5561	ret = PTR_ERR(ptr: map);
5562	goto out_prog_put;
5563	}
5564
5565	mutex_lock(&prog->aux->used_maps_mutex);
5566
5567	used_maps_old = prog->aux->used_maps;
5568
5569	for (i = `0`; i < prog->aux->used_map_cnt; i++)
5570	if (used_maps_old[i] == map) {
5571	bpf_map_put(map);
5572	goto out_unlock;
5573	}
5574
5575	used_maps_new = kmalloc_array(n: prog->aux->used_map_cnt + `1`,
5576	size: sizeof(used_maps_new[`0`]),
5577	GFP_KERNEL);
5578	if (!used_maps_new) {
5579	ret = -ENOMEM;
5580	goto out_unlock;
5581	}
5582
5583	/ The bpf program will not access the bpf map, but for the sake of*
5584	* simplicity, increase sleepable_refcnt for sleepable program as well.
5585	*/
5586	if (prog->sleepable)
5587	atomic64_inc(v: &map->sleepable_refcnt);
5588	memcpy(used_maps_new, used_maps_old,
5589	sizeof(used_maps_old[`0`]) * prog->aux->used_map_cnt);
5590	used_maps_new[prog->aux->used_map_cnt] = map;
5591
5592	prog->aux->used_map_cnt++;
5593	prog->aux->used_maps = used_maps_new;
5594
5595	kfree(objp: used_maps_old);
5596
5597	out_unlock:
5598	mutex_unlock(lock: &prog->aux->used_maps_mutex);
5599
5600	if (ret)
5601	bpf_map_put(map);
5602	out_prog_put:
5603	bpf_prog_put(prog);
5604	return ret;
5605	}
5606
5607	#define BPF_TOKEN_CREATE_LAST_FIELD token_create.bpffs_fd
5608
5609	static int token_create(union bpf_attr *attr)
5610	{
5611	if (CHECK_ATTR(BPF_TOKEN_CREATE))
5612	return -EINVAL;
5613
5614	/ no flags are supported yet /
5615	if (attr->token_create.flags)
5616	return -EINVAL;
5617
5618	return bpf_token_create(attr);
5619	}
5620
5621	static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
5622	{
5623	union bpf_attr attr;
5624	int err;
5625
5626	err = bpf_check_uarg_tail_zero(uaddr: uattr, expected_size: sizeof(attr), actual_size: size);
5627	if (err)
5628	return err;
5629	size = min_t(u32, size, sizeof(attr));
5630
5631	/ copy attributes from user space, may be less than sizeof(bpf_attr) /
5632	memset(&attr, `0`, sizeof(attr));
5633	if (copy_from_bpfptr(dst: &attr, src: uattr, size) != `0`)
5634	return -EFAULT;
5635
5636	err = security_bpf(cmd, attr: &attr, size);
5637	if (err < `0`)
5638	return err;
5639
5640	switch (cmd) {
5641	case BPF_MAP_CREATE:
5642	err = map_create(attr: &attr);
5643	break;
5644	case BPF_MAP_LOOKUP_ELEM:
5645	err = map_lookup_elem(attr: &attr);
5646	break;
5647	case BPF_MAP_UPDATE_ELEM:
5648	err = map_update_elem(attr: &attr, uattr);
5649	break;
5650	case BPF_MAP_DELETE_ELEM:
5651	err = map_delete_elem(attr: &attr, uattr);
5652	break;
5653	case BPF_MAP_GET_NEXT_KEY:
5654	err = map_get_next_key(attr: &attr);
5655	break;
5656	case BPF_MAP_FREEZE:
5657	err = map_freeze(attr: &attr);
5658	break;
5659	case BPF_PROG_LOAD:
5660	err = bpf_prog_load(attr: &attr, uattr, uattr_size: size);
5661	break;
5662	case BPF_OBJ_PIN:
5663	err = bpf_obj_pin(attr: &attr);
5664	break;
5665	case BPF_OBJ_GET:
5666	err = bpf_obj_get(attr: &attr);
5667	break;
5668	case BPF_PROG_ATTACH:
5669	err = bpf_prog_attach(attr: &attr);
5670	break;
5671	case BPF_PROG_DETACH:
5672	err = bpf_prog_detach(attr: &attr);
5673	break;
5674	case BPF_PROG_QUERY:
5675	err = bpf_prog_query(attr: &attr, uattr: uattr.user);
5676	break;
5677	case BPF_PROG_TEST_RUN:
5678	err = bpf_prog_test_run(attr: &attr, uattr: uattr.user);
5679	break;
5680	case BPF_PROG_GET_NEXT_ID:
5681	err = bpf_obj_get_next_id(attr: &attr, uattr: uattr.user,
5682	idr: &prog_idr, lock: &prog_idr_lock);
5683	break;
5684	case BPF_MAP_GET_NEXT_ID:
5685	err = bpf_obj_get_next_id(attr: &attr, uattr: uattr.user,
5686	idr: &map_idr, lock: &map_idr_lock);
5687	break;
5688	case BPF_BTF_GET_NEXT_ID:
5689	err = bpf_obj_get_next_id(attr: &attr, uattr: uattr.user,
5690	idr: &btf_idr, lock: &btf_idr_lock);
5691	break;
5692	case BPF_PROG_GET_FD_BY_ID:
5693	err = bpf_prog_get_fd_by_id(attr: &attr);
5694	break;
5695	case BPF_MAP_GET_FD_BY_ID:
5696	err = bpf_map_get_fd_by_id(attr: &attr);
5697	break;
5698	case BPF_OBJ_GET_INFO_BY_FD:
5699	err = bpf_obj_get_info_by_fd(attr: &attr, uattr: uattr.user);
5700	break;
5701	case BPF_RAW_TRACEPOINT_OPEN:
5702	err = bpf_raw_tracepoint_open(attr: &attr);
5703	break;
5704	case BPF_BTF_LOAD:
5705	err = bpf_btf_load(attr: &attr, uattr, uattr_size: size);
5706	break;
5707	case BPF_BTF_GET_FD_BY_ID:
5708	err = bpf_btf_get_fd_by_id(attr: &attr);
5709	break;
5710	case BPF_TASK_FD_QUERY:
5711	err = bpf_task_fd_query(attr: &attr, uattr: uattr.user);
5712	break;
5713	case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
5714	err = map_lookup_and_delete_elem(attr: &attr);
5715	break;
5716	case BPF_MAP_LOOKUP_BATCH:
5717	err = bpf_map_do_batch(attr: &attr, uattr: uattr.user, cmd: BPF_MAP_LOOKUP_BATCH);
5718	break;
5719	case BPF_MAP_LOOKUP_AND_DELETE_BATCH:
5720	err = bpf_map_do_batch(attr: &attr, uattr: uattr.user,
5721	cmd: BPF_MAP_LOOKUP_AND_DELETE_BATCH);
5722	break;
5723	case BPF_MAP_UPDATE_BATCH:
5724	err = bpf_map_do_batch(attr: &attr, uattr: uattr.user, cmd: BPF_MAP_UPDATE_BATCH);
5725	break;
5726	case BPF_MAP_DELETE_BATCH:
5727	err = bpf_map_do_batch(attr: &attr, uattr: uattr.user, cmd: BPF_MAP_DELETE_BATCH);
5728	break;
5729	case BPF_LINK_CREATE:
5730	err = link_create(attr: &attr, uattr);
5731	break;
5732	case BPF_LINK_UPDATE:
5733	err = link_update(attr: &attr);
5734	break;
5735	case BPF_LINK_GET_FD_BY_ID:
5736	err = bpf_link_get_fd_by_id(attr: &attr);
5737	break;
5738	case BPF_LINK_GET_NEXT_ID:
5739	err = bpf_obj_get_next_id(attr: &attr, uattr: uattr.user,
5740	idr: &link_idr, lock: &link_idr_lock);
5741	break;
5742	case BPF_ENABLE_STATS:
5743	err = bpf_enable_stats(attr: &attr);
5744	break;
5745	case BPF_ITER_CREATE:
5746	err = bpf_iter_create(attr: &attr);
5747	break;
5748	case BPF_LINK_DETACH:
5749	err = link_detach(attr: &attr);
5750	break;
5751	case BPF_PROG_BIND_MAP:
5752	err = bpf_prog_bind_map(attr: &attr);
5753	break;
5754	case BPF_TOKEN_CREATE:
5755	err = token_create(attr: &attr);
5756	break;
5757	default:
5758	err = -EINVAL;
5759	break;
5760	}
5761
5762	return err;
5763	}
5764
5765	SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user , uattr, unsigned* int, size)
5766	{
5767	return __sys_bpf(cmd, uattr: USER_BPFPTR(p: uattr), size);
5768	}
5769
5770	static bool syscall_prog_is_valid_access(int off, int size,
5771	enum bpf_access_type type,
5772	const struct bpf_prog *prog,
5773	struct bpf_insn_access_aux *info)
5774	{
5775	if (off < `0` \|\| off >= U16_MAX)
5776	return false;
5777	if (off % size != `0`)
5778	return false;
5779	return true;
5780	}
5781
5782	BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
5783	{
5784	switch (cmd) {
5785	case BPF_MAP_CREATE:
5786	case BPF_MAP_DELETE_ELEM:
5787	case BPF_MAP_UPDATE_ELEM:
5788	case BPF_MAP_FREEZE:
5789	case BPF_MAP_GET_FD_BY_ID:
5790	case BPF_PROG_LOAD:
5791	case BPF_BTF_LOAD:
5792	case BPF_LINK_CREATE:
5793	case BPF_RAW_TRACEPOINT_OPEN:
5794	break;
5795	default:
5796	return -EINVAL;
5797	}
5798	return __sys_bpf(cmd, uattr: KERNEL_BPFPTR(p: attr), size: attr_size);
5799	}
5800
5801
5802	/ To shut up -Wmissing-prototypes.*
5803	* This function is used by the kernel light skeleton
5804	* to load bpf programs when modules are loaded or during kernel boot.
5805	* See tools/lib/bpf/skel_internal.h
5806	*/
5807	int kern_sys_bpf(int cmd, union bpf_attr attr, unsigned* int size);
5808
5809	int kern_sys_bpf(int cmd, union bpf_attr attr, unsigned* int size)
5810	{
5811	struct bpf_prog * __maybe_unused prog;
5812	struct bpf_tramp_run_ctx __maybe_unused run_ctx;
5813
5814	switch (cmd) {
5815	#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
5816	case BPF_PROG_TEST_RUN:
5817	if (attr->test.data_in \|\| attr->test.data_out \|\|
5818	attr->test.ctx_out \|\| attr->test.duration \|\|
5819	attr->test.repeat \|\| attr->test.flags)
5820	return -EINVAL;
5821
5822	prog = bpf_prog_get_type(ufd: attr->test.prog_fd, type: BPF_PROG_TYPE_SYSCALL);
5823	if (IS_ERR(ptr: prog))
5824	return PTR_ERR(ptr: prog);
5825
5826	if (attr->test.ctx_size_in < prog->aux->max_ctx_offset \|\|
5827	attr->test.ctx_size_in > U16_MAX) {
5828	bpf_prog_put(prog);
5829	return -EINVAL;
5830	}
5831
5832	run_ctx.bpf_cookie = `0`;
5833	if (!__bpf_prog_enter_sleepable_recur(prog, run_ctx: &run_ctx)) {
5834	/ recursion detected /
5835	__bpf_prog_exit_sleepable_recur(prog, start: `0`, run_ctx: &run_ctx);
5836	bpf_prog_put(prog);
5837	return -EBUSY;
5838	}
5839	attr->test.retval = bpf_prog_run(prog, ctx: (void ) (long*) attr->test.ctx_in);
5840	__bpf_prog_exit_sleepable_recur(prog, start: `0` / bpf_prog_run does runtime stats /,
5841	run_ctx: &run_ctx);
5842	bpf_prog_put(prog);
5843	return `0`;
5844	#endif
5845	default:
5846	return ____bpf_sys_bpf(cmd, attr, attr_size: size);
5847	}
5848	}
5849	EXPORT_SYMBOL(kern_sys_bpf);
5850
5851	static const struct bpf_func_proto bpf_sys_bpf_proto = {
5852	.func = bpf_sys_bpf,
5853	.gpl_only = false,
5854	.ret_type = RET_INTEGER,
5855	.arg1_type = ARG_ANYTHING,
5856	.arg2_type = ARG_PTR_TO_MEM \| MEM_RDONLY,
5857	.arg3_type = ARG_CONST_SIZE,
5858	};
5859
5860	const struct bpf_func_proto * __weak
5861	tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5862	{
5863	return bpf_base_func_proto(func_id, prog);
5864	}
5865
/* bpf_sys_close helper: close file descriptor @fd via close_fd() and
 * return its result.
 */
BPF_CALL_1(bpf_sys_close, u32, fd)
{
	/* When bpf program calls this helper there should not be
	 * an fdget() without matching completed fdput().
	 * This helper is allowed in the following callchain only:
	 * sys_bpf->prog_test_run->bpf_prog->bpf_sys_close
	 */
	return close_fd(fd);
}
5875
5876	static const struct bpf_func_proto bpf_sys_close_proto = {
5877	.func = bpf_sys_close,
5878	.gpl_only = false,
5879	.ret_type = RET_INTEGER,
5880	.arg1_type = ARG_ANYTHING,
5881	};
5882
5883	BPF_CALL_4(bpf_kallsyms_lookup_name, const char , name, int, name_sz, int, flags, u64 , res)
5884	{
5885	if (flags)
5886	return -EINVAL;
5887
5888	if (name_sz <= `1` \|\| name[name_sz - `1`])
5889	return -EINVAL;
5890
5891	if (!bpf_dump_raw_ok(current_cred()))
5892	return -EPERM;
5893
5894	*res = kallsyms_lookup_name(name);
5895	return *res ? `0` : -ENOENT;
5896	}
5897
5898	static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
5899	.func = bpf_kallsyms_lookup_name,
5900	.gpl_only = false,
5901	.ret_type = RET_INTEGER,
5902	.arg1_type = ARG_PTR_TO_MEM,
5903	.arg2_type = ARG_CONST_SIZE_OR_ZERO,
5904	.arg3_type = ARG_ANYTHING,
5905	.arg4_type = ARG_PTR_TO_LONG,
5906	};
5907
5908	static const struct bpf_func_proto *
5909	syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
5910	{
5911	switch (func_id) {
5912	case BPF_FUNC_sys_bpf:
5913	return !bpf_token_capable(token: prog->aux->token, CAP_PERFMON)
5914	? NULL : &bpf_sys_bpf_proto;
5915	case BPF_FUNC_btf_find_by_name_kind:
5916	return &bpf_btf_find_by_name_kind_proto;
5917	case BPF_FUNC_sys_close:
5918	return &bpf_sys_close_proto;
5919	case BPF_FUNC_kallsyms_lookup_name:
5920	return &bpf_kallsyms_lookup_name_proto;
5921	default:
5922	return tracing_prog_func_proto(func_id, prog);
5923	}
5924	}
5925
5926	const struct bpf_verifier_ops bpf_syscall_verifier_ops = {
5927	.get_func_proto = syscall_prog_func_proto,
5928	.is_valid_access = syscall_prog_is_valid_access,
5929	};
5930
5931	const struct bpf_prog_ops bpf_syscall_prog_ops = {
5932	.test_run = bpf_prog_test_run_syscall,
5933	};
5934
5935	#ifdef CONFIG_SYSCTL
5936	static int bpf_stats_handler(struct ctl_table table, int* write,
5937	void buffer, size_t lenp, loff_t *ppos)
5938	{
5939	struct static_key key = (struct* static_key *)table->data;
5940	static int saved_val;
5941	int val, ret;
5942	struct ctl_table tmp = {
5943	.data = &val,
5944	.maxlen = sizeof(val),
5945	.mode = table->mode,
5946	.extra1 = SYSCTL_ZERO,
5947	.extra2 = SYSCTL_ONE,
5948	};
5949
5950	if (write && !capable(CAP_SYS_ADMIN))
5951	return -EPERM;
5952
5953	mutex_lock(&bpf_stats_enabled_mutex);
5954	val = saved_val;
5955	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5956	if (write && !ret && val != saved_val) {
5957	if (val)
5958	static_key_slow_inc(key);
5959	else
5960	static_key_slow_dec(key);
5961	saved_val = val;
5962	}
5963	mutex_unlock(lock: &bpf_stats_enabled_mutex);
5964	return ret;
5965	}
5966
5967	void __weak unpriv_ebpf_notify(int new_state)
5968	{
5969	}
5970
5971	static int bpf_unpriv_handler(struct ctl_table table, int* write,
5972	void buffer, size_t lenp, loff_t *ppos)
5973	{
5974	int ret, unpriv_enable = (int* *)table->data;
5975	bool locked_state = unpriv_enable == `1`;
5976	struct ctl_table tmp = *table;
5977
5978	if (write && !capable(CAP_SYS_ADMIN))
5979	return -EPERM;
5980
5981	tmp.data = &unpriv_enable;
5982	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
5983	if (write && !ret) {
5984	if (locked_state && unpriv_enable != `1`)
5985	return -EPERM;
5986	(int* *)table->data = unpriv_enable;
5987	}
5988
5989	if (write)
5990	unpriv_ebpf_notify(new_state: unpriv_enable);
5991
5992	return ret;
5993	}
5994
5995	static struct ctl_table bpf_syscall_table[] = {
5996	{
5997	.procname = "unprivileged_bpf_disabled",
5998	.data = &sysctl_unprivileged_bpf_disabled,
5999	.maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
6000	.mode = `0644`,
6001	.proc_handler = bpf_unpriv_handler,
6002	.extra1 = SYSCTL_ZERO,
6003	.extra2 = SYSCTL_TWO,
6004	},
6005	{
6006	.procname = "bpf_stats_enabled",
6007	.data = &bpf_stats_enabled_key.key,
6008	.mode = `0644`,
6009	.proc_handler = bpf_stats_handler,
6010	},
6011	{ }
6012	};
6013
6014	static int __init bpf_syscall_sysctl_init(void)
6015	{
6016	register_sysctl_init("kernel", bpf_syscall_table);
6017	return `0`;
6018	}
6019	late_initcall(bpf_syscall_sysctl_init);
6020	#endif /* CONFIG_SYSCTL */
6021

source code of linux/kernel/bpf/syscall.c