1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
3 | * Copyright (c) 2016 Facebook |
4 | */ |
5 | #include <linux/bpf.h> |
6 | #include <linux/btf.h> |
7 | #include <linux/jhash.h> |
8 | #include <linux/filter.h> |
9 | #include <linux/rculist_nulls.h> |
10 | #include <linux/rcupdate_wait.h> |
11 | #include <linux/random.h> |
12 | #include <uapi/linux/btf.h> |
13 | #include <linux/rcupdate_trace.h> |
14 | #include <linux/btf_ids.h> |
15 | #include "percpu_freelist.h" |
16 | #include "bpf_lru_list.h" |
17 | #include "map_in_map.h" |
18 | #include <linux/bpf_mem_alloc.h> |
19 | |
20 | #define HTAB_CREATE_FLAG_MASK \ |
21 | (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ |
22 | BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) |
23 | |
24 | #define BATCH_OPS(_name) \ |
25 | .map_lookup_batch = \ |
26 | _name##_map_lookup_batch, \ |
27 | .map_lookup_and_delete_batch = \ |
28 | _name##_map_lookup_and_delete_batch, \ |
29 | .map_update_batch = \ |
30 | generic_map_update_batch, \ |
31 | .map_delete_batch = \ |
32 | generic_map_delete_batch |
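
/*
 * For illustration, instantiating the macro above as e.g. BATCH_OPS(htab)
 * expands to the following ops initializers (a sketch of the expansion,
 * shown here only as a comment):
 *
 *	.map_lookup_batch = htab_map_lookup_batch,
 *	.map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
 *	.map_update_batch = generic_map_update_batch,
 *	.map_delete_batch = generic_map_delete_batch
 */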
33 | |
34 | /* |
35 | * The bucket lock has two protection scopes: |
36 | * |
37 | * 1) Serializing concurrent operations from BPF programs on different |
38 | * CPUs |
39 | * |
40 | * 2) Serializing concurrent operations from BPF programs and sys_bpf() |
41 | * |
42 |  * BPF programs can execute in any context including perf, kprobes and |
43 |  * tracing. As there are almost no limits on where perf, kprobes and |
44 |  * tracing can be invoked from, the lock operations need to be protected |
45 |  * against deadlocks. Deadlocks can be caused by recursion and by an |
46 |  * invocation in the lock held section when functions which acquire this |
47 |  * lock are invoked from sys_bpf(). BPF recursion is prevented by |
48 |  * incrementing the per CPU variable bpf_prog_active, which prevents BPF |
49 |  * programs attached to perf events, kprobes and tracing from being |
50 |  * invoked before the prior invocation from one of these contexts has |
51 |  * completed. sys_bpf() uses the same mechanism by pinning the task to the |
52 |  * current CPU and incrementing the recursion protection across the map operation. |
53 | * |
54 | * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain |
55 | * operations like memory allocations (even with GFP_ATOMIC) from atomic |
56 | * contexts. This is required because even with GFP_ATOMIC the memory |
57 | * allocator calls into code paths which acquire locks with long held lock |
58 |  * sections. To ensure deterministic behaviour, these locks are regular |
59 | * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only |
60 | * true atomic contexts on an RT kernel are the low level hardware |
61 | * handling, scheduling, low level interrupt handling, NMIs etc. None of |
62 | * these contexts should ever do memory allocations. |
63 | * |
64 | * As regular device interrupt handlers and soft interrupts are forced into |
65 | * thread context, the existing code which does |
66 | * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*(); |
67 | * just works. |
68 | * |
69 | * In theory the BPF locks could be converted to regular spinlocks as well, |
70 | * but the bucket locks and percpu_freelist locks can be taken from |
71 | * arbitrary contexts (perf, kprobes, tracepoints) which are required to be |
72 |  * atomic contexts even on RT. Before the introduction of bpf_mem_alloc, |
73 |  * it was only safe to use a raw spinlock for a preallocated hash map on an |
74 |  * RT kernel, because there is no memory allocation within the lock held |
75 |  * sections. However, after the hash map was fully converted to use |
76 |  * bpf_mem_alloc, memory is no longer allocated synchronously under the lock |
77 |  * for non-preallocated hash maps either, so it is safe to always use a raw spinlock for the bucket lock. |
78 | */ |
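
/*
 * As a minimal sketch (hypothetical call chain) of the recursion protection
 * described above: if a BPF program runs on a CPU that is already inside a
 * bucket-locked section of the same map, htab_lock_bucket() below sees the
 * per-CPU map_locked counter already elevated and bails out:
 *
 *	htab_map_update_elem()		// from a BPF program or sys_bpf()
 *	  htab_lock_bucket()		// map_locked[hash masked as below] -> 1
 *	    ... NMI/tracing program on the same CPU touches the same map ...
 *	      htab_lock_bucket()	// counter would become 2 -> -EBUSY
 *
 * The counter indexing and the error value are taken from
 * htab_lock_bucket()/htab_unlock_bucket() below.
 */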
79 | struct bucket { |
80 | struct hlist_nulls_head head; |
81 | raw_spinlock_t raw_lock; |
82 | }; |
83 | |
84 | #define HASHTAB_MAP_LOCK_COUNT 8 |
85 | #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) |
86 | |
87 | struct bpf_htab { |
88 | struct bpf_map map; |
89 | struct bpf_mem_alloc ma; |
90 | struct bpf_mem_alloc pcpu_ma; |
91 | struct bucket *buckets; |
92 | void *elems; |
93 | union { |
94 | struct pcpu_freelist freelist; |
95 | struct bpf_lru lru; |
96 | }; |
97 | 	struct htab_elem *__percpu *extra_elems; |
98 | 	/* number of elements in a non-preallocated hashtable is kept |
99 | * in either pcount or count |
100 | */ |
101 | struct percpu_counter pcount; |
102 | atomic_t count; |
103 | bool use_percpu_counter; |
104 | u32 n_buckets; /* number of hash buckets */ |
105 | u32 elem_size; /* size of each element in bytes */ |
106 | u32 hashrnd; |
107 | struct lock_class_key lockdep_key; |
108 | int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; |
109 | }; |
110 | |
111 | /* each htab element is struct htab_elem + key + value */ |
112 | struct htab_elem { |
113 | union { |
114 | struct hlist_nulls_node hash_node; |
115 | struct { |
116 | void *padding; |
117 | union { |
118 | struct pcpu_freelist_node fnode; |
119 | struct htab_elem *batch_flink; |
120 | }; |
121 | }; |
122 | }; |
123 | union { |
124 | /* pointer to per-cpu pointer */ |
125 | void *ptr_to_pptr; |
126 | struct bpf_lru_node lru_node; |
127 | }; |
128 | u32 hash; |
129 | char key[] __aligned(8); |
130 | }; |
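
/*
 * For illustration, a non-percpu element with key_size = 4 and
 * value_size = 12 (hypothetical sizes) is laid out as computed in
 * htab_map_alloc() below:
 *
 *	struct htab_elem	header (list node, hash, ...)
 *	key[0..3] + 4 bytes	key, padded to round_up(key_size, 8) = 8
 *	value[0..11] + 4 bytes	value at key + 8, rounded to 16 bytes
 *
 * so elem_size = sizeof(struct htab_elem) + 8 + 16.  For percpu maps the
 * value area is replaced by a single pointer to the per-CPU storage, see
 * htab_elem_set_ptr()/htab_elem_get_ptr().
 */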
131 | |
132 | static inline bool htab_is_prealloc(const struct bpf_htab *htab) |
133 | { |
134 | return !(htab->map.map_flags & BPF_F_NO_PREALLOC); |
135 | } |
136 | |
137 | static void htab_init_buckets(struct bpf_htab *htab) |
138 | { |
139 | unsigned int i; |
140 | |
141 | for (i = 0; i < htab->n_buckets; i++) { |
142 | INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); |
143 | raw_spin_lock_init(&htab->buckets[i].raw_lock); |
144 | lockdep_set_class(&htab->buckets[i].raw_lock, |
145 | &htab->lockdep_key); |
146 | cond_resched(); |
147 | } |
148 | } |
149 | |
150 | static inline int htab_lock_bucket(const struct bpf_htab *htab, |
151 | struct bucket *b, u32 hash, |
152 | unsigned long *pflags) |
153 | { |
154 | unsigned long flags; |
155 | |
156 | hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); |
157 | |
158 | preempt_disable(); |
159 | local_irq_save(flags); |
160 | if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { |
161 | __this_cpu_dec(*(htab->map_locked[hash])); |
162 | local_irq_restore(flags); |
163 | preempt_enable(); |
164 | return -EBUSY; |
165 | } |
166 | |
167 | raw_spin_lock(&b->raw_lock); |
168 | *pflags = flags; |
169 | |
170 | return 0; |
171 | } |
172 | |
173 | static inline void htab_unlock_bucket(const struct bpf_htab *htab, |
174 | struct bucket *b, u32 hash, |
175 | unsigned long flags) |
176 | { |
177 | hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); |
178 | raw_spin_unlock(&b->raw_lock); |
179 | __this_cpu_dec(*(htab->map_locked[hash])); |
180 | local_irq_restore(flags); |
181 | preempt_enable(); |
182 | } |
183 | |
184 | static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); |
185 | |
186 | static bool htab_is_lru(const struct bpf_htab *htab) |
187 | { |
188 | return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || |
189 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
190 | } |
191 | |
192 | static bool htab_is_percpu(const struct bpf_htab *htab) |
193 | { |
194 | return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || |
195 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
196 | } |
197 | |
198 | static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, |
199 | void __percpu *pptr) |
200 | { |
201 | *(void __percpu **)(l->key + key_size) = pptr; |
202 | } |
203 | |
204 | static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) |
205 | { |
206 | return *(void __percpu **)(l->key + key_size); |
207 | } |
208 | |
209 | static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) |
210 | { |
211 | return *(void **)(l->key + roundup(map->key_size, 8)); |
212 | } |
213 | |
214 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) |
215 | { |
216 | return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); |
217 | } |
218 | |
219 | static bool htab_has_extra_elems(struct bpf_htab *htab) |
220 | { |
221 | return !htab_is_percpu(htab) && !htab_is_lru(htab); |
222 | } |
223 | |
224 | static void htab_free_prealloced_timers(struct bpf_htab *htab) |
225 | { |
226 | u32 num_entries = htab->map.max_entries; |
227 | int i; |
228 | |
229 | 	if (!btf_record_has_field(htab->map.record, BPF_TIMER)) |
230 | return; |
231 | if (htab_has_extra_elems(htab)) |
232 | num_entries += num_possible_cpus(); |
233 | |
234 | for (i = 0; i < num_entries; i++) { |
235 | struct htab_elem *elem; |
236 | |
237 | elem = get_htab_elem(htab, i); |
238 | 		bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); |
239 | cond_resched(); |
240 | } |
241 | } |
242 | |
243 | static void htab_free_prealloced_fields(struct bpf_htab *htab) |
244 | { |
245 | u32 num_entries = htab->map.max_entries; |
246 | int i; |
247 | |
248 | 	if (IS_ERR_OR_NULL(htab->map.record)) |
249 | return; |
250 | if (htab_has_extra_elems(htab)) |
251 | num_entries += num_possible_cpus(); |
252 | for (i = 0; i < num_entries; i++) { |
253 | struct htab_elem *elem; |
254 | |
255 | elem = get_htab_elem(htab, i); |
256 | if (htab_is_percpu(htab)) { |
257 | 			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); |
258 | int cpu; |
259 | |
260 | for_each_possible_cpu(cpu) { |
261 | 				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); |
262 | cond_resched(); |
263 | } |
264 | } else { |
265 | 			bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); |
266 | cond_resched(); |
267 | } |
268 | cond_resched(); |
269 | } |
270 | } |
271 | |
272 | static void htab_free_elems(struct bpf_htab *htab) |
273 | { |
274 | int i; |
275 | |
276 | if (!htab_is_percpu(htab)) |
277 | goto free_elems; |
278 | |
279 | for (i = 0; i < htab->map.max_entries; i++) { |
280 | void __percpu *pptr; |
281 | |
282 | 		pptr = htab_elem_get_ptr(get_htab_elem(htab, i), |
283 | 					 htab->map.key_size); |
284 | 		free_percpu(pptr); |
285 | cond_resched(); |
286 | } |
287 | free_elems: |
288 | 	bpf_map_area_free(htab->elems); |
289 | } |
290 | |
291 | /* The LRU list has a lock (lru_lock). Each htab bucket has a lock |
292 | * (bucket_lock). If both locks need to be acquired together, the lock |
293 | * order is always lru_lock -> bucket_lock and this only happens in |
294 | * bpf_lru_list.c logic. For example, certain code path of |
295 | * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), |
296 | * will acquire lru_lock first followed by acquiring bucket_lock. |
297 | * |
298 | * In hashtab.c, to avoid deadlock, lock acquisition of |
299 | * bucket_lock followed by lru_lock is not allowed. In such cases, |
300 | * bucket_lock needs to be released first before acquiring lru_lock. |
301 | */ |
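
/*
 * A sketch of the only place both locks nest, per the comment above
 * (call chain reconstructed from prealloc_lru_pop() and bpf_lru_list.c):
 *
 *	prealloc_lru_pop()
 *	  bpf_lru_pop_free()		// may take lru_lock
 *	    htab_lru_map_delete_node()	// called back to evict an element
 *	      htab_lock_bucket()	// takes bucket_lock under lru_lock
 */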
302 | static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, |
303 | u32 hash) |
304 | { |
305 | 	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); |
306 | struct htab_elem *l; |
307 | |
308 | if (node) { |
309 | 		bpf_map_inc_elem_count(&htab->map); |
310 | l = container_of(node, struct htab_elem, lru_node); |
311 | memcpy(l->key, key, htab->map.key_size); |
312 | return l; |
313 | } |
314 | |
315 | return NULL; |
316 | } |
317 | |
318 | static int prealloc_init(struct bpf_htab *htab) |
319 | { |
320 | u32 num_entries = htab->map.max_entries; |
321 | int err = -ENOMEM, i; |
322 | |
323 | if (htab_has_extra_elems(htab)) |
324 | num_entries += num_possible_cpus(); |
325 | |
326 | 	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, |
327 | 					 htab->map.numa_node); |
328 | if (!htab->elems) |
329 | return -ENOMEM; |
330 | |
331 | if (!htab_is_percpu(htab)) |
332 | goto skip_percpu_elems; |
333 | |
334 | for (i = 0; i < num_entries; i++) { |
335 | u32 size = round_up(htab->map.value_size, 8); |
336 | void __percpu *pptr; |
337 | |
338 | 		pptr = bpf_map_alloc_percpu(&htab->map, size, 8, |
339 | GFP_USER | __GFP_NOWARN); |
340 | if (!pptr) |
341 | goto free_elems; |
342 | 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, |
343 | pptr); |
344 | cond_resched(); |
345 | } |
346 | |
347 | skip_percpu_elems: |
348 | if (htab_is_lru(htab)) |
349 | 		err = bpf_lru_init(&htab->lru, |
350 | 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU, |
351 | 				   offsetof(struct htab_elem, hash) - |
352 | 				   offsetof(struct htab_elem, lru_node), |
353 | 				   htab_lru_map_delete_node, |
354 | 				   htab); |
355 | else |
356 | err = pcpu_freelist_init(&htab->freelist); |
357 | |
358 | if (err) |
359 | goto free_elems; |
360 | |
361 | if (htab_is_lru(htab)) |
362 | 		bpf_lru_populate(&htab->lru, htab->elems, |
363 | 				 offsetof(struct htab_elem, lru_node), |
364 | 				 htab->elem_size, num_entries); |
365 | else |
366 | 		pcpu_freelist_populate(&htab->freelist, |
367 | 				       htab->elems + offsetof(struct htab_elem, fnode), |
368 | 				       htab->elem_size, num_entries); |
369 | |
370 | return 0; |
371 | |
372 | free_elems: |
373 | htab_free_elems(htab); |
374 | return err; |
375 | } |
376 | |
377 | static void prealloc_destroy(struct bpf_htab *htab) |
378 | { |
379 | htab_free_elems(htab); |
380 | |
381 | if (htab_is_lru(htab)) |
382 | 		bpf_lru_destroy(&htab->lru); |
383 | 	else |
384 | 		pcpu_freelist_destroy(&htab->freelist); |
385 | } |
386 | |
387 | static int alloc_extra_elems(struct bpf_htab *htab) |
388 | { |
389 | struct htab_elem *__percpu *pptr, *l_new; |
390 | struct pcpu_freelist_node *l; |
391 | int cpu; |
392 | |
393 | 	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, |
394 | GFP_USER | __GFP_NOWARN); |
395 | if (!pptr) |
396 | return -ENOMEM; |
397 | |
398 | for_each_possible_cpu(cpu) { |
399 | l = pcpu_freelist_pop(&htab->freelist); |
400 | /* pop will succeed, since prealloc_init() |
401 | * preallocated extra num_possible_cpus elements |
402 | */ |
403 | l_new = container_of(l, struct htab_elem, fnode); |
404 | *per_cpu_ptr(pptr, cpu) = l_new; |
405 | } |
406 | htab->extra_elems = pptr; |
407 | return 0; |
408 | } |
409 | |
410 | /* Called from syscall */ |
411 | static int htab_map_alloc_check(union bpf_attr *attr) |
412 | { |
413 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
414 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
415 | bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || |
416 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
417 | /* percpu_lru means each cpu has its own LRU list. |
418 | * it is different from BPF_MAP_TYPE_PERCPU_HASH where |
419 | * the map's value itself is percpu. percpu_lru has |
420 | * nothing to do with the map's value. |
421 | */ |
422 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); |
423 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); |
424 | bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); |
425 | int numa_node = bpf_map_attr_numa_node(attr); |
426 | |
427 | BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != |
428 | offsetof(struct htab_elem, hash_node.pprev)); |
429 | |
430 | if (zero_seed && !capable(CAP_SYS_ADMIN)) |
431 | /* Guard against local DoS, and discourage production use. */ |
432 | return -EPERM; |
433 | |
434 | if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || |
435 | 	    !bpf_map_flags_access_ok(attr->map_flags)) |
436 | return -EINVAL; |
437 | |
438 | if (!lru && percpu_lru) |
439 | return -EINVAL; |
440 | |
441 | if (lru && !prealloc) |
442 | return -ENOTSUPP; |
443 | |
444 | if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) |
445 | return -EINVAL; |
446 | |
447 | /* check sanity of attributes. |
448 | * value_size == 0 may be allowed in the future to use map as a set |
449 | */ |
450 | if (attr->max_entries == 0 || attr->key_size == 0 || |
451 | attr->value_size == 0) |
452 | return -EINVAL; |
453 | |
454 | if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - |
455 | sizeof(struct htab_elem)) |
456 | /* if key_size + value_size is bigger, the user space won't be |
457 | * able to access the elements via bpf syscall. This check |
458 | * also makes sure that the elem_size doesn't overflow and it's |
459 | * kmalloc-able later in htab_map_update_elem() |
460 | */ |
461 | return -E2BIG; |
462 | |
463 | return 0; |
464 | } |
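
/*
 * For reference, a minimal user space sketch (using libbpf's
 * bpf_map_create(); the map name and sizes are illustrative only) that
 * exercises the checks above.  BPF_F_ZERO_SEED additionally requires
 * CAP_SYS_ADMIN:
 *
 *	LIBBPF_OPTS(bpf_map_create_opts, opts,
 *		    .map_flags = BPF_F_NO_PREALLOC);
 *	int fd = bpf_map_create(BPF_MAP_TYPE_HASH, "example_htab",
 *				sizeof(__u32), sizeof(__u64), 1024, &opts);
 *	if (fd < 0)
 *		// -EINVAL/-E2BIG/-EPERM map to the conditions checked above
 *		handle_error();
 */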
465 | |
466 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) |
467 | { |
468 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
469 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
470 | bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || |
471 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
472 | /* percpu_lru means each cpu has its own LRU list. |
473 | * it is different from BPF_MAP_TYPE_PERCPU_HASH where |
474 | * the map's value itself is percpu. percpu_lru has |
475 | * nothing to do with the map's value. |
476 | */ |
477 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); |
478 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); |
479 | struct bpf_htab *htab; |
480 | int err, i; |
481 | |
482 | 	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); |
483 | 	if (!htab) |
484 | 		return ERR_PTR(-ENOMEM); |
485 | |
486 | 	lockdep_register_key(&htab->lockdep_key); |
487 | |
488 | 	bpf_map_init_from_attr(&htab->map, attr); |
489 | |
490 | if (percpu_lru) { |
491 | /* ensure each CPU's lru list has >=1 elements. |
492 | 		 * while we are at it, make each lru list have the same |
493 | * number of elements. |
494 | */ |
495 | htab->map.max_entries = roundup(attr->max_entries, |
496 | num_possible_cpus()); |
497 | if (htab->map.max_entries < attr->max_entries) |
498 | htab->map.max_entries = rounddown(attr->max_entries, |
499 | num_possible_cpus()); |
500 | } |
501 | |
502 | /* hash table size must be power of 2; roundup_pow_of_two() can overflow |
503 | * into UB on 32-bit arches, so check that first |
504 | */ |
505 | err = -E2BIG; |
506 | if (htab->map.max_entries > 1UL << 31) |
507 | goto free_htab; |
508 | |
509 | htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); |
510 | |
511 | htab->elem_size = sizeof(struct htab_elem) + |
512 | round_up(htab->map.key_size, 8); |
513 | if (percpu) |
514 | htab->elem_size += sizeof(void *); |
515 | else |
516 | htab->elem_size += round_up(htab->map.value_size, 8); |
517 | |
518 | /* check for u32 overflow */ |
519 | if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) |
520 | goto free_htab; |
521 | |
522 | 	err = bpf_map_init_elem_count(&htab->map); |
523 | if (err) |
524 | goto free_htab; |
525 | |
526 | err = -ENOMEM; |
527 | 	htab->buckets = bpf_map_area_alloc(htab->n_buckets * |
528 | 					   sizeof(struct bucket), |
529 | 					   htab->map.numa_node); |
530 | if (!htab->buckets) |
531 | goto free_elem_count; |
532 | |
533 | for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { |
534 | 		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, |
535 | 							   sizeof(int), |
536 | 							   sizeof(int), |
537 | GFP_USER); |
538 | if (!htab->map_locked[i]) |
539 | goto free_map_locked; |
540 | } |
541 | |
542 | if (htab->map.map_flags & BPF_F_ZERO_SEED) |
543 | htab->hashrnd = 0; |
544 | else |
545 | htab->hashrnd = get_random_u32(); |
546 | |
547 | htab_init_buckets(htab); |
548 | |
549 | /* compute_batch_value() computes batch value as num_online_cpus() * 2 |
550 | * and __percpu_counter_compare() needs |
551 | * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() |
552 | * for percpu_counter to be faster than atomic_t. In practice the average bpf |
553 | * hash map size is 10k, which means that a system with 64 cpus will fill |
554 | * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore |
555 | * define our own batch count as 32 then 10k hash map can be filled up to 80%: |
556 | * 10k - 8k > 32 _batch_ * 64 _cpus_ |
557 | * and __percpu_counter_compare() will still be fast. At that point hash map |
558 | * collisions will dominate its performance anyway. Assume that hash map filled |
559 | * to 50+% isn't going to be O(1) and use the following formula to choose |
560 | * between percpu_counter and atomic_t. |
561 | */ |
562 | #define PERCPU_COUNTER_BATCH 32 |
563 | if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) |
564 | htab->use_percpu_counter = true; |
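
	/* Worked example of the check above (illustrative numbers): with
	 * max_entries = 10000 and 64 online CPUs, 10000 / 2 = 5000 is
	 * greater than 64 * 32 = 2048, so the percpu_counter is used; a
	 * small map (say 1024 entries) on the same machine falls back to
	 * the plain atomic_t count.
	 */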
565 | |
566 | if (htab->use_percpu_counter) { |
567 | err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); |
568 | if (err) |
569 | goto free_map_locked; |
570 | } |
571 | |
572 | if (prealloc) { |
573 | err = prealloc_init(htab); |
574 | if (err) |
575 | goto free_map_locked; |
576 | |
577 | if (!percpu && !lru) { |
578 | /* lru itself can remove the least used element, so |
579 | * there is no need for an extra elem during map_update. |
580 | */ |
581 | err = alloc_extra_elems(htab); |
582 | if (err) |
583 | goto free_prealloc; |
584 | } |
585 | } else { |
586 | 		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); |
587 | if (err) |
588 | goto free_map_locked; |
589 | if (percpu) { |
590 | 			err = bpf_mem_alloc_init(&htab->pcpu_ma, |
591 | 						 round_up(htab->map.value_size, 8), true); |
592 | if (err) |
593 | goto free_map_locked; |
594 | } |
595 | } |
596 | |
597 | return &htab->map; |
598 | |
599 | free_prealloc: |
600 | prealloc_destroy(htab); |
601 | free_map_locked: |
602 | if (htab->use_percpu_counter) |
603 | 		percpu_counter_destroy(&htab->pcount); |
604 | 	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) |
605 | 		free_percpu(htab->map_locked[i]); |
606 | 	bpf_map_area_free(htab->buckets); |
607 | 	bpf_mem_alloc_destroy(&htab->pcpu_ma); |
608 | 	bpf_mem_alloc_destroy(&htab->ma); |
609 | free_elem_count: |
610 | 	bpf_map_free_elem_count(&htab->map); |
611 | free_htab: |
612 | 	lockdep_unregister_key(&htab->lockdep_key); |
613 | 	bpf_map_area_free(htab); |
614 | 	return ERR_PTR(err); |
615 | } |
616 | |
617 | static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) |
618 | { |
619 | if (likely(key_len % 4 == 0)) |
620 | 		return jhash2(key, key_len / 4, hashrnd); |
621 | 	return jhash(key, key_len, hashrnd); |
622 | } |
623 | |
624 | static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) |
625 | { |
626 | return &htab->buckets[hash & (htab->n_buckets - 1)]; |
627 | } |
628 | |
629 | static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) |
630 | { |
631 | return &__select_bucket(htab, hash)->head; |
632 | } |
633 | |
634 | /* this lookup function can only be called with bucket lock taken */ |
635 | static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, |
636 | void *key, u32 key_size) |
637 | { |
638 | struct hlist_nulls_node *n; |
639 | struct htab_elem *l; |
640 | |
641 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
642 | 		if (l->hash == hash && !memcmp(&l->key, key, key_size)) |
643 | return l; |
644 | |
645 | return NULL; |
646 | } |
647 | |
648 | /* can be called without the bucket lock. it will repeat the loop in |
649 |  * the unlikely event that elements moved from one bucket into another |
650 |  * while the linked list is being walked |
651 | */ |
652 | static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, |
653 | u32 hash, void *key, |
654 | u32 key_size, u32 n_buckets) |
655 | { |
656 | struct hlist_nulls_node *n; |
657 | struct htab_elem *l; |
658 | |
659 | again: |
660 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
661 | 		if (l->hash == hash && !memcmp(&l->key, key, key_size)) |
662 | return l; |
663 | |
664 | if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) |
665 | goto again; |
666 | |
667 | return NULL; |
668 | } |
669 | |
670 | /* Called from syscall or from eBPF program directly, so |
671 | * arguments have to match bpf_map_lookup_elem() exactly. |
672 | * The return value is adjusted by BPF instructions |
673 | * in htab_map_gen_lookup(). |
674 | */ |
675 | static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) |
676 | { |
677 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
678 | struct hlist_nulls_head *head; |
679 | struct htab_elem *l; |
680 | u32 hash, key_size; |
681 | |
682 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
683 | !rcu_read_lock_bh_held()); |
684 | |
685 | key_size = map->key_size; |
686 | |
687 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
688 | |
689 | head = select_bucket(htab, hash); |
690 | |
691 | 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); |
692 | |
693 | return l; |
694 | } |
695 | |
696 | static void *htab_map_lookup_elem(struct bpf_map *map, void *key) |
697 | { |
698 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
699 | |
700 | if (l) |
701 | return l->key + round_up(map->key_size, 8); |
702 | |
703 | return NULL; |
704 | } |
705 | |
706 | /* inline bpf_map_lookup_elem() call. |
707 | * Instead of: |
708 | * bpf_prog |
709 | * bpf_map_lookup_elem |
710 | * map->ops->map_lookup_elem |
711 | * htab_map_lookup_elem |
712 | * __htab_map_lookup_elem |
713 | * do: |
714 | * bpf_prog |
715 | * __htab_map_lookup_elem |
716 | */ |
717 | static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) |
718 | { |
719 | struct bpf_insn *insn = insn_buf; |
720 | const int ret = BPF_REG_0; |
721 | |
722 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
723 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
724 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
725 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); |
726 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
727 | offsetof(struct htab_elem, key) + |
728 | round_up(map->key_size, 8)); |
729 | return insn - insn_buf; |
730 | } |
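
/*
 * The sequence emitted above corresponds roughly to (pseudo eBPF assembly,
 * shown for illustration; the actual immediate depends on map->key_size):
 *
 *	call __htab_map_lookup_elem	// R0 = elem or NULL
 *	if R0 == 0 goto +1		// miss: leave R0 = NULL
 *	R0 += offsetof(struct htab_elem, key) + round_up(key_size, 8)
 */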
731 | |
732 | static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, |
733 | void *key, const bool mark) |
734 | { |
735 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
736 | |
737 | if (l) { |
738 | if (mark) |
739 | 			bpf_lru_node_set_ref(&l->lru_node); |
740 | return l->key + round_up(map->key_size, 8); |
741 | } |
742 | |
743 | return NULL; |
744 | } |
745 | |
746 | static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) |
747 | { |
748 | 	return __htab_lru_map_lookup_elem(map, key, true); |
749 | } |
750 | |
751 | static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) |
752 | { |
753 | 	return __htab_lru_map_lookup_elem(map, key, false); |
754 | } |
755 | |
756 | static int htab_lru_map_gen_lookup(struct bpf_map *map, |
757 | struct bpf_insn *insn_buf) |
758 | { |
759 | struct bpf_insn *insn = insn_buf; |
760 | const int ret = BPF_REG_0; |
761 | const int ref_reg = BPF_REG_1; |
762 | |
763 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
764 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
765 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
766 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); |
767 | *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, |
768 | offsetof(struct htab_elem, lru_node) + |
769 | offsetof(struct bpf_lru_node, ref)); |
770 | *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); |
771 | *insn++ = BPF_ST_MEM(BPF_B, ret, |
772 | offsetof(struct htab_elem, lru_node) + |
773 | offsetof(struct bpf_lru_node, ref), |
774 | 1); |
775 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
776 | offsetof(struct htab_elem, key) + |
777 | round_up(map->key_size, 8)); |
778 | return insn - insn_buf; |
779 | } |
780 | |
781 | static void check_and_free_fields(struct bpf_htab *htab, |
782 | struct htab_elem *elem) |
783 | { |
784 | if (htab_is_percpu(htab)) { |
785 | 		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); |
786 | int cpu; |
787 | |
788 | for_each_possible_cpu(cpu) |
789 | 			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); |
790 | } else { |
791 | void *map_value = elem->key + round_up(htab->map.key_size, 8); |
792 | |
793 | 		bpf_obj_free_fields(htab->map.record, map_value); |
794 | } |
795 | } |
796 | |
797 | /* It is called from the bpf_lru_list when the LRU needs to delete |
798 | * older elements from the htab. |
799 | */ |
800 | static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) |
801 | { |
802 | struct bpf_htab *htab = arg; |
803 | struct htab_elem *l = NULL, *tgt_l; |
804 | struct hlist_nulls_head *head; |
805 | struct hlist_nulls_node *n; |
806 | unsigned long flags; |
807 | struct bucket *b; |
808 | int ret; |
809 | |
810 | tgt_l = container_of(node, struct htab_elem, lru_node); |
811 | 	b = __select_bucket(htab, tgt_l->hash); |
812 | head = &b->head; |
813 | |
814 | 	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); |
815 | if (ret) |
816 | return false; |
817 | |
818 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
819 | if (l == tgt_l) { |
820 | 			hlist_nulls_del_rcu(&l->hash_node); |
821 | 			check_and_free_fields(htab, l); |
822 | 			bpf_map_dec_elem_count(&htab->map); |
823 | break; |
824 | } |
825 | |
826 | 	htab_unlock_bucket(htab, b, tgt_l->hash, flags); |
827 | |
828 | return l == tgt_l; |
829 | } |
830 | |
831 | /* Called from syscall */ |
832 | static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) |
833 | { |
834 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
835 | struct hlist_nulls_head *head; |
836 | struct htab_elem *l, *next_l; |
837 | u32 hash, key_size; |
838 | int i = 0; |
839 | |
840 | WARN_ON_ONCE(!rcu_read_lock_held()); |
841 | |
842 | key_size = map->key_size; |
843 | |
844 | if (!key) |
845 | goto find_first_elem; |
846 | |
847 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
848 | |
849 | head = select_bucket(htab, hash); |
850 | |
851 | /* lookup the key */ |
852 | 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); |
853 | |
854 | if (!l) |
855 | goto find_first_elem; |
856 | |
857 | /* key was found, get next key in the same bucket */ |
858 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), |
859 | struct htab_elem, hash_node); |
860 | |
861 | if (next_l) { |
862 | /* if next elem in this hash list is non-zero, just return it */ |
863 | memcpy(next_key, next_l->key, key_size); |
864 | return 0; |
865 | } |
866 | |
867 | /* no more elements in this hash list, go to the next bucket */ |
868 | i = hash & (htab->n_buckets - 1); |
869 | i++; |
870 | |
871 | find_first_elem: |
872 | /* iterate over buckets */ |
873 | for (; i < htab->n_buckets; i++) { |
874 | 		head = select_bucket(htab, i); |
875 | |
876 | /* pick first element in the bucket */ |
877 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), |
878 | struct htab_elem, hash_node); |
879 | if (next_l) { |
880 | /* if it's not empty, just return it */ |
881 | memcpy(next_key, next_l->key, key_size); |
882 | return 0; |
883 | } |
884 | } |
885 | |
886 | /* iterated over all buckets and all elements */ |
887 | return -ENOENT; |
888 | } |
889 | |
890 | static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) |
891 | { |
892 | 	check_and_free_fields(htab, l); |
893 | 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) |
894 | 		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); |
895 | 	bpf_mem_cache_free(&htab->ma, l); |
896 | } |
897 | |
898 | static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) |
899 | { |
900 | struct bpf_map *map = &htab->map; |
901 | void *ptr; |
902 | |
903 | if (map->ops->map_fd_put_ptr) { |
904 | ptr = fd_htab_map_get_ptr(map, l); |
905 | map->ops->map_fd_put_ptr(map, ptr, true); |
906 | } |
907 | } |
908 | |
909 | static bool is_map_full(struct bpf_htab *htab) |
910 | { |
911 | if (htab->use_percpu_counter) |
912 | 		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, |
913 | PERCPU_COUNTER_BATCH) >= 0; |
914 | 	return atomic_read(&htab->count) >= htab->map.max_entries; |
915 | } |
916 | |
917 | static void inc_elem_count(struct bpf_htab *htab) |
918 | { |
919 | 	bpf_map_inc_elem_count(&htab->map); |
920 | |
921 | if (htab->use_percpu_counter) |
922 | 		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); |
923 | else |
924 | 		atomic_inc(&htab->count); |
925 | } |
926 | |
927 | static void dec_elem_count(struct bpf_htab *htab) |
928 | { |
929 | 	bpf_map_dec_elem_count(&htab->map); |
930 | |
931 | if (htab->use_percpu_counter) |
932 | 		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); |
933 | else |
934 | 		atomic_dec(&htab->count); |
935 | } |
936 | |
937 | |
938 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) |
939 | { |
940 | htab_put_fd_value(htab, l); |
941 | |
942 | if (htab_is_prealloc(htab)) { |
943 | 		bpf_map_dec_elem_count(&htab->map); |
944 | 		check_and_free_fields(htab, l); |
945 | __pcpu_freelist_push(&htab->freelist, &l->fnode); |
946 | } else { |
947 | dec_elem_count(htab); |
948 | htab_elem_free(htab, l); |
949 | } |
950 | } |
951 | |
952 | static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, |
953 | void *value, bool onallcpus) |
954 | { |
955 | if (!onallcpus) { |
956 | /* copy true value_size bytes */ |
957 | 		copy_map_value(&htab->map, this_cpu_ptr(pptr), value); |
958 | } else { |
959 | u32 size = round_up(htab->map.value_size, 8); |
960 | int off = 0, cpu; |
961 | |
962 | for_each_possible_cpu(cpu) { |
963 | 			copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); |
964 | off += size; |
965 | } |
966 | } |
967 | } |
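
/*
 * A sketch of the onallcpus == true case above: the caller (the per-cpu
 * map update path of the bpf syscall) passes one flat buffer holding a
 * value for every possible CPU, each slot padded to round_up(value_size, 8)
 * bytes, consumed in for_each_possible_cpu() order:
 *
 *	value:  | cpu A value | pad | cpu B value | pad | cpu C value | ...
 *	off:    0              size                2*size
 *
 * so each successive possible CPU receives the slot at value + off.
 */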
968 | |
969 | static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, |
970 | void *value, bool onallcpus) |
971 | { |
972 | /* When not setting the initial value on all cpus, zero-fill element |
973 | * values for other cpus. Otherwise, bpf program has no way to ensure |
974 | * known initial values for cpus other than current one |
975 | * (onallcpus=false always when coming from bpf prog). |
976 | */ |
977 | if (!onallcpus) { |
978 | int current_cpu = raw_smp_processor_id(); |
979 | int cpu; |
980 | |
981 | for_each_possible_cpu(cpu) { |
982 | if (cpu == current_cpu) |
983 | 				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); |
984 | 			else /* Since elem is preallocated, we cannot touch special fields */ |
985 | 				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); |
986 | } |
987 | } else { |
988 | pcpu_copy_value(htab, pptr, value, onallcpus); |
989 | } |
990 | } |
991 | |
992 | static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) |
993 | { |
994 | return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && |
995 | BITS_PER_LONG == 64; |
996 | } |
997 | |
998 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, |
999 | void *value, u32 key_size, u32 hash, |
1000 | bool percpu, bool onallcpus, |
1001 | struct htab_elem *old_elem) |
1002 | { |
1003 | u32 size = htab->map.value_size; |
1004 | bool prealloc = htab_is_prealloc(htab); |
1005 | struct htab_elem *l_new, **pl_new; |
1006 | void __percpu *pptr; |
1007 | |
1008 | if (prealloc) { |
1009 | if (old_elem) { |
1010 | /* if we're updating the existing element, |
1011 | * use per-cpu extra elems to avoid freelist_pop/push |
1012 | */ |
1013 | pl_new = this_cpu_ptr(htab->extra_elems); |
1014 | l_new = *pl_new; |
1015 | 			htab_put_fd_value(htab, old_elem); |
1016 | *pl_new = old_elem; |
1017 | } else { |
1018 | struct pcpu_freelist_node *l; |
1019 | |
1020 | l = __pcpu_freelist_pop(&htab->freelist); |
1021 | if (!l) |
1022 | 				return ERR_PTR(-E2BIG); |
1023 | 			l_new = container_of(l, struct htab_elem, fnode); |
1024 | 			bpf_map_inc_elem_count(&htab->map); |
1025 | } |
1026 | } else { |
1027 | if (is_map_full(htab)) |
1028 | if (!old_elem) |
1029 | /* when map is full and update() is replacing |
1030 | * old element, it's ok to allocate, since |
1031 | * old element will be freed immediately. |
1032 | * Otherwise return an error |
1033 | */ |
1034 | 				return ERR_PTR(-E2BIG); |
1035 | inc_elem_count(htab); |
1036 | 		l_new = bpf_mem_cache_alloc(&htab->ma); |
1037 | if (!l_new) { |
1038 | 			l_new = ERR_PTR(-ENOMEM); |
1039 | goto dec_count; |
1040 | } |
1041 | } |
1042 | |
1043 | memcpy(l_new->key, key, key_size); |
1044 | if (percpu) { |
1045 | if (prealloc) { |
1046 | 			pptr = htab_elem_get_ptr(l_new, key_size); |
1047 | } else { |
1048 | /* alloc_percpu zero-fills */ |
1049 | 			pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); |
1050 | if (!pptr) { |
1051 | 				bpf_mem_cache_free(&htab->ma, l_new); |
1052 | 				l_new = ERR_PTR(-ENOMEM); |
1053 | goto dec_count; |
1054 | } |
1055 | l_new->ptr_to_pptr = pptr; |
1056 | pptr = *(void **)pptr; |
1057 | } |
1058 | |
1059 | pcpu_init_value(htab, pptr, value, onallcpus); |
1060 | |
1061 | if (!prealloc) |
1062 | 			htab_elem_set_ptr(l_new, key_size, pptr); |
1063 | } else if (fd_htab_map_needs_adjust(htab)) { |
1064 | size = round_up(size, 8); |
1065 | memcpy(l_new->key + round_up(key_size, 8), value, size); |
1066 | } else { |
1067 | 		copy_map_value(&htab->map, |
1068 | 			       l_new->key + round_up(key_size, 8), |
1069 | 			       value); |
1070 | } |
1071 | |
1072 | l_new->hash = hash; |
1073 | return l_new; |
1074 | dec_count: |
1075 | dec_elem_count(htab); |
1076 | return l_new; |
1077 | } |
1078 | |
1079 | static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, |
1080 | u64 map_flags) |
1081 | { |
1082 | if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) |
1083 | /* elem already exists */ |
1084 | return -EEXIST; |
1085 | |
1086 | if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) |
1087 | /* elem doesn't exist, cannot update it */ |
1088 | return -ENOENT; |
1089 | |
1090 | return 0; |
1091 | } |
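
/*
 * Summary of the update flags handled above and in the callers, with a
 * hedged user space sketch (libbpf's bpf_map_update_elem(); the map fd,
 * key and value are illustrative):
 *
 *	BPF_ANY      create the element or update an existing one
 *	BPF_NOEXIST  create only; -EEXIST if the key is already present
 *	BPF_EXIST    update only; -ENOENT if the key is absent
 *	BPF_F_LOCK   update under the bpf_spin_lock embedded in the value
 *
 *	err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST);
 */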
1092 | |
1093 | /* Called from syscall or from eBPF program */ |
1094 | static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, |
1095 | u64 map_flags) |
1096 | { |
1097 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1098 | struct htab_elem *l_new = NULL, *l_old; |
1099 | struct hlist_nulls_head *head; |
1100 | unsigned long flags; |
1101 | struct bucket *b; |
1102 | u32 key_size, hash; |
1103 | int ret; |
1104 | |
1105 | if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) |
1106 | /* unknown flags */ |
1107 | return -EINVAL; |
1108 | |
1109 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1110 | !rcu_read_lock_bh_held()); |
1111 | |
1112 | key_size = map->key_size; |
1113 | |
1114 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1115 | |
1116 | b = __select_bucket(htab, hash); |
1117 | head = &b->head; |
1118 | |
1119 | if (unlikely(map_flags & BPF_F_LOCK)) { |
1120 | if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) |
1121 | return -EINVAL; |
1122 | /* find an element without taking the bucket lock */ |
1123 | l_old = lookup_nulls_elem_raw(head, hash, key, key_size, |
1124 | 					      htab->n_buckets); |
1125 | ret = check_flags(htab, l_old, map_flags); |
1126 | if (ret) |
1127 | return ret; |
1128 | if (l_old) { |
1129 | /* grab the element lock and update value in place */ |
1130 | copy_map_value_locked(map, |
1131 | 					      l_old->key + round_up(key_size, 8), |
1132 | 					      value, false); |
1133 | return 0; |
1134 | } |
1135 | /* fall through, grab the bucket lock and lookup again. |
1136 | * 99.9% chance that the element won't be found, |
1137 | * but second lookup under lock has to be done. |
1138 | */ |
1139 | } |
1140 | |
1141 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1142 | if (ret) |
1143 | return ret; |
1144 | |
1145 | l_old = lookup_elem_raw(head, hash, key, key_size); |
1146 | |
1147 | ret = check_flags(htab, l_old, map_flags); |
1148 | if (ret) |
1149 | goto err; |
1150 | |
1151 | if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { |
1152 | /* first lookup without the bucket lock didn't find the element, |
1153 | * but second lookup with the bucket lock found it. |
1154 | * This case is highly unlikely, but has to be dealt with: |
1155 | * grab the element lock in addition to the bucket lock |
1156 | * and update element in place |
1157 | */ |
1158 | copy_map_value_locked(map, |
1159 | 				      l_old->key + round_up(key_size, 8), |
1160 | 				      value, false); |
1161 | ret = 0; |
1162 | goto err; |
1163 | } |
1164 | |
1165 | 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, |
1166 | 				l_old); |
1167 | 	if (IS_ERR(l_new)) { |
1168 | /* all pre-allocated elements are in use or memory exhausted */ |
1169 | 		ret = PTR_ERR(l_new); |
1170 | goto err; |
1171 | } |
1172 | |
1173 | /* add new element to the head of the list, so that |
1174 | * concurrent search will find it before old elem |
1175 | */ |
1176 | 	hlist_nulls_add_head_rcu(&l_new->hash_node, head); |
1177 | 	if (l_old) { |
1178 | 		hlist_nulls_del_rcu(&l_old->hash_node); |
1179 | 		if (!htab_is_prealloc(htab)) |
1180 | 			free_htab_elem(htab, l_old); |
1181 | 		else |
1182 | 			check_and_free_fields(htab, l_old); |
1183 | } |
1184 | ret = 0; |
1185 | err: |
1186 | htab_unlock_bucket(htab, b, hash, flags); |
1187 | return ret; |
1188 | } |
1189 | |
1190 | static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) |
1191 | { |
1192 | check_and_free_fields(htab, elem); |
1193 | 	bpf_map_dec_elem_count(&htab->map); |
1194 | 	bpf_lru_push_free(&htab->lru, &elem->lru_node); |
1195 | } |
1196 | |
1197 | static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, |
1198 | u64 map_flags) |
1199 | { |
1200 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1201 | struct htab_elem *l_new, *l_old = NULL; |
1202 | struct hlist_nulls_head *head; |
1203 | unsigned long flags; |
1204 | struct bucket *b; |
1205 | u32 key_size, hash; |
1206 | int ret; |
1207 | |
1208 | if (unlikely(map_flags > BPF_EXIST)) |
1209 | /* unknown flags */ |
1210 | return -EINVAL; |
1211 | |
1212 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1213 | !rcu_read_lock_bh_held()); |
1214 | |
1215 | key_size = map->key_size; |
1216 | |
1217 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1218 | |
1219 | b = __select_bucket(htab, hash); |
1220 | head = &b->head; |
1221 | |
1222 | /* For LRU, we need to alloc before taking bucket's |
1223 | * spinlock because getting free nodes from LRU may need |
1224 | * to remove older elements from htab and this removal |
1225 | * operation will need a bucket lock. |
1226 | */ |
1227 | l_new = prealloc_lru_pop(htab, key, hash); |
1228 | if (!l_new) |
1229 | return -ENOMEM; |
1230 | 	copy_map_value(&htab->map, |
1231 | 		       l_new->key + round_up(map->key_size, 8), value); |
1232 | |
1233 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1234 | if (ret) |
1235 | goto err_lock_bucket; |
1236 | |
1237 | l_old = lookup_elem_raw(head, hash, key, key_size); |
1238 | |
1239 | ret = check_flags(htab, l_old, map_flags); |
1240 | if (ret) |
1241 | goto err; |
1242 | |
1243 | /* add new element to the head of the list, so that |
1244 | * concurrent search will find it before old elem |
1245 | */ |
1246 | 	hlist_nulls_add_head_rcu(&l_new->hash_node, head); |
1247 | 	if (l_old) { |
1248 | 		bpf_lru_node_set_ref(&l_new->lru_node); |
1249 | 		hlist_nulls_del_rcu(&l_old->hash_node); |
1250 | } |
1251 | ret = 0; |
1252 | |
1253 | err: |
1254 | htab_unlock_bucket(htab, b, hash, flags); |
1255 | |
1256 | err_lock_bucket: |
1257 | if (ret) |
1258 | 		htab_lru_push_free(htab, l_new); |
1259 | else if (l_old) |
1260 | 		htab_lru_push_free(htab, l_old); |
1261 | |
1262 | return ret; |
1263 | } |
1264 | |
1265 | static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, |
1266 | void *value, u64 map_flags, |
1267 | bool onallcpus) |
1268 | { |
1269 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1270 | struct htab_elem *l_new = NULL, *l_old; |
1271 | struct hlist_nulls_head *head; |
1272 | unsigned long flags; |
1273 | struct bucket *b; |
1274 | u32 key_size, hash; |
1275 | int ret; |
1276 | |
1277 | if (unlikely(map_flags > BPF_EXIST)) |
1278 | /* unknown flags */ |
1279 | return -EINVAL; |
1280 | |
1281 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1282 | !rcu_read_lock_bh_held()); |
1283 | |
1284 | key_size = map->key_size; |
1285 | |
1286 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1287 | |
1288 | b = __select_bucket(htab, hash); |
1289 | head = &b->head; |
1290 | |
1291 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1292 | if (ret) |
1293 | return ret; |
1294 | |
1295 | l_old = lookup_elem_raw(head, hash, key, key_size); |
1296 | |
1297 | ret = check_flags(htab, l_old, map_flags); |
1298 | if (ret) |
1299 | goto err; |
1300 | |
1301 | if (l_old) { |
1302 | /* per-cpu hash map can update value in-place */ |
1303 | 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), |
1304 | value, onallcpus); |
1305 | } else { |
1306 | l_new = alloc_htab_elem(htab, key, value, key_size, |
1307 | 					hash, true, onallcpus, NULL); |
1308 | 		if (IS_ERR(l_new)) { |
1309 | 			ret = PTR_ERR(l_new); |
1310 | goto err; |
1311 | } |
1312 | 		hlist_nulls_add_head_rcu(&l_new->hash_node, head); |
1313 | } |
1314 | ret = 0; |
1315 | err: |
1316 | htab_unlock_bucket(htab, b, hash, flags); |
1317 | return ret; |
1318 | } |
1319 | |
1320 | static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, |
1321 | void *value, u64 map_flags, |
1322 | bool onallcpus) |
1323 | { |
1324 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1325 | struct htab_elem *l_new = NULL, *l_old; |
1326 | struct hlist_nulls_head *head; |
1327 | unsigned long flags; |
1328 | struct bucket *b; |
1329 | u32 key_size, hash; |
1330 | int ret; |
1331 | |
1332 | if (unlikely(map_flags > BPF_EXIST)) |
1333 | /* unknown flags */ |
1334 | return -EINVAL; |
1335 | |
1336 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1337 | !rcu_read_lock_bh_held()); |
1338 | |
1339 | key_size = map->key_size; |
1340 | |
1341 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1342 | |
1343 | b = __select_bucket(htab, hash); |
1344 | head = &b->head; |
1345 | |
1346 | /* For LRU, we need to alloc before taking bucket's |
1347 | * spinlock because LRU's elem alloc may need |
1348 | * to remove older elem from htab and this removal |
1349 | * operation will need a bucket lock. |
1350 | */ |
1351 | if (map_flags != BPF_EXIST) { |
1352 | l_new = prealloc_lru_pop(htab, key, hash); |
1353 | if (!l_new) |
1354 | return -ENOMEM; |
1355 | } |
1356 | |
1357 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1358 | if (ret) |
1359 | goto err_lock_bucket; |
1360 | |
1361 | l_old = lookup_elem_raw(head, hash, key, key_size); |
1362 | |
1363 | ret = check_flags(htab, l_old, map_flags); |
1364 | if (ret) |
1365 | goto err; |
1366 | |
1367 | if (l_old) { |
1368 | 		bpf_lru_node_set_ref(&l_old->lru_node); |
1369 | |
1370 | /* per-cpu hash map can update value in-place */ |
1371 | 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), |
1372 | value, onallcpus); |
1373 | } else { |
1374 | 		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), |
1375 | value, onallcpus); |
1376 | 		hlist_nulls_add_head_rcu(&l_new->hash_node, head); |
1377 | l_new = NULL; |
1378 | } |
1379 | ret = 0; |
1380 | err: |
1381 | htab_unlock_bucket(htab, b, hash, flags); |
1382 | err_lock_bucket: |
1383 | if (l_new) { |
1384 | 		bpf_map_dec_elem_count(&htab->map); |
1385 | 		bpf_lru_push_free(&htab->lru, &l_new->lru_node); |
1386 | } |
1387 | return ret; |
1388 | } |
1389 | |
1390 | static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, |
1391 | void *value, u64 map_flags) |
1392 | { |
1393 | 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false); |
1394 | } |
1395 | |
1396 | static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, |
1397 | void *value, u64 map_flags) |
1398 | { |
1399 | return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, |
1400 | 						 false); |
1401 | } |
1402 | |
1403 | /* Called from syscall or from eBPF program */ |
1404 | static long htab_map_delete_elem(struct bpf_map *map, void *key) |
1405 | { |
1406 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1407 | struct hlist_nulls_head *head; |
1408 | struct bucket *b; |
1409 | struct htab_elem *l; |
1410 | unsigned long flags; |
1411 | u32 hash, key_size; |
1412 | int ret; |
1413 | |
1414 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1415 | !rcu_read_lock_bh_held()); |
1416 | |
1417 | key_size = map->key_size; |
1418 | |
1419 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1420 | b = __select_bucket(htab, hash); |
1421 | head = &b->head; |
1422 | |
1423 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1424 | if (ret) |
1425 | return ret; |
1426 | |
1427 | l = lookup_elem_raw(head, hash, key, key_size); |
1428 | |
1429 | if (l) { |
1430 | 		hlist_nulls_del_rcu(&l->hash_node); |
1431 | free_htab_elem(htab, l); |
1432 | } else { |
1433 | ret = -ENOENT; |
1434 | } |
1435 | |
1436 | htab_unlock_bucket(htab, b, hash, flags); |
1437 | return ret; |
1438 | } |
1439 | |
1440 | static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) |
1441 | { |
1442 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1443 | struct hlist_nulls_head *head; |
1444 | struct bucket *b; |
1445 | struct htab_elem *l; |
1446 | unsigned long flags; |
1447 | u32 hash, key_size; |
1448 | int ret; |
1449 | |
1450 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1451 | !rcu_read_lock_bh_held()); |
1452 | |
1453 | key_size = map->key_size; |
1454 | |
1455 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1456 | b = __select_bucket(htab, hash); |
1457 | head = &b->head; |
1458 | |
1459 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1460 | if (ret) |
1461 | return ret; |
1462 | |
1463 | l = lookup_elem_raw(head, hash, key, key_size); |
1464 | |
1465 | if (l) |
1466 | 		hlist_nulls_del_rcu(&l->hash_node); |
1467 | else |
1468 | ret = -ENOENT; |
1469 | |
1470 | htab_unlock_bucket(htab, b, hash, flags); |
1471 | if (l) |
1472 | 		htab_lru_push_free(htab, l); |
1473 | return ret; |
1474 | } |
1475 | |
1476 | static void delete_all_elements(struct bpf_htab *htab) |
1477 | { |
1478 | int i; |
1479 | |
1480 | /* It's called from a worker thread, so disable migration here, |
1481 | * since bpf_mem_cache_free() relies on that. |
1482 | */ |
1483 | migrate_disable(); |
1484 | for (i = 0; i < htab->n_buckets; i++) { |
1485 | 		struct hlist_nulls_head *head = select_bucket(htab, i); |
1486 | struct hlist_nulls_node *n; |
1487 | struct htab_elem *l; |
1488 | |
1489 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
1490 | 			hlist_nulls_del_rcu(&l->hash_node); |
1491 | htab_elem_free(htab, l); |
1492 | } |
1493 | } |
1494 | migrate_enable(); |
1495 | } |
1496 | |
1497 | static void htab_free_malloced_timers(struct bpf_htab *htab) |
1498 | { |
1499 | int i; |
1500 | |
1501 | rcu_read_lock(); |
1502 | for (i = 0; i < htab->n_buckets; i++) { |
1503 | 		struct hlist_nulls_head *head = select_bucket(htab, i); |
1504 | struct hlist_nulls_node *n; |
1505 | struct htab_elem *l; |
1506 | |
1507 | hlist_nulls_for_each_entry(l, n, head, hash_node) { |
1508 | /* We only free timer on uref dropping to zero */ |
1509 | 			bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); |
1510 | } |
1511 | cond_resched_rcu(); |
1512 | } |
1513 | rcu_read_unlock(); |
1514 | } |
1515 | |
1516 | static void htab_map_free_timers(struct bpf_map *map) |
1517 | { |
1518 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1519 | |
1520 | /* We only free timer on uref dropping to zero */ |
1521 | 	if (!btf_record_has_field(htab->map.record, BPF_TIMER)) |
1522 | return; |
1523 | if (!htab_is_prealloc(htab)) |
1524 | htab_free_malloced_timers(htab); |
1525 | else |
1526 | htab_free_prealloced_timers(htab); |
1527 | } |
1528 | |
1529 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ |
1530 | static void htab_map_free(struct bpf_map *map) |
1531 | { |
1532 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1533 | int i; |
1534 | |
1535 | /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. |
1536 | * bpf_free_used_maps() is called after bpf prog is no longer executing. |
1537 | * There is no need to synchronize_rcu() here to protect map elements. |
1538 | */ |
1539 | |
1540 | /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it |
1541 | 	 * underneath and is responsible for waiting for callbacks to finish |
1542 | * during bpf_mem_alloc_destroy(). |
1543 | */ |
1544 | if (!htab_is_prealloc(htab)) { |
1545 | delete_all_elements(htab); |
1546 | } else { |
1547 | htab_free_prealloced_fields(htab); |
1548 | prealloc_destroy(htab); |
1549 | } |
1550 | |
1551 | bpf_map_free_elem_count(map); |
1552 | 	free_percpu(htab->extra_elems); |
1553 | 	bpf_map_area_free(htab->buckets); |
1554 | 	bpf_mem_alloc_destroy(&htab->pcpu_ma); |
1555 | 	bpf_mem_alloc_destroy(&htab->ma); |
1556 | if (htab->use_percpu_counter) |
1557 | 		percpu_counter_destroy(&htab->pcount); |
1558 | for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) |
1559 | 		free_percpu(htab->map_locked[i]); |
1560 | 	lockdep_unregister_key(&htab->lockdep_key); |
1561 | 	bpf_map_area_free(htab); |
1562 | } |
1563 | |
1564 | static void htab_map_seq_show_elem(struct bpf_map *map, void *key, |
1565 | struct seq_file *m) |
1566 | { |
1567 | void *value; |
1568 | |
1569 | rcu_read_lock(); |
1570 | |
1571 | value = htab_map_lookup_elem(map, key); |
1572 | if (!value) { |
1573 | rcu_read_unlock(); |
1574 | return; |
1575 | } |
1576 | |
1577 | 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); |
1578 | 	seq_puts(m, ": "); |
1579 | 	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); |
1580 | 	seq_puts(m, "\n"); |
1581 | |
1582 | rcu_read_unlock(); |
1583 | } |
1584 | |
1585 | static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1586 | void *value, bool is_lru_map, |
1587 | bool is_percpu, u64 flags) |
1588 | { |
1589 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1590 | struct hlist_nulls_head *head; |
1591 | unsigned long bflags; |
1592 | struct htab_elem *l; |
1593 | u32 hash, key_size; |
1594 | struct bucket *b; |
1595 | int ret; |
1596 | |
1597 | key_size = map->key_size; |
1598 | |
1599 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1600 | b = __select_bucket(htab, hash); |
1601 | head = &b->head; |
1602 | |
1603 | 	ret = htab_lock_bucket(htab, b, hash, &bflags); |
1604 | if (ret) |
1605 | return ret; |
1606 | |
1607 | l = lookup_elem_raw(head, hash, key, key_size); |
1608 | if (!l) { |
1609 | ret = -ENOENT; |
1610 | } else { |
1611 | if (is_percpu) { |
1612 | u32 roundup_value_size = round_up(map->value_size, 8); |
1613 | void __percpu *pptr; |
1614 | int off = 0, cpu; |
1615 | |
1616 | pptr = htab_elem_get_ptr(l, key_size); |
1617 | for_each_possible_cpu(cpu) { |
1618 | 				copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); |
1619 | 				check_and_init_map_value(&htab->map, value + off); |
1620 | off += roundup_value_size; |
1621 | } |
1622 | } else { |
1623 | u32 roundup_key_size = round_up(map->key_size, 8); |
1624 | |
1625 | if (flags & BPF_F_LOCK) |
1626 | 				copy_map_value_locked(map, value, l->key + |
1627 | 						      roundup_key_size, |
1628 | 						      true); |
1629 | else |
1630 | 				copy_map_value(map, value, l->key + |
1631 | 					       roundup_key_size); |
1632 | /* Zeroing special fields in the temp buffer */ |
1633 | 			check_and_init_map_value(map, value); |
1634 | } |
1635 | |
1636 | 		hlist_nulls_del_rcu(&l->hash_node); |
1637 | if (!is_lru_map) |
1638 | free_htab_elem(htab, l); |
1639 | } |
1640 | |
1641 | htab_unlock_bucket(htab, b, hash, flags: bflags); |
1642 | |
1643 | if (is_lru_map && l) |
1644 | htab_lru_push_free(htab, elem: l); |
1645 | |
1646 | return ret; |
1647 | } |
1648 | |
1649 | static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1650 | void *value, u64 flags) |
1651 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
						 flags);
1654 | } |
1655 | |
1656 | static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
1657 | void *key, void *value, |
1658 | u64 flags) |
1659 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
						 flags);
1662 | } |
1663 | |
1664 | static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1665 | void *value, u64 flags) |
1666 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
						 flags);
1669 | } |
1670 | |
1671 | static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
1672 | void *key, void *value, |
1673 | u64 flags) |
1674 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
						 flags);
1677 | } |
1678 | |
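/* Core of the BPF_MAP_*_BATCH commands for hash maps: walk the table bucket
 * by bucket, copy up to attr->batch.count keys/values to user space and, when
 * do_delete is set, unlink the copied elements. The bucket index doubles as
 * the batch cursor: in_batch selects the starting bucket and out_batch tells
 * user space where the next call should resume; -ENOENT signals that the walk
 * reached the end of the table.
 *
 * Illustrative user-space sketch (not part of this file) of how the cursor is
 * typically driven via libbpf; the local variable names are made up:
 *
 *	__u32 in = 0, out = 0, count = 128;
 *	LIBBPF_OPTS(bpf_map_batch_opts, opts);
 *	int err = bpf_map_lookup_and_delete_batch(map_fd, &in, &out,
 *						  keys, values, &count, &opts);
 *
 * On success 'count' holds the number of copied entries and 'out' the cursor
 * to pass back as 'in' on the next call.
 */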
1679 | static int |
1680 | __htab_map_lookup_and_delete_batch(struct bpf_map *map, |
1681 | const union bpf_attr *attr, |
1682 | union bpf_attr __user *uattr, |
1683 | bool do_delete, bool is_lru_map, |
1684 | bool is_percpu) |
1685 | { |
1686 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1687 | u32 bucket_cnt, total, key_size, value_size, roundup_key_size; |
1688 | void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; |
1689 | void __user *uvalues = u64_to_user_ptr(attr->batch.values); |
1690 | void __user *ukeys = u64_to_user_ptr(attr->batch.keys); |
1691 | void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); |
1692 | u32 batch, max_count, size, bucket_size, map_id; |
1693 | struct htab_elem *node_to_free = NULL; |
1694 | u64 elem_map_flags, map_flags; |
1695 | struct hlist_nulls_head *head; |
1696 | struct hlist_nulls_node *n; |
1697 | unsigned long flags = 0; |
1698 | bool locked = false; |
1699 | struct htab_elem *l; |
1700 | struct bucket *b; |
1701 | int ret = 0; |
1702 | |
1703 | elem_map_flags = attr->batch.elem_flags; |
1704 | if ((elem_map_flags & ~BPF_F_LOCK) || |
	    ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1706 | return -EINVAL; |
1707 | |
1708 | map_flags = attr->batch.flags; |
1709 | if (map_flags) |
1710 | return -EINVAL; |
1711 | |
1712 | max_count = attr->batch.count; |
1713 | if (!max_count) |
1714 | return 0; |
1715 | |
1716 | if (put_user(0, &uattr->batch.count)) |
1717 | return -EFAULT; |
1718 | |
1719 | batch = 0; |
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1721 | return -EFAULT; |
1722 | |
1723 | if (batch >= htab->n_buckets) |
1724 | return -ENOENT; |
1725 | |
1726 | key_size = htab->map.key_size; |
1727 | roundup_key_size = round_up(htab->map.key_size, 8); |
1728 | value_size = htab->map.value_size; |
1729 | size = round_up(value_size, 8); |
1730 | if (is_percpu) |
1731 | value_size = size * num_possible_cpus(); |
1732 | total = 0; |
1733 | /* while experimenting with hash tables with sizes ranging from 10 to |
1734 | * 1000, it was observed that a bucket can have up to 5 entries. |
1735 | */ |
1736 | bucket_size = 5; |
1737 | |
1738 | alloc: |
1739 | /* We cannot do copy_from_user or copy_to_user inside |
1740 | * the rcu_read_lock. Allocate enough space here. |
1741 | */ |
	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1744 | if (!keys || !values) { |
1745 | ret = -ENOMEM; |
1746 | goto after_loop; |
1747 | } |
1748 | |
1749 | again: |
1750 | bpf_disable_instrumentation(); |
1751 | rcu_read_lock(); |
1752 | again_nocopy: |
1753 | dst_key = keys; |
1754 | dst_val = values; |
1755 | b = &htab->buckets[batch]; |
1756 | head = &b->head; |
	/* do not grab the lock unless we need it (bucket_cnt > 0). */
1758 | if (locked) { |
		ret = htab_lock_bucket(htab, b, batch, &flags);
1760 | if (ret) { |
1761 | rcu_read_unlock(); |
1762 | bpf_enable_instrumentation(); |
1763 | goto after_loop; |
1764 | } |
1765 | } |
1766 | |
1767 | bucket_cnt = 0; |
1768 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
1769 | bucket_cnt++; |
1770 | |
1771 | if (bucket_cnt && !locked) { |
1772 | locked = true; |
1773 | goto again_nocopy; |
1774 | } |
1775 | |
1776 | if (bucket_cnt > (max_count - total)) { |
1777 | if (total == 0) |
1778 | ret = -ENOSPC; |
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
1783 | rcu_read_unlock(); |
1784 | bpf_enable_instrumentation(); |
1785 | goto after_loop; |
1786 | } |
1787 | |
1788 | if (bucket_cnt > bucket_size) { |
1789 | bucket_size = bucket_cnt; |
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
1798 | goto alloc; |
1799 | } |
1800 | |
1801 | /* Next block is only safe to run if you have grabbed the lock */ |
1802 | if (!locked) |
1803 | goto next_batch; |
1804 | |
1805 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
1806 | memcpy(dst_key, l->key, key_size); |
1807 | |
1808 | if (is_percpu) { |
1809 | int off = 0, cpu; |
1810 | void __percpu *pptr; |
1811 | |
			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, dst_val + off);
1816 | off += size; |
1817 | } |
1818 | } else { |
1819 | value = l->key + roundup_key_size; |
1820 | if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { |
1821 | struct bpf_map **inner_map = value; |
1822 | |
1823 | /* Actual value is the id of the inner map */ |
1824 | map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); |
1825 | value = &map_id; |
1826 | } |
1827 | |
1828 | if (elem_map_flags & BPF_F_LOCK) |
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
1835 | } |
1836 | if (do_delete) { |
			hlist_nulls_del_rcu(&l->hash_node);
1838 | |
1839 | /* bpf_lru_push_free() will acquire lru_lock, which |
1840 | * may cause deadlock. See comments in function |
1841 | * prealloc_lru_pop(). Let us do bpf_lru_push_free() |
1842 | * after releasing the bucket lock. |
1843 | */ |
1844 | if (is_lru_map) { |
1845 | l->batch_flink = node_to_free; |
1846 | node_to_free = l; |
1847 | } else { |
1848 | free_htab_elem(htab, l); |
1849 | } |
1850 | } |
1851 | dst_key += key_size; |
1852 | dst_val += value_size; |
1853 | } |
1854 | |
	htab_unlock_bucket(htab, b, batch, flags);
1856 | locked = false; |
1857 | |
1858 | while (node_to_free) { |
1859 | l = node_to_free; |
1860 | node_to_free = node_to_free->batch_flink; |
		htab_lru_push_free(htab, l);
1862 | } |
1863 | |
1864 | next_batch: |
	/* If we are not copying data, we can go to the next bucket and avoid
	 * dropping the RCU read lock.
1867 | */ |
1868 | if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { |
1869 | batch++; |
1870 | goto again_nocopy; |
1871 | } |
1872 | |
1873 | rcu_read_unlock(); |
1874 | bpf_enable_instrumentation(); |
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
					key_size * bucket_cnt) ||
			   copy_to_user(uvalues + total * value_size, values,
					value_size * bucket_cnt))) {
1879 | ret = -EFAULT; |
1880 | goto after_loop; |
1881 | } |
1882 | |
1883 | total += bucket_cnt; |
1884 | batch++; |
1885 | if (batch >= htab->n_buckets) { |
1886 | ret = -ENOENT; |
1887 | goto after_loop; |
1888 | } |
1889 | goto again; |
1890 | |
1891 | after_loop: |
1892 | if (ret == -EFAULT) |
1893 | goto out; |
1894 | |
1895 | /* copy # of entries and next batch */ |
1896 | ubatch = u64_to_user_ptr(attr->batch.out_batch); |
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1898 | put_user(total, &uattr->batch.count)) |
1899 | ret = -EFAULT; |
1900 | |
1901 | out: |
	kvfree(keys);
	kvfree(values);
1904 | return ret; |
1905 | } |
1906 | |
1907 | static int |
1908 | htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1909 | union bpf_attr __user *uattr) |
1910 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
1913 | } |
1914 | |
1915 | static int |
1916 | htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
1917 | const union bpf_attr *attr, |
1918 | union bpf_attr __user *uattr) |
1919 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
1922 | } |
1923 | |
1924 | static int |
1925 | htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1926 | union bpf_attr __user *uattr) |
1927 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
1930 | } |
1931 | |
1932 | static int |
1933 | htab_map_lookup_and_delete_batch(struct bpf_map *map, |
1934 | const union bpf_attr *attr, |
1935 | union bpf_attr __user *uattr) |
1936 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
1939 | } |
1940 | |
1941 | static int |
1942 | htab_lru_percpu_map_lookup_batch(struct bpf_map *map, |
1943 | const union bpf_attr *attr, |
1944 | union bpf_attr __user *uattr) |
1945 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
1948 | } |
1949 | |
1950 | static int |
1951 | htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
1952 | const union bpf_attr *attr, |
1953 | union bpf_attr __user *uattr) |
1954 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
1957 | } |
1958 | |
1959 | static int |
1960 | htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1961 | union bpf_attr __user *uattr) |
1962 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
1965 | } |
1966 | |
1967 | static int |
1968 | htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, |
1969 | const union bpf_attr *attr, |
1970 | union bpf_attr __user *uattr) |
1971 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
1974 | } |
1975 | |
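/* State for the bpf_iter-based walk over a hash map: bucket_id and skip_elems
 * form the cursor, and percpu_value_buf (allocated in bpf_iter_init_hash_map
 * for percpu maps) is used to flatten per-CPU values into a single buffer
 * before they are handed to the iterator program.
 */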
1976 | struct bpf_iter_seq_hash_map_info { |
1977 | struct bpf_map *map; |
1978 | struct bpf_htab *htab; |
1979 | void *percpu_value_buf; // non-zero means percpu hash |
1980 | u32 bucket_id; |
1981 | u32 skip_elems; |
1982 | }; |
1983 | |
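/* Find the next element to show. The RCU read lock is taken when a bucket is
 * entered and released when the walk leaves it (or in the seq_ops stop
 * callback), so it is held while each returned element is shown.
 */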
1984 | static struct htab_elem * |
1985 | bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, |
1986 | struct htab_elem *prev_elem) |
1987 | { |
1988 | const struct bpf_htab *htab = info->htab; |
1989 | u32 skip_elems = info->skip_elems; |
1990 | u32 bucket_id = info->bucket_id; |
1991 | struct hlist_nulls_head *head; |
1992 | struct hlist_nulls_node *n; |
1993 | struct htab_elem *elem; |
1994 | struct bucket *b; |
1995 | u32 i, count; |
1996 | |
1997 | if (bucket_id >= htab->n_buckets) |
1998 | return NULL; |
1999 | |
2000 | /* try to find next elem in the same bucket */ |
2001 | if (prev_elem) { |
		/* no update/deletion on this bucket, prev_elem should still be valid
2003 | * and we won't skip elements. |
2004 | */ |
2005 | n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); |
2006 | elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); |
2007 | if (elem) |
2008 | return elem; |
2009 | |
2010 | /* not found, unlock and go to the next bucket */ |
2011 | b = &htab->buckets[bucket_id++]; |
2012 | rcu_read_unlock(); |
2013 | skip_elems = 0; |
2014 | } |
2015 | |
2016 | for (i = bucket_id; i < htab->n_buckets; i++) { |
2017 | b = &htab->buckets[i]; |
2018 | rcu_read_lock(); |
2019 | |
2020 | count = 0; |
2021 | head = &b->head; |
2022 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
2023 | if (count >= skip_elems) { |
2024 | info->bucket_id = i; |
2025 | info->skip_elems = count; |
2026 | return elem; |
2027 | } |
2028 | count++; |
2029 | } |
2030 | |
2031 | rcu_read_unlock(); |
2032 | skip_elems = 0; |
2033 | } |
2034 | |
2035 | info->bucket_id = i; |
2036 | info->skip_elems = 0; |
2037 | return NULL; |
2038 | } |
2039 | |
2040 | static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) |
2041 | { |
2042 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2043 | struct htab_elem *elem; |
2044 | |
2045 | elem = bpf_hash_map_seq_find_next(info, NULL); |
2046 | if (!elem) |
2047 | return NULL; |
2048 | |
2049 | if (*pos == 0) |
2050 | ++*pos; |
2051 | return elem; |
2052 | } |
2053 | |
2054 | static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
2055 | { |
2056 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2057 | |
2058 | ++*pos; |
2059 | ++info->skip_elems; |
	return bpf_hash_map_seq_find_next(info, v);
2061 | } |
2062 | |
2063 | static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) |
2064 | { |
2065 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2066 | u32 roundup_key_size, roundup_value_size; |
2067 | struct bpf_iter__bpf_map_elem ctx = {}; |
2068 | struct bpf_map *map = info->map; |
2069 | struct bpf_iter_meta meta; |
2070 | int ret = 0, off = 0, cpu; |
2071 | struct bpf_prog *prog; |
2072 | void __percpu *pptr; |
2073 | |
2074 | meta.seq = seq; |
	prog = bpf_iter_get_info(&meta, elem == NULL);
2076 | if (prog) { |
2077 | ctx.meta = &meta; |
2078 | ctx.map = info->map; |
2079 | if (elem) { |
2080 | roundup_key_size = round_up(map->key_size, 8); |
2081 | ctx.key = elem->key; |
2082 | if (!info->percpu_value_buf) { |
2083 | ctx.value = elem->key + roundup_key_size; |
2084 | } else { |
2085 | roundup_value_size = round_up(map->value_size, 8); |
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					copy_map_value_long(map, info->percpu_value_buf + off,
							    per_cpu_ptr(pptr, cpu));
					check_and_init_map_value(map, info->percpu_value_buf + off);
2091 | off += roundup_value_size; |
2092 | } |
2093 | ctx.value = info->percpu_value_buf; |
2094 | } |
2095 | } |
		ret = bpf_iter_run_prog(prog, &ctx);
2097 | } |
2098 | |
2099 | return ret; |
2100 | } |
2101 | |
2102 | static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) |
2103 | { |
	return __bpf_hash_map_seq_show(seq, v);
2105 | } |
2106 | |
2107 | static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) |
2108 | { |
2109 | if (!v) |
2110 | (void)__bpf_hash_map_seq_show(seq, NULL); |
2111 | else |
2112 | rcu_read_unlock(); |
2113 | } |
2114 | |
2115 | static int bpf_iter_init_hash_map(void *priv_data, |
2116 | struct bpf_iter_aux_info *aux) |
2117 | { |
2118 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
2119 | struct bpf_map *map = aux->map; |
2120 | void *value_buf; |
2121 | u32 buf_size; |
2122 | |
2123 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
2124 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { |
2125 | buf_size = round_up(map->value_size, 8) * num_possible_cpus(); |
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2127 | if (!value_buf) |
2128 | return -ENOMEM; |
2129 | |
2130 | seq_info->percpu_value_buf = value_buf; |
2131 | } |
2132 | |
2133 | bpf_map_inc_with_uref(map); |
2134 | seq_info->map = map; |
2135 | seq_info->htab = container_of(map, struct bpf_htab, map); |
2136 | return 0; |
2137 | } |
2138 | |
2139 | static void bpf_iter_fini_hash_map(void *priv_data) |
2140 | { |
2141 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
2142 | |
	bpf_map_put_with_uref(seq_info->map);
	kfree(seq_info->percpu_value_buf);
2145 | } |
2146 | |
2147 | static const struct seq_operations bpf_hash_map_seq_ops = { |
2148 | .start = bpf_hash_map_seq_start, |
2149 | .next = bpf_hash_map_seq_next, |
2150 | .stop = bpf_hash_map_seq_stop, |
2151 | .show = bpf_hash_map_seq_show, |
2152 | }; |
2153 | |
2154 | static const struct bpf_iter_seq_info iter_seq_info = { |
2155 | .seq_ops = &bpf_hash_map_seq_ops, |
2156 | .init_seq_private = bpf_iter_init_hash_map, |
2157 | .fini_seq_private = bpf_iter_fini_hash_map, |
2158 | .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), |
2159 | }; |
2160 | |
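/* Implementation of bpf_for_each_map_elem() for hash maps: walk every bucket
 * under RCU and invoke callback_fn(map, key, value, callback_ctx, 0) for each
 * element; a non-zero return from the callback stops the walk early. The
 * number of elements visited is returned.
 */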
2161 | static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, |
2162 | void *callback_ctx, u64 flags) |
2163 | { |
2164 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2165 | struct hlist_nulls_head *head; |
2166 | struct hlist_nulls_node *n; |
2167 | struct htab_elem *elem; |
2168 | u32 roundup_key_size; |
2169 | int i, num_elems = 0; |
2170 | void __percpu *pptr; |
2171 | struct bucket *b; |
2172 | void *key, *val; |
2173 | bool is_percpu; |
2174 | u64 ret = 0; |
2175 | |
2176 | if (flags != 0) |
2177 | return -EINVAL; |
2178 | |
2179 | is_percpu = htab_is_percpu(htab); |
2180 | |
2181 | roundup_key_size = round_up(map->key_size, 8); |
	/* disable migration so the percpu value prepared here will be the
	 * same as the one seen by the bpf program with bpf_map_lookup_elem().
2184 | */ |
2185 | if (is_percpu) |
2186 | migrate_disable(); |
2187 | for (i = 0; i < htab->n_buckets; i++) { |
2188 | b = &htab->buckets[i]; |
2189 | rcu_read_lock(); |
2190 | head = &b->head; |
2191 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
2192 | key = elem->key; |
2193 | if (is_percpu) { |
2194 | /* current cpu value for percpu map */ |
				pptr = htab_elem_get_ptr(elem, map->key_size);
2196 | val = this_cpu_ptr(pptr); |
2197 | } else { |
2198 | val = elem->key + roundup_key_size; |
2199 | } |
2200 | num_elems++; |
2201 | ret = callback_fn((u64)(long)map, (u64)(long)key, |
2202 | (u64)(long)val, (u64)(long)callback_ctx, 0); |
2203 | /* return value: 0 - continue, 1 - stop and return */ |
2204 | if (ret) { |
2205 | rcu_read_unlock(); |
2206 | goto out; |
2207 | } |
2208 | } |
2209 | rcu_read_unlock(); |
2210 | } |
2211 | out: |
2212 | if (is_percpu) |
2213 | migrate_enable(); |
2214 | return num_elems; |
2215 | } |
2216 | |
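/* Rough accounting of the memory consumed by the map: the bucket array, the
 * per-CPU lock counters, and either the preallocated element area or, for
 * non-preallocated maps, the elements currently in the table including the
 * llist_node overhead added by bpf_mem_alloc.
 */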
2217 | static u64 htab_map_mem_usage(const struct bpf_map *map) |
2218 | { |
2219 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2220 | u32 value_size = round_up(htab->map.value_size, 8); |
2221 | bool prealloc = htab_is_prealloc(htab); |
2222 | bool percpu = htab_is_percpu(htab); |
2223 | bool lru = htab_is_lru(htab); |
2224 | u64 num_entries; |
2225 | u64 usage = sizeof(struct bpf_htab); |
2226 | |
2227 | usage += sizeof(struct bucket) * htab->n_buckets; |
2228 | usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; |
2229 | if (prealloc) { |
2230 | num_entries = map->max_entries; |
2231 | if (htab_has_extra_elems(htab)) |
2232 | num_entries += num_possible_cpus(); |
2233 | |
2234 | usage += htab->elem_size * num_entries; |
2235 | |
2236 | if (percpu) |
2237 | usage += value_size * num_possible_cpus() * num_entries; |
2238 | else if (!lru) |
2239 | usage += sizeof(struct htab_elem *) * num_possible_cpus(); |
2240 | } else { |
2241 | #define LLIST_NODE_SZ sizeof(struct llist_node) |
2242 | |
2243 | num_entries = htab->use_percpu_counter ? |
			percpu_counter_sum(&htab->pcount) :
			atomic_read(&htab->count);
2246 | usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; |
2247 | if (percpu) { |
2248 | usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; |
2249 | usage += value_size * num_possible_cpus() * num_entries; |
2250 | } |
2251 | } |
2252 | return usage; |
2253 | } |
2254 | |
2255 | BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) |
2256 | const struct bpf_map_ops htab_map_ops = { |
2257 | .map_meta_equal = bpf_map_meta_equal, |
2258 | .map_alloc_check = htab_map_alloc_check, |
2259 | .map_alloc = htab_map_alloc, |
2260 | .map_free = htab_map_free, |
2261 | .map_get_next_key = htab_map_get_next_key, |
2262 | .map_release_uref = htab_map_free_timers, |
2263 | .map_lookup_elem = htab_map_lookup_elem, |
2264 | .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, |
2265 | .map_update_elem = htab_map_update_elem, |
2266 | .map_delete_elem = htab_map_delete_elem, |
2267 | .map_gen_lookup = htab_map_gen_lookup, |
2268 | .map_seq_show_elem = htab_map_seq_show_elem, |
2269 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2270 | .map_for_each_callback = bpf_for_each_hash_elem, |
2271 | .map_mem_usage = htab_map_mem_usage, |
2272 | BATCH_OPS(htab), |
2273 | .map_btf_id = &htab_map_btf_ids[0], |
2274 | .iter_seq_info = &iter_seq_info, |
2275 | }; |
2276 | |
2277 | const struct bpf_map_ops htab_lru_map_ops = { |
2278 | .map_meta_equal = bpf_map_meta_equal, |
2279 | .map_alloc_check = htab_map_alloc_check, |
2280 | .map_alloc = htab_map_alloc, |
2281 | .map_free = htab_map_free, |
2282 | .map_get_next_key = htab_map_get_next_key, |
2283 | .map_release_uref = htab_map_free_timers, |
2284 | .map_lookup_elem = htab_lru_map_lookup_elem, |
2285 | .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, |
2286 | .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, |
2287 | .map_update_elem = htab_lru_map_update_elem, |
2288 | .map_delete_elem = htab_lru_map_delete_elem, |
2289 | .map_gen_lookup = htab_lru_map_gen_lookup, |
2290 | .map_seq_show_elem = htab_map_seq_show_elem, |
2291 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2292 | .map_for_each_callback = bpf_for_each_hash_elem, |
2293 | .map_mem_usage = htab_map_mem_usage, |
2294 | BATCH_OPS(htab_lru), |
2295 | .map_btf_id = &htab_map_btf_ids[0], |
2296 | .iter_seq_info = &iter_seq_info, |
2297 | }; |
2298 | |
2299 | /* Called from eBPF program */ |
2300 | static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2301 | { |
2302 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
2303 | |
2304 | if (l) |
2305 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
2306 | else |
2307 | return NULL; |
2308 | } |
2309 | |
2310 | static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2311 | { |
2312 | struct htab_elem *l; |
2313 | |
2314 | if (cpu >= nr_cpu_ids) |
2315 | return NULL; |
2316 | |
2317 | l = __htab_map_lookup_elem(map, key); |
2318 | if (l) |
2319 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
2320 | else |
2321 | return NULL; |
2322 | } |
2323 | |
2324 | static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2325 | { |
2326 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
2327 | |
2328 | if (l) { |
		bpf_lru_node_set_ref(&l->lru_node);
2330 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
2331 | } |
2332 | |
2333 | return NULL; |
2334 | } |
2335 | |
2336 | static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2337 | { |
2338 | struct htab_elem *l; |
2339 | |
2340 | if (cpu >= nr_cpu_ids) |
2341 | return NULL; |
2342 | |
2343 | l = __htab_map_lookup_elem(map, key); |
2344 | if (l) { |
		bpf_lru_node_set_ref(&l->lru_node);
2346 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
2347 | } |
2348 | |
2349 | return NULL; |
2350 | } |
2351 | |
2352 | int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) |
2353 | { |
2354 | struct htab_elem *l; |
2355 | void __percpu *pptr; |
2356 | int ret = -ENOENT; |
2357 | int cpu, off = 0; |
2358 | u32 size; |
2359 | |
2360 | /* per_cpu areas are zero-filled and bpf programs can only |
2361 | * access 'value_size' of them, so copying rounded areas |
2362 | * will not leak any kernel data |
2363 | */ |
2364 | size = round_up(map->value_size, 8); |
2365 | rcu_read_lock(); |
2366 | l = __htab_map_lookup_elem(map, key); |
2367 | if (!l) |
2368 | goto out; |
2369 | /* We do not mark LRU map element here in order to not mess up |
2370 | * eviction heuristics when user space does a map walk. |
2371 | */ |
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
2376 | off += size; |
2377 | } |
2378 | ret = 0; |
2379 | out: |
2380 | rcu_read_unlock(); |
2381 | return ret; |
2382 | } |
2383 | |
2384 | int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, |
2385 | u64 map_flags) |
2386 | { |
2387 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2388 | int ret; |
2389 | |
2390 | rcu_read_lock(); |
2391 | if (htab_is_lru(htab)) |
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
2397 | rcu_read_unlock(); |
2398 | |
2399 | return ret; |
2400 | } |
2401 | |
2402 | static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, |
2403 | struct seq_file *m) |
2404 | { |
2405 | struct htab_elem *l; |
2406 | void __percpu *pptr; |
2407 | int cpu; |
2408 | |
2409 | rcu_read_lock(); |
2410 | |
2411 | l = __htab_map_lookup_elem(map, key); |
2412 | if (!l) { |
2413 | rcu_read_unlock(); |
2414 | return; |
2415 | } |
2416 | |
	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");
2427 | |
2428 | rcu_read_unlock(); |
2429 | } |
2430 | |
2431 | const struct bpf_map_ops htab_percpu_map_ops = { |
2432 | .map_meta_equal = bpf_map_meta_equal, |
2433 | .map_alloc_check = htab_map_alloc_check, |
2434 | .map_alloc = htab_map_alloc, |
2435 | .map_free = htab_map_free, |
2436 | .map_get_next_key = htab_map_get_next_key, |
2437 | .map_lookup_elem = htab_percpu_map_lookup_elem, |
2438 | .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, |
2439 | .map_update_elem = htab_percpu_map_update_elem, |
2440 | .map_delete_elem = htab_map_delete_elem, |
2441 | .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, |
2442 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
2443 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2444 | .map_for_each_callback = bpf_for_each_hash_elem, |
2445 | .map_mem_usage = htab_map_mem_usage, |
2446 | BATCH_OPS(htab_percpu), |
2447 | .map_btf_id = &htab_map_btf_ids[0], |
2448 | .iter_seq_info = &iter_seq_info, |
2449 | }; |
2450 | |
2451 | const struct bpf_map_ops htab_lru_percpu_map_ops = { |
2452 | .map_meta_equal = bpf_map_meta_equal, |
2453 | .map_alloc_check = htab_map_alloc_check, |
2454 | .map_alloc = htab_map_alloc, |
2455 | .map_free = htab_map_free, |
2456 | .map_get_next_key = htab_map_get_next_key, |
2457 | .map_lookup_elem = htab_lru_percpu_map_lookup_elem, |
2458 | .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, |
2459 | .map_update_elem = htab_lru_percpu_map_update_elem, |
2460 | .map_delete_elem = htab_lru_map_delete_elem, |
2461 | .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, |
2462 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
2463 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2464 | .map_for_each_callback = bpf_for_each_hash_elem, |
2465 | .map_mem_usage = htab_map_mem_usage, |
2466 | BATCH_OPS(htab_lru_percpu), |
2467 | .map_btf_id = &htab_map_btf_ids[0], |
2468 | .iter_seq_info = &iter_seq_info, |
2469 | }; |
2470 | |
2471 | static int fd_htab_map_alloc_check(union bpf_attr *attr) |
2472 | { |
2473 | if (attr->value_size != sizeof(u32)) |
2474 | return -EINVAL; |
2475 | return htab_map_alloc_check(attr); |
2476 | } |
2477 | |
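/* fd-based htab maps (e.g. hash-of-maps) store pointers obtained via
 * map_fd_get_ptr(); drop those references before tearing the table down.
 */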
2478 | static void fd_htab_map_free(struct bpf_map *map) |
2479 | { |
2480 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2481 | struct hlist_nulls_node *n; |
2482 | struct hlist_nulls_head *head; |
2483 | struct htab_elem *l; |
2484 | int i; |
2485 | |
2486 | for (i = 0; i < htab->n_buckets; i++) { |
		head = select_bucket(htab, i);
2488 | |
2489 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
2490 | void *ptr = fd_htab_map_get_ptr(map, l); |
2491 | |
2492 | map->ops->map_fd_put_ptr(map, ptr, false); |
2493 | } |
2494 | } |
2495 | |
2496 | htab_map_free(map); |
2497 | } |
2498 | |
2499 | /* only called from syscall */ |
2500 | int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) |
2501 | { |
2502 | void **ptr; |
2503 | int ret = 0; |
2504 | |
2505 | if (!map->ops->map_fd_sys_lookup_elem) |
2506 | return -ENOTSUPP; |
2507 | |
2508 | rcu_read_lock(); |
2509 | ptr = htab_map_lookup_elem(map, key); |
2510 | if (ptr) |
2511 | *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); |
2512 | else |
2513 | ret = -ENOENT; |
2514 | rcu_read_unlock(); |
2515 | |
2516 | return ret; |
2517 | } |
2518 | |
2519 | /* only called from syscall */ |
2520 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, |
2521 | void *key, void *value, u64 map_flags) |
2522 | { |
2523 | void *ptr; |
2524 | int ret; |
2525 | u32 ufd = *(u32 *)value; |
2526 | |
2527 | ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); |
2528 | if (IS_ERR(ptr)) |
2529 | return PTR_ERR(ptr); |
2530 | |
2531 | /* The htab bucket lock is always held during update operations in fd |
2532 | * htab map, and the following rcu_read_lock() is only used to avoid |
2533 | * the WARN_ON_ONCE in htab_map_update_elem(). |
2534 | */ |
2535 | rcu_read_lock(); |
	ret = htab_map_update_elem(map, key, &ptr, map_flags);
2537 | rcu_read_unlock(); |
2538 | if (ret) |
2539 | map->ops->map_fd_put_ptr(map, ptr, false); |
2540 | |
2541 | return ret; |
2542 | } |
2543 | |
2544 | static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) |
2545 | { |
2546 | struct bpf_map *map, *inner_map_meta; |
2547 | |
	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
2555 | return map; |
2556 | } |
2557 | |
2558 | map->inner_map_meta = inner_map_meta; |
2559 | |
2560 | return map; |
2561 | } |
2562 | |
2563 | static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) |
2564 | { |
2565 | struct bpf_map **inner_map = htab_map_lookup_elem(map, key); |
2566 | |
2567 | if (!inner_map) |
2568 | return NULL; |
2569 | |
2570 | return READ_ONCE(*inner_map); |
2571 | } |
2572 | |
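/* Inline the hash-of-maps lookup for BPF programs: emit a call to
 * __htab_map_lookup_elem() and, on a hit, load the inner map pointer stored
 * in the element's value area.
 */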
2573 | static int htab_of_map_gen_lookup(struct bpf_map *map, |
2574 | struct bpf_insn *insn_buf) |
2575 | { |
2576 | struct bpf_insn *insn = insn_buf; |
2577 | const int ret = BPF_REG_0; |
2578 | |
2579 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
2580 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
2581 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
2582 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); |
2583 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
2584 | offsetof(struct htab_elem, key) + |
2585 | round_up(map->key_size, 8)); |
2586 | *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); |
2587 | |
2588 | return insn - insn_buf; |
2589 | } |
2590 | |
2591 | static void htab_of_map_free(struct bpf_map *map) |
2592 | { |
	bpf_map_meta_free(map->inner_map_meta);
2594 | fd_htab_map_free(map); |
2595 | } |
2596 | |
2597 | const struct bpf_map_ops htab_of_maps_map_ops = { |
2598 | .map_alloc_check = fd_htab_map_alloc_check, |
2599 | .map_alloc = htab_of_map_alloc, |
2600 | .map_free = htab_of_map_free, |
2601 | .map_get_next_key = htab_map_get_next_key, |
2602 | .map_lookup_elem = htab_of_map_lookup_elem, |
2603 | .map_delete_elem = htab_map_delete_elem, |
2604 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
2605 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
2606 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, |
2607 | .map_gen_lookup = htab_of_map_gen_lookup, |
2608 | .map_check_btf = map_check_no_btf, |
2609 | .map_mem_usage = htab_map_mem_usage, |
2610 | BATCH_OPS(htab), |
2611 | .map_btf_id = &htab_map_btf_ids[0], |
2612 | }; |
2613 | |