1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
3 | * Copyright (c) 2016 Facebook |
4 | */ |
5 | #include <linux/bpf.h> |
6 | #include <linux/btf.h> |
7 | #include <linux/jhash.h> |
8 | #include <linux/filter.h> |
9 | #include <linux/rculist_nulls.h> |
10 | #include <linux/rcupdate_wait.h> |
11 | #include <linux/random.h> |
12 | #include <uapi/linux/btf.h> |
13 | #include <linux/rcupdate_trace.h> |
14 | #include <linux/btf_ids.h> |
15 | #include "percpu_freelist.h" |
16 | #include "bpf_lru_list.h" |
17 | #include "map_in_map.h" |
18 | #include <linux/bpf_mem_alloc.h> |
19 | |
20 | #define HTAB_CREATE_FLAG_MASK \ |
21 | (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ |
22 | BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) |
23 | |
24 | #define BATCH_OPS(_name) \ |
25 | .map_lookup_batch = \ |
26 | _name##_map_lookup_batch, \ |
27 | .map_lookup_and_delete_batch = \ |
28 | _name##_map_lookup_and_delete_batch, \ |
29 | .map_update_batch = \ |
30 | generic_map_update_batch, \ |
31 | .map_delete_batch = \ |
32 | generic_map_delete_batch |
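
/*
 * For illustration, instantiating the macro above as e.g. BATCH_OPS(htab)
 * expands to the following ops initializers (a sketch of the expansion,
 * shown here only as a comment):
 *
 *	.map_lookup_batch = htab_map_lookup_batch,
 *	.map_lookup_and_delete_batch = htab_map_lookup_and_delete_batch,
 *	.map_update_batch = generic_map_update_batch,
 *	.map_delete_batch = generic_map_delete_batch
 */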
33 | |
34 | /* |
35 | * The bucket lock has two protection scopes: |
36 | * |
37 | * 1) Serializing concurrent operations from BPF programs on different |
38 | * CPUs |
39 | * |
40 | * 2) Serializing concurrent operations from BPF programs and sys_bpf() |
41 | * |
42 |  * BPF programs can execute in any context including perf, kprobes and |
43 |  * tracing. As there are almost no limits on where perf, kprobes and |
44 |  * tracing can be invoked from, the lock operations need to be protected |
45 |  * against deadlocks. Deadlocks can be caused by recursion and by an |
46 |  * invocation in the lock held section when functions which acquire this |
47 |  * lock are invoked from sys_bpf(). BPF recursion is prevented by |
48 |  * incrementing the per CPU variable bpf_prog_active, which prevents BPF |
49 |  * programs attached to perf events, kprobes and tracing from being |
50 |  * invoked before the prior invocation from one of these contexts has |
51 |  * completed. sys_bpf() uses the same mechanism by pinning the task to the |
52 |  * current CPU and incrementing the recursion protection across the map operation. |
53 | * |
54 | * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain |
55 | * operations like memory allocations (even with GFP_ATOMIC) from atomic |
56 | * contexts. This is required because even with GFP_ATOMIC the memory |
57 | * allocator calls into code paths which acquire locks with long held lock |
58 |  * sections. To ensure deterministic behaviour, these locks are regular |
59 | * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only |
60 | * true atomic contexts on an RT kernel are the low level hardware |
61 | * handling, scheduling, low level interrupt handling, NMIs etc. None of |
62 | * these contexts should ever do memory allocations. |
63 | * |
64 | * As regular device interrupt handlers and soft interrupts are forced into |
65 | * thread context, the existing code which does |
66 | * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*(); |
67 | * just works. |
68 | * |
69 | * In theory the BPF locks could be converted to regular spinlocks as well, |
70 | * but the bucket locks and percpu_freelist locks can be taken from |
71 | * arbitrary contexts (perf, kprobes, tracepoints) which are required to be |
72 |  * atomic contexts even on RT. Before the introduction of bpf_mem_alloc, |
73 |  * it was only safe to use a raw spinlock for a preallocated hash map on an |
74 |  * RT kernel, because there is no memory allocation within the lock held |
75 |  * sections. However, after the hash map was fully converted to use |
76 |  * bpf_mem_alloc, memory is no longer allocated synchronously under the lock |
77 |  * for non-preallocated hash maps either, so it is safe to always use a raw spinlock for the bucket lock. |
78 | */ |
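
/*
 * As a minimal sketch (hypothetical call chain) of the recursion protection
 * described above: if a BPF program runs on a CPU that is already inside a
 * bucket-locked section of the same map, htab_lock_bucket() below sees the
 * per-CPU map_locked counter already elevated and bails out:
 *
 *	htab_map_update_elem()		// from a BPF program or sys_bpf()
 *	  htab_lock_bucket()		// map_locked[hash masked as below] -> 1
 *	    ... NMI/tracing program on the same CPU touches the same map ...
 *	      htab_lock_bucket()	// counter would become 2 -> -EBUSY
 *
 * The counter indexing and the error value are taken from
 * htab_lock_bucket()/htab_unlock_bucket() below.
 */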
79 | struct bucket { |
80 | struct hlist_nulls_head head; |
81 | raw_spinlock_t raw_lock; |
82 | }; |
83 | |
84 | #define HASHTAB_MAP_LOCK_COUNT 8 |
85 | #define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) |
86 | |
87 | struct bpf_htab { |
88 | struct bpf_map map; |
89 | struct bpf_mem_alloc ma; |
90 | struct bpf_mem_alloc pcpu_ma; |
91 | struct bucket *buckets; |
92 | void *elems; |
93 | union { |
94 | struct pcpu_freelist freelist; |
95 | struct bpf_lru lru; |
96 | }; |
97 | 	struct htab_elem *__percpu *extra_elems; |
98 | 	/* number of elements in a non-preallocated hashtable is kept |
99 | * in either pcount or count |
100 | */ |
101 | struct percpu_counter pcount; |
102 | atomic_t count; |
103 | bool use_percpu_counter; |
104 | u32 n_buckets; /* number of hash buckets */ |
105 | u32 elem_size; /* size of each element in bytes */ |
106 | u32 hashrnd; |
107 | struct lock_class_key lockdep_key; |
108 | int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; |
109 | }; |
110 | |
111 | /* each htab element is struct htab_elem + key + value */ |
112 | struct htab_elem { |
113 | union { |
114 | struct hlist_nulls_node hash_node; |
115 | struct { |
116 | void *padding; |
117 | union { |
118 | struct pcpu_freelist_node fnode; |
119 | struct htab_elem *batch_flink; |
120 | }; |
121 | }; |
122 | }; |
123 | union { |
124 | /* pointer to per-cpu pointer */ |
125 | void *ptr_to_pptr; |
126 | struct bpf_lru_node lru_node; |
127 | }; |
128 | u32 hash; |
129 | char key[] __aligned(8); |
130 | }; |
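
/*
 * For illustration, a non-percpu element with key_size = 4 and
 * value_size = 12 (hypothetical sizes) is laid out as computed in
 * htab_map_alloc() below:
 *
 *	struct htab_elem	header (list node, hash, ...)
 *	key[0..3] + 4 bytes	key, padded to round_up(key_size, 8) = 8
 *	value[0..11] + 4 bytes	value at key + 8, rounded to 16 bytes
 *
 * so elem_size = sizeof(struct htab_elem) + 8 + 16.  For percpu maps the
 * value area is replaced by a single pointer to the per-CPU storage, see
 * htab_elem_set_ptr()/htab_elem_get_ptr().
 */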
131 | |
132 | static inline bool htab_is_prealloc(const struct bpf_htab *htab) |
133 | { |
134 | return !(htab->map.map_flags & BPF_F_NO_PREALLOC); |
135 | } |
136 | |
137 | static void htab_init_buckets(struct bpf_htab *htab) |
138 | { |
139 | unsigned int i; |
140 | |
141 | for (i = 0; i < htab->n_buckets; i++) { |
142 | INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); |
143 | raw_spin_lock_init(&htab->buckets[i].raw_lock); |
144 | lockdep_set_class(&htab->buckets[i].raw_lock, |
145 | &htab->lockdep_key); |
146 | cond_resched(); |
147 | } |
148 | } |
149 | |
150 | static inline int htab_lock_bucket(const struct bpf_htab *htab, |
151 | struct bucket *b, u32 hash, |
152 | unsigned long *pflags) |
153 | { |
154 | unsigned long flags; |
155 | |
156 | hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); |
157 | |
158 | preempt_disable(); |
159 | local_irq_save(flags); |
160 | if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { |
161 | __this_cpu_dec(*(htab->map_locked[hash])); |
162 | local_irq_restore(flags); |
163 | preempt_enable(); |
164 | return -EBUSY; |
165 | } |
166 | |
167 | raw_spin_lock(&b->raw_lock); |
168 | *pflags = flags; |
169 | |
170 | return 0; |
171 | } |
172 | |
173 | static inline void htab_unlock_bucket(const struct bpf_htab *htab, |
174 | struct bucket *b, u32 hash, |
175 | unsigned long flags) |
176 | { |
177 | hash = hash & min_t(u32, HASHTAB_MAP_LOCK_MASK, htab->n_buckets - 1); |
178 | raw_spin_unlock(&b->raw_lock); |
179 | __this_cpu_dec(*(htab->map_locked[hash])); |
180 | local_irq_restore(flags); |
181 | preempt_enable(); |
182 | } |
183 | |
184 | static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); |
185 | |
186 | static bool htab_is_lru(const struct bpf_htab *htab) |
187 | { |
188 | return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || |
189 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
190 | } |
191 | |
192 | static bool htab_is_percpu(const struct bpf_htab *htab) |
193 | { |
194 | return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || |
195 | htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; |
196 | } |
197 | |
198 | static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, |
199 | void __percpu *pptr) |
200 | { |
201 | *(void __percpu **)(l->key + key_size) = pptr; |
202 | } |
203 | |
204 | static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) |
205 | { |
206 | return *(void __percpu **)(l->key + key_size); |
207 | } |
208 | |
209 | static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) |
210 | { |
211 | return *(void **)(l->key + roundup(map->key_size, 8)); |
212 | } |
213 | |
214 | static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) |
215 | { |
216 | return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); |
217 | } |
218 | |
219 | static bool htab_has_extra_elems(struct bpf_htab *htab) |
220 | { |
221 | return !htab_is_percpu(htab) && !htab_is_lru(htab); |
222 | } |
223 | |
224 | static void htab_free_prealloced_timers(struct bpf_htab *htab) |
225 | { |
226 | u32 num_entries = htab->map.max_entries; |
227 | int i; |
228 | |
229 | 	if (!btf_record_has_field(htab->map.record, BPF_TIMER)) |
230 | return; |
231 | if (htab_has_extra_elems(htab)) |
232 | num_entries += num_possible_cpus(); |
233 | |
234 | for (i = 0; i < num_entries; i++) { |
235 | struct htab_elem *elem; |
236 | |
237 | elem = get_htab_elem(htab, i); |
238 | 		bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); |
239 | cond_resched(); |
240 | } |
241 | } |
242 | |
243 | static void htab_free_prealloced_fields(struct bpf_htab *htab) |
244 | { |
245 | u32 num_entries = htab->map.max_entries; |
246 | int i; |
247 | |
248 | 	if (IS_ERR_OR_NULL(htab->map.record)) |
249 | return; |
250 | if (htab_has_extra_elems(htab)) |
251 | num_entries += num_possible_cpus(); |
252 | for (i = 0; i < num_entries; i++) { |
253 | struct htab_elem *elem; |
254 | |
255 | elem = get_htab_elem(htab, i); |
256 | if (htab_is_percpu(htab)) { |
257 | 			void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); |
258 | int cpu; |
259 | |
260 | for_each_possible_cpu(cpu) { |
261 | 				bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); |
262 | cond_resched(); |
263 | } |
264 | } else { |
265 | 			bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); |
266 | cond_resched(); |
267 | } |
268 | cond_resched(); |
269 | } |
270 | } |
271 | |
272 | static void htab_free_elems(struct bpf_htab *htab) |
273 | { |
274 | int i; |
275 | |
276 | if (!htab_is_percpu(htab)) |
277 | goto free_elems; |
278 | |
279 | for (i = 0; i < htab->map.max_entries; i++) { |
280 | void __percpu *pptr; |
281 | |
282 | 		pptr = htab_elem_get_ptr(get_htab_elem(htab, i), |
283 | 					 htab->map.key_size); |
284 | 		free_percpu(pptr); |
285 | cond_resched(); |
286 | } |
287 | free_elems: |
288 | 	bpf_map_area_free(htab->elems); |
289 | } |
290 | |
291 | /* The LRU list has a lock (lru_lock). Each htab bucket has a lock |
292 | * (bucket_lock). If both locks need to be acquired together, the lock |
293 | * order is always lru_lock -> bucket_lock and this only happens in |
294 | * bpf_lru_list.c logic. For example, certain code path of |
295 | * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(), |
296 | * will acquire lru_lock first followed by acquiring bucket_lock. |
297 | * |
298 | * In hashtab.c, to avoid deadlock, lock acquisition of |
299 | * bucket_lock followed by lru_lock is not allowed. In such cases, |
300 | * bucket_lock needs to be released first before acquiring lru_lock. |
301 | */ |
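
/*
 * A sketch of the only place both locks nest, per the comment above
 * (call chain reconstructed from prealloc_lru_pop() and bpf_lru_list.c):
 *
 *	prealloc_lru_pop()
 *	  bpf_lru_pop_free()		// may take lru_lock
 *	    htab_lru_map_delete_node()	// called back to evict an element
 *	      htab_lock_bucket()	// takes bucket_lock under lru_lock
 */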
302 | static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, |
303 | u32 hash) |
304 | { |
305 | 	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); |
306 | struct htab_elem *l; |
307 | |
308 | if (node) { |
309 | 		bpf_map_inc_elem_count(&htab->map); |
310 | l = container_of(node, struct htab_elem, lru_node); |
311 | memcpy(l->key, key, htab->map.key_size); |
312 | return l; |
313 | } |
314 | |
315 | return NULL; |
316 | } |
317 | |
318 | static int prealloc_init(struct bpf_htab *htab) |
319 | { |
320 | u32 num_entries = htab->map.max_entries; |
321 | int err = -ENOMEM, i; |
322 | |
323 | if (htab_has_extra_elems(htab)) |
324 | num_entries += num_possible_cpus(); |
325 | |
326 | 	htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, |
327 | 					 htab->map.numa_node); |
328 | if (!htab->elems) |
329 | return -ENOMEM; |
330 | |
331 | if (!htab_is_percpu(htab)) |
332 | goto skip_percpu_elems; |
333 | |
334 | for (i = 0; i < num_entries; i++) { |
335 | u32 size = round_up(htab->map.value_size, 8); |
336 | void __percpu *pptr; |
337 | |
338 | 		pptr = bpf_map_alloc_percpu(&htab->map, size, 8, |
339 | GFP_USER | __GFP_NOWARN); |
340 | if (!pptr) |
341 | goto free_elems; |
342 | 		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, |
343 | pptr); |
344 | cond_resched(); |
345 | } |
346 | |
347 | skip_percpu_elems: |
348 | if (htab_is_lru(htab)) |
349 | 		err = bpf_lru_init(&htab->lru, |
350 | 				   htab->map.map_flags & BPF_F_NO_COMMON_LRU, |
351 | 				   offsetof(struct htab_elem, hash) - |
352 | 				   offsetof(struct htab_elem, lru_node), |
353 | 				   htab_lru_map_delete_node, |
354 | 				   htab); |
355 | else |
356 | err = pcpu_freelist_init(&htab->freelist); |
357 | |
358 | if (err) |
359 | goto free_elems; |
360 | |
361 | if (htab_is_lru(htab)) |
362 | 		bpf_lru_populate(&htab->lru, htab->elems, |
363 | 				 offsetof(struct htab_elem, lru_node), |
364 | 				 htab->elem_size, num_entries); |
365 | else |
366 | 		pcpu_freelist_populate(&htab->freelist, |
367 | 				       htab->elems + offsetof(struct htab_elem, fnode), |
368 | 				       htab->elem_size, num_entries); |
369 | |
370 | return 0; |
371 | |
372 | free_elems: |
373 | htab_free_elems(htab); |
374 | return err; |
375 | } |
376 | |
377 | static void prealloc_destroy(struct bpf_htab *htab) |
378 | { |
379 | htab_free_elems(htab); |
380 | |
381 | if (htab_is_lru(htab)) |
382 | 		bpf_lru_destroy(&htab->lru); |
383 | 	else |
384 | 		pcpu_freelist_destroy(&htab->freelist); |
385 | } |
386 | |
387 | static int alloc_extra_elems(struct bpf_htab *htab) |
388 | { |
389 | struct htab_elem *__percpu *pptr, *l_new; |
390 | struct pcpu_freelist_node *l; |
391 | int cpu; |
392 | |
393 | 	pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, |
394 | GFP_USER | __GFP_NOWARN); |
395 | if (!pptr) |
396 | return -ENOMEM; |
397 | |
398 | for_each_possible_cpu(cpu) { |
399 | l = pcpu_freelist_pop(&htab->freelist); |
400 | /* pop will succeed, since prealloc_init() |
401 | * preallocated extra num_possible_cpus elements |
402 | */ |
403 | l_new = container_of(l, struct htab_elem, fnode); |
404 | *per_cpu_ptr(pptr, cpu) = l_new; |
405 | } |
406 | htab->extra_elems = pptr; |
407 | return 0; |
408 | } |
409 | |
410 | /* Called from syscall */ |
411 | static int htab_map_alloc_check(union bpf_attr *attr) |
412 | { |
413 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
414 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
415 | bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || |
416 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
417 | /* percpu_lru means each cpu has its own LRU list. |
418 | * it is different from BPF_MAP_TYPE_PERCPU_HASH where |
419 | * the map's value itself is percpu. percpu_lru has |
420 | * nothing to do with the map's value. |
421 | */ |
422 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); |
423 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); |
424 | bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); |
425 | int numa_node = bpf_map_attr_numa_node(attr); |
426 | |
427 | BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != |
428 | offsetof(struct htab_elem, hash_node.pprev)); |
429 | |
430 | if (zero_seed && !capable(CAP_SYS_ADMIN)) |
431 | /* Guard against local DoS, and discourage production use. */ |
432 | return -EPERM; |
433 | |
434 | if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || |
435 | 	    !bpf_map_flags_access_ok(attr->map_flags)) |
436 | return -EINVAL; |
437 | |
438 | if (!lru && percpu_lru) |
439 | return -EINVAL; |
440 | |
441 | if (lru && !prealloc) |
442 | return -ENOTSUPP; |
443 | |
444 | if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) |
445 | return -EINVAL; |
446 | |
447 | /* check sanity of attributes. |
448 | * value_size == 0 may be allowed in the future to use map as a set |
449 | */ |
450 | if (attr->max_entries == 0 || attr->key_size == 0 || |
451 | attr->value_size == 0) |
452 | return -EINVAL; |
453 | |
454 | if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - |
455 | sizeof(struct htab_elem)) |
456 | /* if key_size + value_size is bigger, the user space won't be |
457 | * able to access the elements via bpf syscall. This check |
458 | * also makes sure that the elem_size doesn't overflow and it's |
459 | * kmalloc-able later in htab_map_update_elem() |
460 | */ |
461 | return -E2BIG; |
462 | |
463 | return 0; |
464 | } |
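
/*
 * For reference, a minimal user space sketch (using libbpf's
 * bpf_map_create(); the map name and sizes are illustrative only) that
 * exercises the checks above.  BPF_F_ZERO_SEED additionally requires
 * CAP_SYS_ADMIN:
 *
 *	LIBBPF_OPTS(bpf_map_create_opts, opts,
 *		    .map_flags = BPF_F_NO_PREALLOC);
 *	int fd = bpf_map_create(BPF_MAP_TYPE_HASH, "example_htab",
 *				sizeof(__u32), sizeof(__u64), 1024, &opts);
 *	if (fd < 0)
 *		// -EINVAL/-E2BIG/-EPERM map to the conditions checked above
 *		handle_error();
 */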
465 | |
466 | static struct bpf_map *htab_map_alloc(union bpf_attr *attr) |
467 | { |
468 | bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
469 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
470 | bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || |
471 | attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); |
472 | /* percpu_lru means each cpu has its own LRU list. |
473 | * it is different from BPF_MAP_TYPE_PERCPU_HASH where |
474 | * the map's value itself is percpu. percpu_lru has |
475 | * nothing to do with the map's value. |
476 | */ |
477 | bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); |
478 | bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); |
479 | struct bpf_htab *htab; |
480 | int err, i; |
481 | |
482 | 	htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE); |
483 | 	if (!htab) |
484 | 		return ERR_PTR(-ENOMEM); |
485 | |
486 | 	lockdep_register_key(&htab->lockdep_key); |
487 | |
488 | 	bpf_map_init_from_attr(&htab->map, attr); |
489 | |
490 | if (percpu_lru) { |
491 | /* ensure each CPU's lru list has >=1 elements. |
492 | 		 * while we are at it, make each lru list have the same |
493 | * number of elements. |
494 | */ |
495 | htab->map.max_entries = roundup(attr->max_entries, |
496 | num_possible_cpus()); |
497 | if (htab->map.max_entries < attr->max_entries) |
498 | htab->map.max_entries = rounddown(attr->max_entries, |
499 | num_possible_cpus()); |
500 | } |
501 | |
502 | /* hash table size must be power of 2; roundup_pow_of_two() can overflow |
503 | * into UB on 32-bit arches, so check that first |
504 | */ |
505 | err = -E2BIG; |
506 | if (htab->map.max_entries > 1UL << 31) |
507 | goto free_htab; |
508 | |
509 | htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); |
510 | |
511 | htab->elem_size = sizeof(struct htab_elem) + |
512 | round_up(htab->map.key_size, 8); |
513 | if (percpu) |
514 | htab->elem_size += sizeof(void *); |
515 | else |
516 | htab->elem_size += round_up(htab->map.value_size, 8); |
517 | |
518 | /* check for u32 overflow */ |
519 | if (htab->n_buckets > U32_MAX / sizeof(struct bucket)) |
520 | goto free_htab; |
521 | |
522 | 	err = bpf_map_init_elem_count(&htab->map); |
523 | if (err) |
524 | goto free_htab; |
525 | |
526 | err = -ENOMEM; |
527 | 	htab->buckets = bpf_map_area_alloc(htab->n_buckets * |
528 | 					   sizeof(struct bucket), |
529 | 					   htab->map.numa_node); |
530 | if (!htab->buckets) |
531 | goto free_elem_count; |
532 | |
533 | for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { |
534 | 		htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, |
535 | 							   sizeof(int), |
536 | 							   sizeof(int), |
537 | GFP_USER); |
538 | if (!htab->map_locked[i]) |
539 | goto free_map_locked; |
540 | } |
541 | |
542 | if (htab->map.map_flags & BPF_F_ZERO_SEED) |
543 | htab->hashrnd = 0; |
544 | else |
545 | htab->hashrnd = get_random_u32(); |
546 | |
547 | htab_init_buckets(htab); |
548 | |
549 | /* compute_batch_value() computes batch value as num_online_cpus() * 2 |
550 | * and __percpu_counter_compare() needs |
551 | * htab->max_entries - cur_number_of_elems to be more than batch * num_online_cpus() |
552 | * for percpu_counter to be faster than atomic_t. In practice the average bpf |
553 | * hash map size is 10k, which means that a system with 64 cpus will fill |
554 | * hashmap to 20% of 10k before percpu_counter becomes ineffective. Therefore |
555 | * define our own batch count as 32 then 10k hash map can be filled up to 80%: |
556 | * 10k - 8k > 32 _batch_ * 64 _cpus_ |
557 | * and __percpu_counter_compare() will still be fast. At that point hash map |
558 | * collisions will dominate its performance anyway. Assume that hash map filled |
559 | * to 50+% isn't going to be O(1) and use the following formula to choose |
560 | * between percpu_counter and atomic_t. |
561 | */ |
562 | #define PERCPU_COUNTER_BATCH 32 |
563 | if (attr->max_entries / 2 > num_online_cpus() * PERCPU_COUNTER_BATCH) |
564 | htab->use_percpu_counter = true; |
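
	/* Worked example of the check above (illustrative numbers): with
	 * max_entries = 10000 and 64 online CPUs, 10000 / 2 = 5000 is
	 * greater than 64 * 32 = 2048, so the percpu_counter is used; a
	 * small map (say 1024 entries) on the same machine falls back to
	 * the plain atomic_t count.
	 */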
565 | |
566 | if (htab->use_percpu_counter) { |
567 | err = percpu_counter_init(&htab->pcount, 0, GFP_KERNEL); |
568 | if (err) |
569 | goto free_map_locked; |
570 | } |
571 | |
572 | if (prealloc) { |
573 | err = prealloc_init(htab); |
574 | if (err) |
575 | goto free_map_locked; |
576 | |
577 | if (!percpu && !lru) { |
578 | /* lru itself can remove the least used element, so |
579 | * there is no need for an extra elem during map_update. |
580 | */ |
581 | err = alloc_extra_elems(htab); |
582 | if (err) |
583 | goto free_prealloc; |
584 | } |
585 | } else { |
586 | 		err = bpf_mem_alloc_init(&htab->ma, htab->elem_size, false); |
587 | if (err) |
588 | goto free_map_locked; |
589 | if (percpu) { |
590 | 			err = bpf_mem_alloc_init(&htab->pcpu_ma, |
591 | 						 round_up(htab->map.value_size, 8), true); |
592 | if (err) |
593 | goto free_map_locked; |
594 | } |
595 | } |
596 | |
597 | return &htab->map; |
598 | |
599 | free_prealloc: |
600 | prealloc_destroy(htab); |
601 | free_map_locked: |
602 | if (htab->use_percpu_counter) |
603 | 		percpu_counter_destroy(&htab->pcount); |
604 | 	for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) |
605 | 		free_percpu(htab->map_locked[i]); |
606 | 	bpf_map_area_free(htab->buckets); |
607 | 	bpf_mem_alloc_destroy(&htab->pcpu_ma); |
608 | 	bpf_mem_alloc_destroy(&htab->ma); |
609 | free_elem_count: |
610 | 	bpf_map_free_elem_count(&htab->map); |
611 | free_htab: |
612 | 	lockdep_unregister_key(&htab->lockdep_key); |
613 | 	bpf_map_area_free(htab); |
614 | 	return ERR_PTR(err); |
615 | } |
616 | |
617 | static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) |
618 | { |
619 | if (likely(key_len % 4 == 0)) |
620 | 		return jhash2(key, key_len / 4, hashrnd); |
621 | 	return jhash(key, key_len, hashrnd); |
622 | } |
623 | |
624 | static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) |
625 | { |
626 | return &htab->buckets[hash & (htab->n_buckets - 1)]; |
627 | } |
628 | |
629 | static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) |
630 | { |
631 | return &__select_bucket(htab, hash)->head; |
632 | } |
633 | |
634 | /* this lookup function can only be called with bucket lock taken */ |
635 | static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, |
636 | void *key, u32 key_size) |
637 | { |
638 | struct hlist_nulls_node *n; |
639 | struct htab_elem *l; |
640 | |
641 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
642 | 		if (l->hash == hash && !memcmp(&l->key, key, key_size)) |
643 | return l; |
644 | |
645 | return NULL; |
646 | } |
647 | |
648 | /* can be called without the bucket lock. it will repeat the loop in |
649 |  * the unlikely event that elements moved from one bucket into another |
650 |  * while the linked list is being walked |
651 | */ |
652 | static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, |
653 | u32 hash, void *key, |
654 | u32 key_size, u32 n_buckets) |
655 | { |
656 | struct hlist_nulls_node *n; |
657 | struct htab_elem *l; |
658 | |
659 | again: |
660 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
661 | 		if (l->hash == hash && !memcmp(&l->key, key, key_size)) |
662 | return l; |
663 | |
664 | if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) |
665 | goto again; |
666 | |
667 | return NULL; |
668 | } |
669 | |
670 | /* Called from syscall or from eBPF program directly, so |
671 | * arguments have to match bpf_map_lookup_elem() exactly. |
672 | * The return value is adjusted by BPF instructions |
673 | * in htab_map_gen_lookup(). |
674 | */ |
675 | static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) |
676 | { |
677 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
678 | struct hlist_nulls_head *head; |
679 | struct htab_elem *l; |
680 | u32 hash, key_size; |
681 | |
682 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
683 | !rcu_read_lock_bh_held()); |
684 | |
685 | key_size = map->key_size; |
686 | |
687 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
688 | |
689 | head = select_bucket(htab, hash); |
690 | |
691 | 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); |
692 | |
693 | return l; |
694 | } |
695 | |
696 | static void *htab_map_lookup_elem(struct bpf_map *map, void *key) |
697 | { |
698 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
699 | |
700 | if (l) |
701 | return l->key + round_up(map->key_size, 8); |
702 | |
703 | return NULL; |
704 | } |
705 | |
706 | /* inline bpf_map_lookup_elem() call. |
707 | * Instead of: |
708 | * bpf_prog |
709 | * bpf_map_lookup_elem |
710 | * map->ops->map_lookup_elem |
711 | * htab_map_lookup_elem |
712 | * __htab_map_lookup_elem |
713 | * do: |
714 | * bpf_prog |
715 | * __htab_map_lookup_elem |
716 | */ |
717 | static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) |
718 | { |
719 | struct bpf_insn *insn = insn_buf; |
720 | const int ret = BPF_REG_0; |
721 | |
722 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
723 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
724 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
725 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); |
726 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
727 | offsetof(struct htab_elem, key) + |
728 | round_up(map->key_size, 8)); |
729 | return insn - insn_buf; |
730 | } |
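
/*
 * The sequence emitted above corresponds roughly to (pseudo eBPF assembly,
 * shown for illustration; the actual immediate depends on map->key_size):
 *
 *	call __htab_map_lookup_elem	// R0 = elem or NULL
 *	if R0 == 0 goto +1		// miss: leave R0 = NULL
 *	R0 += offsetof(struct htab_elem, key) + round_up(key_size, 8)
 */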
731 | |
732 | static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, |
733 | void *key, const bool mark) |
734 | { |
735 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
736 | |
737 | if (l) { |
738 | if (mark) |
739 | 			bpf_lru_node_set_ref(&l->lru_node); |
740 | return l->key + round_up(map->key_size, 8); |
741 | } |
742 | |
743 | return NULL; |
744 | } |
745 | |
746 | static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) |
747 | { |
748 | 	return __htab_lru_map_lookup_elem(map, key, true); |
749 | } |
750 | |
751 | static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) |
752 | { |
753 | 	return __htab_lru_map_lookup_elem(map, key, false); |
754 | } |
755 | |
756 | static int htab_lru_map_gen_lookup(struct bpf_map *map, |
757 | struct bpf_insn *insn_buf) |
758 | { |
759 | struct bpf_insn *insn = insn_buf; |
760 | const int ret = BPF_REG_0; |
761 | const int ref_reg = BPF_REG_1; |
762 | |
763 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
764 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
765 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
766 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); |
767 | *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, |
768 | offsetof(struct htab_elem, lru_node) + |
769 | offsetof(struct bpf_lru_node, ref)); |
770 | *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); |
771 | *insn++ = BPF_ST_MEM(BPF_B, ret, |
772 | offsetof(struct htab_elem, lru_node) + |
773 | offsetof(struct bpf_lru_node, ref), |
774 | 1); |
775 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
776 | offsetof(struct htab_elem, key) + |
777 | round_up(map->key_size, 8)); |
778 | return insn - insn_buf; |
779 | } |
780 | |
781 | static void check_and_free_fields(struct bpf_htab *htab, |
782 | struct htab_elem *elem) |
783 | { |
784 | if (htab_is_percpu(htab)) { |
785 | 		void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size); |
786 | int cpu; |
787 | |
788 | for_each_possible_cpu(cpu) |
789 | 			bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu)); |
790 | } else { |
791 | void *map_value = elem->key + round_up(htab->map.key_size, 8); |
792 | |
793 | 		bpf_obj_free_fields(htab->map.record, map_value); |
794 | } |
795 | } |
796 | |
797 | /* It is called from the bpf_lru_list when the LRU needs to delete |
798 | * older elements from the htab. |
799 | */ |
800 | static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) |
801 | { |
802 | struct bpf_htab *htab = arg; |
803 | struct htab_elem *l = NULL, *tgt_l; |
804 | struct hlist_nulls_head *head; |
805 | struct hlist_nulls_node *n; |
806 | unsigned long flags; |
807 | struct bucket *b; |
808 | int ret; |
809 | |
810 | tgt_l = container_of(node, struct htab_elem, lru_node); |
811 | 	b = __select_bucket(htab, tgt_l->hash); |
812 | head = &b->head; |
813 | |
814 | 	ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); |
815 | if (ret) |
816 | return false; |
817 | |
818 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
819 | if (l == tgt_l) { |
820 | 			hlist_nulls_del_rcu(&l->hash_node); |
821 | 			check_and_free_fields(htab, l); |
822 | 			bpf_map_dec_elem_count(&htab->map); |
823 | break; |
824 | } |
825 | |
826 | 	htab_unlock_bucket(htab, b, tgt_l->hash, flags); |
827 | |
828 | return l == tgt_l; |
829 | } |
830 | |
831 | /* Called from syscall */ |
832 | static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) |
833 | { |
834 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
835 | struct hlist_nulls_head *head; |
836 | struct htab_elem *l, *next_l; |
837 | u32 hash, key_size; |
838 | int i = 0; |
839 | |
840 | WARN_ON_ONCE(!rcu_read_lock_held()); |
841 | |
842 | key_size = map->key_size; |
843 | |
844 | if (!key) |
845 | goto find_first_elem; |
846 | |
847 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
848 | |
849 | head = select_bucket(htab, hash); |
850 | |
851 | /* lookup the key */ |
852 | 	l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); |
853 | |
854 | if (!l) |
855 | goto find_first_elem; |
856 | |
857 | /* key was found, get next key in the same bucket */ |
858 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), |
859 | struct htab_elem, hash_node); |
860 | |
861 | if (next_l) { |
862 | /* if next elem in this hash list is non-zero, just return it */ |
863 | memcpy(next_key, next_l->key, key_size); |
864 | return 0; |
865 | } |
866 | |
867 | /* no more elements in this hash list, go to the next bucket */ |
868 | i = hash & (htab->n_buckets - 1); |
869 | i++; |
870 | |
871 | find_first_elem: |
872 | /* iterate over buckets */ |
873 | for (; i < htab->n_buckets; i++) { |
874 | 		head = select_bucket(htab, i); |
875 | |
876 | /* pick first element in the bucket */ |
877 | next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), |
878 | struct htab_elem, hash_node); |
879 | if (next_l) { |
880 | /* if it's not empty, just return it */ |
881 | memcpy(next_key, next_l->key, key_size); |
882 | return 0; |
883 | } |
884 | } |
885 | |
886 | /* iterated over all buckets and all elements */ |
887 | return -ENOENT; |
888 | } |
889 | |
890 | static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) |
891 | { |
892 | 	check_and_free_fields(htab, l); |
893 | 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) |
894 | 		bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr); |
895 | 	bpf_mem_cache_free(&htab->ma, l); |
896 | } |
897 | |
898 | static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) |
899 | { |
900 | struct bpf_map *map = &htab->map; |
901 | void *ptr; |
902 | |
903 | if (map->ops->map_fd_put_ptr) { |
904 | ptr = fd_htab_map_get_ptr(map, l); |
905 | map->ops->map_fd_put_ptr(map, ptr, true); |
906 | } |
907 | } |
908 | |
909 | static bool is_map_full(struct bpf_htab *htab) |
910 | { |
911 | if (htab->use_percpu_counter) |
912 | 		return __percpu_counter_compare(&htab->pcount, htab->map.max_entries, |
913 | PERCPU_COUNTER_BATCH) >= 0; |
914 | 	return atomic_read(&htab->count) >= htab->map.max_entries; |
915 | } |
916 | |
917 | static void inc_elem_count(struct bpf_htab *htab) |
918 | { |
919 | 	bpf_map_inc_elem_count(&htab->map); |
920 | |
921 | if (htab->use_percpu_counter) |
922 | 		percpu_counter_add_batch(&htab->pcount, 1, PERCPU_COUNTER_BATCH); |
923 | else |
924 | 		atomic_inc(&htab->count); |
925 | } |
926 | |
927 | static void dec_elem_count(struct bpf_htab *htab) |
928 | { |
929 | 	bpf_map_dec_elem_count(&htab->map); |
930 | |
931 | if (htab->use_percpu_counter) |
932 | 		percpu_counter_add_batch(&htab->pcount, -1, PERCPU_COUNTER_BATCH); |
933 | else |
934 | 		atomic_dec(&htab->count); |
935 | } |
936 | |
937 | |
938 | static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) |
939 | { |
940 | htab_put_fd_value(htab, l); |
941 | |
942 | if (htab_is_prealloc(htab)) { |
943 | 		bpf_map_dec_elem_count(&htab->map); |
944 | 		check_and_free_fields(htab, l); |
945 | __pcpu_freelist_push(&htab->freelist, &l->fnode); |
946 | } else { |
947 | dec_elem_count(htab); |
948 | htab_elem_free(htab, l); |
949 | } |
950 | } |
951 | |
952 | static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, |
953 | void *value, bool onallcpus) |
954 | { |
955 | if (!onallcpus) { |
956 | /* copy true value_size bytes */ |
957 | 		copy_map_value(&htab->map, this_cpu_ptr(pptr), value); |
958 | } else { |
959 | u32 size = round_up(htab->map.value_size, 8); |
960 | int off = 0, cpu; |
961 | |
962 | for_each_possible_cpu(cpu) { |
963 | 			copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off); |
964 | off += size; |
965 | } |
966 | } |
967 | } |
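
/*
 * A sketch of the onallcpus == true case above: the caller (the per-cpu
 * map update path of the bpf syscall) passes one flat buffer holding a
 * value for every possible CPU, each slot padded to round_up(value_size, 8)
 * bytes, consumed in for_each_possible_cpu() order:
 *
 *	value:  | cpu A value | pad | cpu B value | pad | cpu C value | ...
 *	off:    0              size                2*size
 *
 * so each successive possible CPU receives the slot at value + off.
 */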
968 | |
969 | static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, |
970 | void *value, bool onallcpus) |
971 | { |
972 | /* When not setting the initial value on all cpus, zero-fill element |
973 | * values for other cpus. Otherwise, bpf program has no way to ensure |
974 | * known initial values for cpus other than current one |
975 | * (onallcpus=false always when coming from bpf prog). |
976 | */ |
977 | if (!onallcpus) { |
978 | int current_cpu = raw_smp_processor_id(); |
979 | int cpu; |
980 | |
981 | for_each_possible_cpu(cpu) { |
982 | if (cpu == current_cpu) |
983 | 				copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value); |
984 | 			else /* Since elem is preallocated, we cannot touch special fields */ |
985 | 				zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu)); |
986 | } |
987 | } else { |
988 | pcpu_copy_value(htab, pptr, value, onallcpus); |
989 | } |
990 | } |
991 | |
992 | static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) |
993 | { |
994 | return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && |
995 | BITS_PER_LONG == 64; |
996 | } |
997 | |
998 | static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, |
999 | void *value, u32 key_size, u32 hash, |
1000 | bool percpu, bool onallcpus, |
1001 | struct htab_elem *old_elem) |
1002 | { |
1003 | u32 size = htab->map.value_size; |
1004 | bool prealloc = htab_is_prealloc(htab); |
1005 | struct htab_elem *l_new, **pl_new; |
1006 | void __percpu *pptr; |
1007 | |
1008 | if (prealloc) { |
1009 | if (old_elem) { |
1010 | /* if we're updating the existing element, |
1011 | * use per-cpu extra elems to avoid freelist_pop/push |
1012 | */ |
1013 | pl_new = this_cpu_ptr(htab->extra_elems); |
1014 | l_new = *pl_new; |
1015 | 			htab_put_fd_value(htab, old_elem); |
1016 | *pl_new = old_elem; |
1017 | } else { |
1018 | struct pcpu_freelist_node *l; |
1019 | |
1020 | l = __pcpu_freelist_pop(&htab->freelist); |
1021 | if (!l) |
1022 | 				return ERR_PTR(-E2BIG); |
1023 | 			l_new = container_of(l, struct htab_elem, fnode); |
1024 | 			bpf_map_inc_elem_count(&htab->map); |
1025 | } |
1026 | } else { |
1027 | if (is_map_full(htab)) |
1028 | if (!old_elem) |
1029 | /* when map is full and update() is replacing |
1030 | * old element, it's ok to allocate, since |
1031 | * old element will be freed immediately. |
1032 | * Otherwise return an error |
1033 | */ |
1034 | 				return ERR_PTR(-E2BIG); |
1035 | inc_elem_count(htab); |
1036 | 		l_new = bpf_mem_cache_alloc(&htab->ma); |
1037 | if (!l_new) { |
1038 | 			l_new = ERR_PTR(-ENOMEM); |
1039 | goto dec_count; |
1040 | } |
1041 | } |
1042 | |
1043 | memcpy(l_new->key, key, key_size); |
1044 | if (percpu) { |
1045 | if (prealloc) { |
1046 | 			pptr = htab_elem_get_ptr(l_new, key_size); |
1047 | } else { |
1048 | /* alloc_percpu zero-fills */ |
1049 | 			pptr = bpf_mem_cache_alloc(&htab->pcpu_ma); |
1050 | if (!pptr) { |
1051 | 				bpf_mem_cache_free(&htab->ma, l_new); |
1052 | 				l_new = ERR_PTR(-ENOMEM); |
1053 | goto dec_count; |
1054 | } |
1055 | l_new->ptr_to_pptr = pptr; |
1056 | pptr = *(void **)pptr; |
1057 | } |
1058 | |
1059 | pcpu_init_value(htab, pptr, value, onallcpus); |
1060 | |
1061 | if (!prealloc) |
1062 | 			htab_elem_set_ptr(l_new, key_size, pptr); |
1063 | } else if (fd_htab_map_needs_adjust(htab)) { |
1064 | size = round_up(size, 8); |
1065 | memcpy(l_new->key + round_up(key_size, 8), value, size); |
1066 | } else { |
1067 | 		copy_map_value(&htab->map, |
1068 | 			       l_new->key + round_up(key_size, 8), |
1069 | 			       value); |
1070 | } |
1071 | |
1072 | l_new->hash = hash; |
1073 | return l_new; |
1074 | dec_count: |
1075 | dec_elem_count(htab); |
1076 | return l_new; |
1077 | } |
1078 | |
1079 | static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, |
1080 | u64 map_flags) |
1081 | { |
1082 | if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) |
1083 | /* elem already exists */ |
1084 | return -EEXIST; |
1085 | |
1086 | if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) |
1087 | /* elem doesn't exist, cannot update it */ |
1088 | return -ENOENT; |
1089 | |
1090 | return 0; |
1091 | } |
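
/*
 * Summary of the update flags handled above and in the callers, with a
 * hedged user space sketch (libbpf's bpf_map_update_elem(); the map fd,
 * key and value are illustrative):
 *
 *	BPF_ANY      create the element or update an existing one
 *	BPF_NOEXIST  create only; -EEXIST if the key is already present
 *	BPF_EXIST    update only; -ENOENT if the key is absent
 *	BPF_F_LOCK   update under the bpf_spin_lock embedded in the value
 *
 *	err = bpf_map_update_elem(map_fd, &key, &value, BPF_NOEXIST);
 */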
1092 | |
1093 | /* Called from syscall or from eBPF program */ |
1094 | static long htab_map_update_elem(struct bpf_map *map, void *key, void *value, |
1095 | u64 map_flags) |
1096 | { |
1097 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1098 | struct htab_elem *l_new = NULL, *l_old; |
1099 | struct hlist_nulls_head *head; |
1100 | unsigned long flags; |
1101 | struct bucket *b; |
1102 | u32 key_size, hash; |
1103 | int ret; |
1104 | |
1105 | if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) |
1106 | /* unknown flags */ |
1107 | return -EINVAL; |
1108 | |
1109 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1110 | !rcu_read_lock_bh_held()); |
1111 | |
1112 | key_size = map->key_size; |
1113 | |
1114 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1115 | |
1116 | b = __select_bucket(htab, hash); |
1117 | head = &b->head; |
1118 | |
1119 | if (unlikely(map_flags & BPF_F_LOCK)) { |
1120 | if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) |
1121 | return -EINVAL; |
1122 | /* find an element without taking the bucket lock */ |
1123 | l_old = lookup_nulls_elem_raw(head, hash, key, key_size, |
1124 | 					      htab->n_buckets); |
1125 | ret = check_flags(htab, l_old, map_flags); |
1126 | if (ret) |
1127 | return ret; |
1128 | if (l_old) { |
1129 | /* grab the element lock and update value in place */ |
1130 | copy_map_value_locked(map, |
1131 | 					      l_old->key + round_up(key_size, 8), |
1132 | 					      value, false); |
1133 | return 0; |
1134 | } |
1135 | /* fall through, grab the bucket lock and lookup again. |
1136 | * 99.9% chance that the element won't be found, |
1137 | * but second lookup under lock has to be done. |
1138 | */ |
1139 | } |
1140 | |
1141 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1142 | if (ret) |
1143 | return ret; |
1144 | |
1145 | l_old = lookup_elem_raw(head, hash, key, key_size); |
1146 | |
1147 | ret = check_flags(htab, l_old, map_flags); |
1148 | if (ret) |
1149 | goto err; |
1150 | |
1151 | if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { |
1152 | /* first lookup without the bucket lock didn't find the element, |
1153 | * but second lookup with the bucket lock found it. |
1154 | * This case is highly unlikely, but has to be dealt with: |
1155 | * grab the element lock in addition to the bucket lock |
1156 | * and update element in place |
1157 | */ |
1158 | copy_map_value_locked(map, |
1159 | 				      l_old->key + round_up(key_size, 8), |
1160 | 				      value, false); |
1161 | ret = 0; |
1162 | goto err; |
1163 | } |
1164 | |
1165 | 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, |
1166 | 				l_old); |
1167 | 	if (IS_ERR(l_new)) { |
1168 | /* all pre-allocated elements are in use or memory exhausted */ |
1169 | 		ret = PTR_ERR(l_new); |
1170 | goto err; |
1171 | } |
1172 | |
1173 | /* add new element to the head of the list, so that |
1174 | * concurrent search will find it before old elem |
1175 | */ |
1176 | 	hlist_nulls_add_head_rcu(&l_new->hash_node, head); |
1177 | 	if (l_old) { |
1178 | 		hlist_nulls_del_rcu(&l_old->hash_node); |
1179 | 		if (!htab_is_prealloc(htab)) |
1180 | 			free_htab_elem(htab, l_old); |
1181 | 		else |
1182 | 			check_and_free_fields(htab, l_old); |
1183 | } |
1184 | ret = 0; |
1185 | err: |
1186 | htab_unlock_bucket(htab, b, hash, flags); |
1187 | return ret; |
1188 | } |
1189 | |
1190 | static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem) |
1191 | { |
1192 | check_and_free_fields(htab, elem); |
1193 | 	bpf_map_dec_elem_count(&htab->map); |
1194 | 	bpf_lru_push_free(&htab->lru, &elem->lru_node); |
1195 | } |
1196 | |
1197 | static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, |
1198 | u64 map_flags) |
1199 | { |
1200 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1201 | struct htab_elem *l_new, *l_old = NULL; |
1202 | struct hlist_nulls_head *head; |
1203 | unsigned long flags; |
1204 | struct bucket *b; |
1205 | u32 key_size, hash; |
1206 | int ret; |
1207 | |
1208 | if (unlikely(map_flags > BPF_EXIST)) |
1209 | /* unknown flags */ |
1210 | return -EINVAL; |
1211 | |
1212 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1213 | !rcu_read_lock_bh_held()); |
1214 | |
1215 | key_size = map->key_size; |
1216 | |
1217 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1218 | |
1219 | b = __select_bucket(htab, hash); |
1220 | head = &b->head; |
1221 | |
1222 | /* For LRU, we need to alloc before taking bucket's |
1223 | * spinlock because getting free nodes from LRU may need |
1224 | * to remove older elements from htab and this removal |
1225 | * operation will need a bucket lock. |
1226 | */ |
1227 | l_new = prealloc_lru_pop(htab, key, hash); |
1228 | if (!l_new) |
1229 | return -ENOMEM; |
1230 | 	copy_map_value(&htab->map, |
1231 | 		       l_new->key + round_up(map->key_size, 8), value); |
1232 | |
1233 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1234 | if (ret) |
1235 | goto err_lock_bucket; |
1236 | |
1237 | l_old = lookup_elem_raw(head, hash, key, key_size); |
1238 | |
1239 | ret = check_flags(htab, l_old, map_flags); |
1240 | if (ret) |
1241 | goto err; |
1242 | |
1243 | /* add new element to the head of the list, so that |
1244 | * concurrent search will find it before old elem |
1245 | */ |
1246 | 	hlist_nulls_add_head_rcu(&l_new->hash_node, head); |
1247 | 	if (l_old) { |
1248 | 		bpf_lru_node_set_ref(&l_new->lru_node); |
1249 | 		hlist_nulls_del_rcu(&l_old->hash_node); |
1250 | } |
1251 | ret = 0; |
1252 | |
1253 | err: |
1254 | htab_unlock_bucket(htab, b, hash, flags); |
1255 | |
1256 | err_lock_bucket: |
1257 | if (ret) |
1258 | 		htab_lru_push_free(htab, l_new); |
1259 | else if (l_old) |
1260 | 		htab_lru_push_free(htab, l_old); |
1261 | |
1262 | return ret; |
1263 | } |
1264 | |
1265 | static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key, |
1266 | void *value, u64 map_flags, |
1267 | bool onallcpus) |
1268 | { |
1269 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1270 | struct htab_elem *l_new = NULL, *l_old; |
1271 | struct hlist_nulls_head *head; |
1272 | unsigned long flags; |
1273 | struct bucket *b; |
1274 | u32 key_size, hash; |
1275 | int ret; |
1276 | |
1277 | if (unlikely(map_flags > BPF_EXIST)) |
1278 | /* unknown flags */ |
1279 | return -EINVAL; |
1280 | |
1281 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1282 | !rcu_read_lock_bh_held()); |
1283 | |
1284 | key_size = map->key_size; |
1285 | |
1286 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1287 | |
1288 | b = __select_bucket(htab, hash); |
1289 | head = &b->head; |
1290 | |
1291 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1292 | if (ret) |
1293 | return ret; |
1294 | |
1295 | l_old = lookup_elem_raw(head, hash, key, key_size); |
1296 | |
1297 | ret = check_flags(htab, l_old, map_flags); |
1298 | if (ret) |
1299 | goto err; |
1300 | |
1301 | if (l_old) { |
1302 | /* per-cpu hash map can update value in-place */ |
1303 | 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), |
1304 | value, onallcpus); |
1305 | } else { |
1306 | l_new = alloc_htab_elem(htab, key, value, key_size, |
1307 | 					hash, true, onallcpus, NULL); |
1308 | 		if (IS_ERR(l_new)) { |
1309 | 			ret = PTR_ERR(l_new); |
1310 | goto err; |
1311 | } |
1312 | 		hlist_nulls_add_head_rcu(&l_new->hash_node, head); |
1313 | } |
1314 | ret = 0; |
1315 | err: |
1316 | htab_unlock_bucket(htab, b, hash, flags); |
1317 | return ret; |
1318 | } |
1319 | |
1320 | static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, |
1321 | void *value, u64 map_flags, |
1322 | bool onallcpus) |
1323 | { |
1324 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1325 | struct htab_elem *l_new = NULL, *l_old; |
1326 | struct hlist_nulls_head *head; |
1327 | unsigned long flags; |
1328 | struct bucket *b; |
1329 | u32 key_size, hash; |
1330 | int ret; |
1331 | |
1332 | if (unlikely(map_flags > BPF_EXIST)) |
1333 | /* unknown flags */ |
1334 | return -EINVAL; |
1335 | |
1336 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1337 | !rcu_read_lock_bh_held()); |
1338 | |
1339 | key_size = map->key_size; |
1340 | |
1341 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1342 | |
1343 | b = __select_bucket(htab, hash); |
1344 | head = &b->head; |
1345 | |
1346 | /* For LRU, we need to alloc before taking bucket's |
1347 | * spinlock because LRU's elem alloc may need |
1348 | * to remove older elem from htab and this removal |
1349 | * operation will need a bucket lock. |
1350 | */ |
1351 | if (map_flags != BPF_EXIST) { |
1352 | l_new = prealloc_lru_pop(htab, key, hash); |
1353 | if (!l_new) |
1354 | return -ENOMEM; |
1355 | } |
1356 | |
1357 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1358 | if (ret) |
1359 | goto err_lock_bucket; |
1360 | |
1361 | l_old = lookup_elem_raw(head, hash, key, key_size); |
1362 | |
1363 | ret = check_flags(htab, l_old, map_flags); |
1364 | if (ret) |
1365 | goto err; |
1366 | |
1367 | if (l_old) { |
1368 | 		bpf_lru_node_set_ref(&l_old->lru_node); |
1369 | |
1370 | /* per-cpu hash map can update value in-place */ |
1371 | 		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), |
1372 | value, onallcpus); |
1373 | } else { |
1374 | 		pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), |
1375 | value, onallcpus); |
1376 | 		hlist_nulls_add_head_rcu(&l_new->hash_node, head); |
1377 | l_new = NULL; |
1378 | } |
1379 | ret = 0; |
1380 | err: |
1381 | htab_unlock_bucket(htab, b, hash, flags); |
1382 | err_lock_bucket: |
1383 | if (l_new) { |
1384 | 		bpf_map_dec_elem_count(&htab->map); |
1385 | 		bpf_lru_push_free(&htab->lru, &l_new->lru_node); |
1386 | } |
1387 | return ret; |
1388 | } |
1389 | |
1390 | static long htab_percpu_map_update_elem(struct bpf_map *map, void *key, |
1391 | void *value, u64 map_flags) |
1392 | { |
1393 | 	return __htab_percpu_map_update_elem(map, key, value, map_flags, false); |
1394 | } |
1395 | |
1396 | static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, |
1397 | void *value, u64 map_flags) |
1398 | { |
1399 | return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, |
1400 | 						 false); |
1401 | } |
1402 | |
1403 | /* Called from syscall or from eBPF program */ |
1404 | static long htab_map_delete_elem(struct bpf_map *map, void *key) |
1405 | { |
1406 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1407 | struct hlist_nulls_head *head; |
1408 | struct bucket *b; |
1409 | struct htab_elem *l; |
1410 | unsigned long flags; |
1411 | u32 hash, key_size; |
1412 | int ret; |
1413 | |
1414 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1415 | !rcu_read_lock_bh_held()); |
1416 | |
1417 | key_size = map->key_size; |
1418 | |
1419 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1420 | b = __select_bucket(htab, hash); |
1421 | head = &b->head; |
1422 | |
1423 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1424 | if (ret) |
1425 | return ret; |
1426 | |
1427 | l = lookup_elem_raw(head, hash, key, key_size); |
1428 | |
1429 | if (l) { |
1430 | 		hlist_nulls_del_rcu(&l->hash_node); |
1431 | free_htab_elem(htab, l); |
1432 | } else { |
1433 | ret = -ENOENT; |
1434 | } |
1435 | |
1436 | htab_unlock_bucket(htab, b, hash, flags); |
1437 | return ret; |
1438 | } |
1439 | |
1440 | static long htab_lru_map_delete_elem(struct bpf_map *map, void *key) |
1441 | { |
1442 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1443 | struct hlist_nulls_head *head; |
1444 | struct bucket *b; |
1445 | struct htab_elem *l; |
1446 | unsigned long flags; |
1447 | u32 hash, key_size; |
1448 | int ret; |
1449 | |
1450 | WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && |
1451 | !rcu_read_lock_bh_held()); |
1452 | |
1453 | key_size = map->key_size; |
1454 | |
1455 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1456 | b = __select_bucket(htab, hash); |
1457 | head = &b->head; |
1458 | |
1459 | 	ret = htab_lock_bucket(htab, b, hash, &flags); |
1460 | if (ret) |
1461 | return ret; |
1462 | |
1463 | l = lookup_elem_raw(head, hash, key, key_size); |
1464 | |
1465 | if (l) |
1466 | 		hlist_nulls_del_rcu(&l->hash_node); |
1467 | else |
1468 | ret = -ENOENT; |
1469 | |
1470 | htab_unlock_bucket(htab, b, hash, flags); |
1471 | if (l) |
1472 | 		htab_lru_push_free(htab, l); |
1473 | return ret; |
1474 | } |
1475 | |
1476 | static void delete_all_elements(struct bpf_htab *htab) |
1477 | { |
1478 | int i; |
1479 | |
1480 | /* It's called from a worker thread, so disable migration here, |
1481 | * since bpf_mem_cache_free() relies on that. |
1482 | */ |
1483 | migrate_disable(); |
1484 | for (i = 0; i < htab->n_buckets; i++) { |
1485 | 		struct hlist_nulls_head *head = select_bucket(htab, i); |
1486 | struct hlist_nulls_node *n; |
1487 | struct htab_elem *l; |
1488 | |
1489 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
1490 | 			hlist_nulls_del_rcu(&l->hash_node); |
1491 | htab_elem_free(htab, l); |
1492 | } |
1493 | } |
1494 | migrate_enable(); |
1495 | } |
1496 | |
1497 | static void htab_free_malloced_timers(struct bpf_htab *htab) |
1498 | { |
1499 | int i; |
1500 | |
1501 | rcu_read_lock(); |
1502 | for (i = 0; i < htab->n_buckets; i++) { |
1503 | 		struct hlist_nulls_head *head = select_bucket(htab, i); |
1504 | struct hlist_nulls_node *n; |
1505 | struct htab_elem *l; |
1506 | |
1507 | hlist_nulls_for_each_entry(l, n, head, hash_node) { |
1508 | /* We only free timer on uref dropping to zero */ |
1509 | 			bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); |
1510 | } |
1511 | cond_resched_rcu(); |
1512 | } |
1513 | rcu_read_unlock(); |
1514 | } |
1515 | |
1516 | static void htab_map_free_timers(struct bpf_map *map) |
1517 | { |
1518 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1519 | |
1520 | /* We only free timer on uref dropping to zero */ |
1521 | 	if (!btf_record_has_field(htab->map.record, BPF_TIMER)) |
1522 | return; |
1523 | if (!htab_is_prealloc(htab)) |
1524 | htab_free_malloced_timers(htab); |
1525 | else |
1526 | htab_free_prealloced_timers(htab); |
1527 | } |
1528 | |
1529 | /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ |
1530 | static void htab_map_free(struct bpf_map *map) |
1531 | { |
1532 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1533 | int i; |
1534 | |
1535 | /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. |
1536 | * bpf_free_used_maps() is called after bpf prog is no longer executing. |
1537 | * There is no need to synchronize_rcu() here to protect map elements. |
1538 | */ |
1539 | |
1540 | /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it |
1541 | 	 * underneath and is responsible for waiting for callbacks to finish |
1542 | * during bpf_mem_alloc_destroy(). |
1543 | */ |
1544 | if (!htab_is_prealloc(htab)) { |
1545 | delete_all_elements(htab); |
1546 | } else { |
1547 | htab_free_prealloced_fields(htab); |
1548 | prealloc_destroy(htab); |
1549 | } |
1550 | |
1551 | bpf_map_free_elem_count(map); |
1552 | 	free_percpu(htab->extra_elems); |
1553 | 	bpf_map_area_free(htab->buckets); |
1554 | 	bpf_mem_alloc_destroy(&htab->pcpu_ma); |
1555 | 	bpf_mem_alloc_destroy(&htab->ma); |
1556 | if (htab->use_percpu_counter) |
1557 | 		percpu_counter_destroy(&htab->pcount); |
1558 | for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) |
1559 | 		free_percpu(htab->map_locked[i]); |
1560 | 	lockdep_unregister_key(&htab->lockdep_key); |
1561 | 	bpf_map_area_free(htab); |
1562 | } |
1563 | |
1564 | static void htab_map_seq_show_elem(struct bpf_map *map, void *key, |
1565 | struct seq_file *m) |
1566 | { |
1567 | void *value; |
1568 | |
1569 | rcu_read_lock(); |
1570 | |
1571 | value = htab_map_lookup_elem(map, key); |
1572 | if (!value) { |
1573 | rcu_read_unlock(); |
1574 | return; |
1575 | } |
1576 | |
1577 | 	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); |
1578 | 	seq_puts(m, ": "); |
1579 | 	btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); |
1580 | 	seq_puts(m, "\n"); |
1581 | |
1582 | rcu_read_unlock(); |
1583 | } |
1584 | |
1585 | static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1586 | void *value, bool is_lru_map, |
1587 | bool is_percpu, u64 flags) |
1588 | { |
1589 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1590 | struct hlist_nulls_head *head; |
1591 | unsigned long bflags; |
1592 | struct htab_elem *l; |
1593 | u32 hash, key_size; |
1594 | struct bucket *b; |
1595 | int ret; |
1596 | |
1597 | key_size = map->key_size; |
1598 | |
1599 | 	hash = htab_map_hash(key, key_size, htab->hashrnd); |
1600 | b = __select_bucket(htab, hash); |
1601 | head = &b->head; |
1602 | |
1603 | 	ret = htab_lock_bucket(htab, b, hash, &bflags); |
1604 | if (ret) |
1605 | return ret; |
1606 | |
1607 | l = lookup_elem_raw(head, hash, key, key_size); |
1608 | if (!l) { |
1609 | ret = -ENOENT; |
1610 | } else { |
1611 | if (is_percpu) { |
1612 | u32 roundup_value_size = round_up(map->value_size, 8); |
1613 | void __percpu *pptr; |
1614 | int off = 0, cpu; |
1615 | |
1616 | pptr = htab_elem_get_ptr(l, key_size); |
1617 | for_each_possible_cpu(cpu) { |
1618 | 				copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu)); |
1619 | 				check_and_init_map_value(&htab->map, value + off); |
1620 | off += roundup_value_size; |
1621 | } |
1622 | } else { |
1623 | u32 roundup_key_size = round_up(map->key_size, 8); |
1624 | |
1625 | if (flags & BPF_F_LOCK) |
1626 | 				copy_map_value_locked(map, value, l->key + |
1627 | 						      roundup_key_size, |
1628 | 						      true); |
1629 | else |
1630 | 				copy_map_value(map, value, l->key + |
1631 | 					       roundup_key_size); |
1632 | /* Zeroing special fields in the temp buffer */ |
1633 | 			check_and_init_map_value(map, value); |
1634 | } |
1635 | |
1636 | 		hlist_nulls_del_rcu(&l->hash_node); |
1637 | if (!is_lru_map) |
1638 | free_htab_elem(htab, l); |
1639 | } |
1640 | |
1641 | htab_unlock_bucket(htab, b, hash, flags: bflags); |
1642 | |
1643 | if (is_lru_map && l) |
1644 | htab_lru_push_free(htab, elem: l); |
1645 | |
1646 | return ret; |
1647 | } |
1648 | |
1649 | static int htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1650 | void *value, u64 flags) |
1651 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, false, false,
						 flags);
1654 | } |
1655 | |
1656 | static int htab_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
1657 | void *key, void *value, |
1658 | u64 flags) |
1659 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, false, true,
						 flags);
1662 | } |
1663 | |
1664 | static int htab_lru_map_lookup_and_delete_elem(struct bpf_map *map, void *key, |
1665 | void *value, u64 flags) |
1666 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, true, false,
						 flags);
1669 | } |
1670 | |
1671 | static int htab_lru_percpu_map_lookup_and_delete_elem(struct bpf_map *map, |
1672 | void *key, void *value, |
1673 | u64 flags) |
1674 | { |
	return __htab_map_lookup_and_delete_elem(map, key, value, true, true,
						 flags);
1677 | } |
1678 | |
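/* Core of the BPF_MAP_*_BATCH commands for hash maps: walk the table bucket
 * by bucket, copy up to attr->batch.count keys/values to user space and, when
 * do_delete is set, unlink the copied elements. The bucket index doubles as
 * the batch cursor: in_batch selects the starting bucket and out_batch tells
 * user space where the next call should resume; -ENOENT signals that the walk
 * reached the end of the table.
 *
 * Illustrative user-space sketch (not part of this file) of how the cursor is
 * typically driven via libbpf; the local variable names are made up:
 *
 *	__u32 in = 0, out = 0, count = 128;
 *	LIBBPF_OPTS(bpf_map_batch_opts, opts);
 *	int err = bpf_map_lookup_and_delete_batch(map_fd, &in, &out,
 *						  keys, values, &count, &opts);
 *
 * On success 'count' holds the number of copied entries and 'out' the cursor
 * to pass back as 'in' on the next call.
 */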
1679 | static int |
1680 | __htab_map_lookup_and_delete_batch(struct bpf_map *map, |
1681 | const union bpf_attr *attr, |
1682 | union bpf_attr __user *uattr, |
1683 | bool do_delete, bool is_lru_map, |
1684 | bool is_percpu) |
1685 | { |
1686 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
1687 | u32 bucket_cnt, total, key_size, value_size, roundup_key_size; |
1688 | void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; |
1689 | void __user *uvalues = u64_to_user_ptr(attr->batch.values); |
1690 | void __user *ukeys = u64_to_user_ptr(attr->batch.keys); |
1691 | void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); |
1692 | u32 batch, max_count, size, bucket_size, map_id; |
1693 | struct htab_elem *node_to_free = NULL; |
1694 | u64 elem_map_flags, map_flags; |
1695 | struct hlist_nulls_head *head; |
1696 | struct hlist_nulls_node *n; |
1697 | unsigned long flags = 0; |
1698 | bool locked = false; |
1699 | struct htab_elem *l; |
1700 | struct bucket *b; |
1701 | int ret = 0; |
1702 | |
1703 | elem_map_flags = attr->batch.elem_flags; |
1704 | if ((elem_map_flags & ~BPF_F_LOCK) || |
	    ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
1706 | return -EINVAL; |
1707 | |
1708 | map_flags = attr->batch.flags; |
1709 | if (map_flags) |
1710 | return -EINVAL; |
1711 | |
1712 | max_count = attr->batch.count; |
1713 | if (!max_count) |
1714 | return 0; |
1715 | |
1716 | if (put_user(0, &uattr->batch.count)) |
1717 | return -EFAULT; |
1718 | |
1719 | batch = 0; |
	if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
1721 | return -EFAULT; |
1722 | |
1723 | if (batch >= htab->n_buckets) |
1724 | return -ENOENT; |
1725 | |
1726 | key_size = htab->map.key_size; |
1727 | roundup_key_size = round_up(htab->map.key_size, 8); |
1728 | value_size = htab->map.value_size; |
1729 | size = round_up(value_size, 8); |
1730 | if (is_percpu) |
1731 | value_size = size * num_possible_cpus(); |
1732 | total = 0; |
1733 | /* while experimenting with hash tables with sizes ranging from 10 to |
1734 | * 1000, it was observed that a bucket can have up to 5 entries. |
1735 | */ |
1736 | bucket_size = 5; |
1737 | |
1738 | alloc: |
1739 | /* We cannot do copy_from_user or copy_to_user inside |
1740 | * the rcu_read_lock. Allocate enough space here. |
1741 | */ |
	keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN);
	values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN);
1744 | if (!keys || !values) { |
1745 | ret = -ENOMEM; |
1746 | goto after_loop; |
1747 | } |
1748 | |
1749 | again: |
1750 | bpf_disable_instrumentation(); |
1751 | rcu_read_lock(); |
1752 | again_nocopy: |
1753 | dst_key = keys; |
1754 | dst_val = values; |
1755 | b = &htab->buckets[batch]; |
1756 | head = &b->head; |
	/* do not grab the lock unless we need it (bucket_cnt > 0). */
1758 | if (locked) { |
		ret = htab_lock_bucket(htab, b, batch, &flags);
1760 | if (ret) { |
1761 | rcu_read_unlock(); |
1762 | bpf_enable_instrumentation(); |
1763 | goto after_loop; |
1764 | } |
1765 | } |
1766 | |
1767 | bucket_cnt = 0; |
1768 | hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) |
1769 | bucket_cnt++; |
1770 | |
1771 | if (bucket_cnt && !locked) { |
1772 | locked = true; |
1773 | goto again_nocopy; |
1774 | } |
1775 | |
1776 | if (bucket_cnt > (max_count - total)) { |
1777 | if (total == 0) |
1778 | ret = -ENOSPC; |
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
1783 | rcu_read_unlock(); |
1784 | bpf_enable_instrumentation(); |
1785 | goto after_loop; |
1786 | } |
1787 | |
1788 | if (bucket_cnt > bucket_size) { |
1789 | bucket_size = bucket_cnt; |
		/* Note that since bucket_cnt > 0 here, it is implicit
		 * that the lock was grabbed, so release it.
		 */
		htab_unlock_bucket(htab, b, batch, flags);
		rcu_read_unlock();
		bpf_enable_instrumentation();
		kvfree(keys);
		kvfree(values);
1798 | goto alloc; |
1799 | } |
1800 | |
1801 | /* Next block is only safe to run if you have grabbed the lock */ |
1802 | if (!locked) |
1803 | goto next_batch; |
1804 | |
1805 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
1806 | memcpy(dst_key, l->key, key_size); |
1807 | |
1808 | if (is_percpu) { |
1809 | int off = 0, cpu; |
1810 | void __percpu *pptr; |
1811 | |
			pptr = htab_elem_get_ptr(l, map->key_size);
			for_each_possible_cpu(cpu) {
				copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
				check_and_init_map_value(&htab->map, dst_val + off);
1816 | off += size; |
1817 | } |
1818 | } else { |
1819 | value = l->key + roundup_key_size; |
1820 | if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { |
1821 | struct bpf_map **inner_map = value; |
1822 | |
1823 | /* Actual value is the id of the inner map */ |
1824 | map_id = map->ops->map_fd_sys_lookup_elem(*inner_map); |
1825 | value = &map_id; |
1826 | } |
1827 | |
1828 | if (elem_map_flags & BPF_F_LOCK) |
				copy_map_value_locked(map, dst_val, value,
						      true);
			else
				copy_map_value(map, dst_val, value);
			/* Zeroing special fields in the temp buffer */
			check_and_init_map_value(map, dst_val);
1835 | } |
1836 | if (do_delete) { |
			hlist_nulls_del_rcu(&l->hash_node);
1838 | |
1839 | /* bpf_lru_push_free() will acquire lru_lock, which |
1840 | * may cause deadlock. See comments in function |
1841 | * prealloc_lru_pop(). Let us do bpf_lru_push_free() |
1842 | * after releasing the bucket lock. |
1843 | */ |
1844 | if (is_lru_map) { |
1845 | l->batch_flink = node_to_free; |
1846 | node_to_free = l; |
1847 | } else { |
1848 | free_htab_elem(htab, l); |
1849 | } |
1850 | } |
1851 | dst_key += key_size; |
1852 | dst_val += value_size; |
1853 | } |
1854 | |
	htab_unlock_bucket(htab, b, batch, flags);
1856 | locked = false; |
1857 | |
1858 | while (node_to_free) { |
1859 | l = node_to_free; |
1860 | node_to_free = node_to_free->batch_flink; |
		htab_lru_push_free(htab, l);
1862 | } |
1863 | |
1864 | next_batch: |
	/* If we are not copying data, we can go to the next bucket and avoid
	 * dropping the RCU read lock.
1867 | */ |
1868 | if (!bucket_cnt && (batch + 1 < htab->n_buckets)) { |
1869 | batch++; |
1870 | goto again_nocopy; |
1871 | } |
1872 | |
1873 | rcu_read_unlock(); |
1874 | bpf_enable_instrumentation(); |
	if (bucket_cnt && (copy_to_user(ukeys + total * key_size, keys,
					key_size * bucket_cnt) ||
			   copy_to_user(uvalues + total * value_size, values,
					value_size * bucket_cnt))) {
1879 | ret = -EFAULT; |
1880 | goto after_loop; |
1881 | } |
1882 | |
1883 | total += bucket_cnt; |
1884 | batch++; |
1885 | if (batch >= htab->n_buckets) { |
1886 | ret = -ENOENT; |
1887 | goto after_loop; |
1888 | } |
1889 | goto again; |
1890 | |
1891 | after_loop: |
1892 | if (ret == -EFAULT) |
1893 | goto out; |
1894 | |
1895 | /* copy # of entries and next batch */ |
1896 | ubatch = u64_to_user_ptr(attr->batch.out_batch); |
	if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
1898 | put_user(total, &uattr->batch.count)) |
1899 | ret = -EFAULT; |
1900 | |
1901 | out: |
	kvfree(keys);
	kvfree(values);
1904 | return ret; |
1905 | } |
1906 | |
1907 | static int |
1908 | htab_percpu_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1909 | union bpf_attr __user *uattr) |
1910 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, true);
1913 | } |
1914 | |
1915 | static int |
1916 | htab_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
1917 | const union bpf_attr *attr, |
1918 | union bpf_attr __user *uattr) |
1919 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, true);
1922 | } |
1923 | |
1924 | static int |
1925 | htab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1926 | union bpf_attr __user *uattr) |
1927 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  false, false);
1930 | } |
1931 | |
1932 | static int |
1933 | htab_map_lookup_and_delete_batch(struct bpf_map *map, |
1934 | const union bpf_attr *attr, |
1935 | union bpf_attr __user *uattr) |
1936 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  false, false);
1939 | } |
1940 | |
1941 | static int |
1942 | htab_lru_percpu_map_lookup_batch(struct bpf_map *map, |
1943 | const union bpf_attr *attr, |
1944 | union bpf_attr __user *uattr) |
1945 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, true);
1948 | } |
1949 | |
1950 | static int |
1951 | htab_lru_percpu_map_lookup_and_delete_batch(struct bpf_map *map, |
1952 | const union bpf_attr *attr, |
1953 | union bpf_attr __user *uattr) |
1954 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, true);
1957 | } |
1958 | |
1959 | static int |
1960 | htab_lru_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, |
1961 | union bpf_attr __user *uattr) |
1962 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, false,
						  true, false);
1965 | } |
1966 | |
1967 | static int |
1968 | htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, |
1969 | const union bpf_attr *attr, |
1970 | union bpf_attr __user *uattr) |
1971 | { |
	return __htab_map_lookup_and_delete_batch(map, attr, uattr, true,
						  true, false);
1974 | } |
1975 | |
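/* State for the bpf_iter-based walk over a hash map: bucket_id and skip_elems
 * form the cursor, and percpu_value_buf (allocated in bpf_iter_init_hash_map
 * for percpu maps) is used to flatten per-CPU values into a single buffer
 * before they are handed to the iterator program.
 */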
1976 | struct bpf_iter_seq_hash_map_info { |
1977 | struct bpf_map *map; |
1978 | struct bpf_htab *htab; |
1979 | void *percpu_value_buf; // non-zero means percpu hash |
1980 | u32 bucket_id; |
1981 | u32 skip_elems; |
1982 | }; |
1983 | |
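/* Find the next element to show. The RCU read lock is taken when a bucket is
 * entered and released when the walk leaves it (or in the seq_ops stop
 * callback), so it is held while each returned element is shown.
 */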
1984 | static struct htab_elem * |
1985 | bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, |
1986 | struct htab_elem *prev_elem) |
1987 | { |
1988 | const struct bpf_htab *htab = info->htab; |
1989 | u32 skip_elems = info->skip_elems; |
1990 | u32 bucket_id = info->bucket_id; |
1991 | struct hlist_nulls_head *head; |
1992 | struct hlist_nulls_node *n; |
1993 | struct htab_elem *elem; |
1994 | struct bucket *b; |
1995 | u32 i, count; |
1996 | |
1997 | if (bucket_id >= htab->n_buckets) |
1998 | return NULL; |
1999 | |
2000 | /* try to find next elem in the same bucket */ |
2001 | if (prev_elem) { |
		/* no update/deletion on this bucket, prev_elem should still be valid
2003 | * and we won't skip elements. |
2004 | */ |
2005 | n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); |
2006 | elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); |
2007 | if (elem) |
2008 | return elem; |
2009 | |
2010 | /* not found, unlock and go to the next bucket */ |
2011 | b = &htab->buckets[bucket_id++]; |
2012 | rcu_read_unlock(); |
2013 | skip_elems = 0; |
2014 | } |
2015 | |
2016 | for (i = bucket_id; i < htab->n_buckets; i++) { |
2017 | b = &htab->buckets[i]; |
2018 | rcu_read_lock(); |
2019 | |
2020 | count = 0; |
2021 | head = &b->head; |
2022 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
2023 | if (count >= skip_elems) { |
2024 | info->bucket_id = i; |
2025 | info->skip_elems = count; |
2026 | return elem; |
2027 | } |
2028 | count++; |
2029 | } |
2030 | |
2031 | rcu_read_unlock(); |
2032 | skip_elems = 0; |
2033 | } |
2034 | |
2035 | info->bucket_id = i; |
2036 | info->skip_elems = 0; |
2037 | return NULL; |
2038 | } |
2039 | |
2040 | static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) |
2041 | { |
2042 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2043 | struct htab_elem *elem; |
2044 | |
2045 | elem = bpf_hash_map_seq_find_next(info, NULL); |
2046 | if (!elem) |
2047 | return NULL; |
2048 | |
2049 | if (*pos == 0) |
2050 | ++*pos; |
2051 | return elem; |
2052 | } |
2053 | |
2054 | static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
2055 | { |
2056 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2057 | |
2058 | ++*pos; |
2059 | ++info->skip_elems; |
	return bpf_hash_map_seq_find_next(info, v);
2061 | } |
2062 | |
2063 | static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) |
2064 | { |
2065 | struct bpf_iter_seq_hash_map_info *info = seq->private; |
2066 | u32 roundup_key_size, roundup_value_size; |
2067 | struct bpf_iter__bpf_map_elem ctx = {}; |
2068 | struct bpf_map *map = info->map; |
2069 | struct bpf_iter_meta meta; |
2070 | int ret = 0, off = 0, cpu; |
2071 | struct bpf_prog *prog; |
2072 | void __percpu *pptr; |
2073 | |
2074 | meta.seq = seq; |
	prog = bpf_iter_get_info(&meta, elem == NULL);
2076 | if (prog) { |
2077 | ctx.meta = &meta; |
2078 | ctx.map = info->map; |
2079 | if (elem) { |
2080 | roundup_key_size = round_up(map->key_size, 8); |
2081 | ctx.key = elem->key; |
2082 | if (!info->percpu_value_buf) { |
2083 | ctx.value = elem->key + roundup_key_size; |
2084 | } else { |
2085 | roundup_value_size = round_up(map->value_size, 8); |
				pptr = htab_elem_get_ptr(elem, map->key_size);
				for_each_possible_cpu(cpu) {
					copy_map_value_long(map, info->percpu_value_buf + off,
							    per_cpu_ptr(pptr, cpu));
					check_and_init_map_value(map, info->percpu_value_buf + off);
2091 | off += roundup_value_size; |
2092 | } |
2093 | ctx.value = info->percpu_value_buf; |
2094 | } |
2095 | } |
		ret = bpf_iter_run_prog(prog, &ctx);
2097 | } |
2098 | |
2099 | return ret; |
2100 | } |
2101 | |
2102 | static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) |
2103 | { |
	return __bpf_hash_map_seq_show(seq, v);
2105 | } |
2106 | |
2107 | static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) |
2108 | { |
2109 | if (!v) |
2110 | (void)__bpf_hash_map_seq_show(seq, NULL); |
2111 | else |
2112 | rcu_read_unlock(); |
2113 | } |
2114 | |
2115 | static int bpf_iter_init_hash_map(void *priv_data, |
2116 | struct bpf_iter_aux_info *aux) |
2117 | { |
2118 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
2119 | struct bpf_map *map = aux->map; |
2120 | void *value_buf; |
2121 | u32 buf_size; |
2122 | |
2123 | if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || |
2124 | map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { |
2125 | buf_size = round_up(map->value_size, 8) * num_possible_cpus(); |
		value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
2127 | if (!value_buf) |
2128 | return -ENOMEM; |
2129 | |
2130 | seq_info->percpu_value_buf = value_buf; |
2131 | } |
2132 | |
2133 | bpf_map_inc_with_uref(map); |
2134 | seq_info->map = map; |
2135 | seq_info->htab = container_of(map, struct bpf_htab, map); |
2136 | return 0; |
2137 | } |
2138 | |
2139 | static void bpf_iter_fini_hash_map(void *priv_data) |
2140 | { |
2141 | struct bpf_iter_seq_hash_map_info *seq_info = priv_data; |
2142 | |
	bpf_map_put_with_uref(seq_info->map);
	kfree(seq_info->percpu_value_buf);
2145 | } |
2146 | |
2147 | static const struct seq_operations bpf_hash_map_seq_ops = { |
2148 | .start = bpf_hash_map_seq_start, |
2149 | .next = bpf_hash_map_seq_next, |
2150 | .stop = bpf_hash_map_seq_stop, |
2151 | .show = bpf_hash_map_seq_show, |
2152 | }; |
2153 | |
2154 | static const struct bpf_iter_seq_info iter_seq_info = { |
2155 | .seq_ops = &bpf_hash_map_seq_ops, |
2156 | .init_seq_private = bpf_iter_init_hash_map, |
2157 | .fini_seq_private = bpf_iter_fini_hash_map, |
2158 | .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), |
2159 | }; |
2160 | |
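/* Implementation of bpf_for_each_map_elem() for hash maps: walk every bucket
 * under RCU and invoke callback_fn(map, key, value, callback_ctx, 0) for each
 * element; a non-zero return from the callback stops the walk early. The
 * number of elements visited is returned.
 */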
2161 | static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn, |
2162 | void *callback_ctx, u64 flags) |
2163 | { |
2164 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2165 | struct hlist_nulls_head *head; |
2166 | struct hlist_nulls_node *n; |
2167 | struct htab_elem *elem; |
2168 | u32 roundup_key_size; |
2169 | int i, num_elems = 0; |
2170 | void __percpu *pptr; |
2171 | struct bucket *b; |
2172 | void *key, *val; |
2173 | bool is_percpu; |
2174 | u64 ret = 0; |
2175 | |
2176 | if (flags != 0) |
2177 | return -EINVAL; |
2178 | |
2179 | is_percpu = htab_is_percpu(htab); |
2180 | |
2181 | roundup_key_size = round_up(map->key_size, 8); |
	/* disable migration so the percpu value prepared here will be the
	 * same as the one seen by the bpf program with bpf_map_lookup_elem().
2184 | */ |
2185 | if (is_percpu) |
2186 | migrate_disable(); |
2187 | for (i = 0; i < htab->n_buckets; i++) { |
2188 | b = &htab->buckets[i]; |
2189 | rcu_read_lock(); |
2190 | head = &b->head; |
2191 | hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { |
2192 | key = elem->key; |
2193 | if (is_percpu) { |
2194 | /* current cpu value for percpu map */ |
				pptr = htab_elem_get_ptr(elem, map->key_size);
2196 | val = this_cpu_ptr(pptr); |
2197 | } else { |
2198 | val = elem->key + roundup_key_size; |
2199 | } |
2200 | num_elems++; |
2201 | ret = callback_fn((u64)(long)map, (u64)(long)key, |
2202 | (u64)(long)val, (u64)(long)callback_ctx, 0); |
2203 | /* return value: 0 - continue, 1 - stop and return */ |
2204 | if (ret) { |
2205 | rcu_read_unlock(); |
2206 | goto out; |
2207 | } |
2208 | } |
2209 | rcu_read_unlock(); |
2210 | } |
2211 | out: |
2212 | if (is_percpu) |
2213 | migrate_enable(); |
2214 | return num_elems; |
2215 | } |
2216 | |
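/* Rough accounting of the memory consumed by the map: the bucket array, the
 * per-CPU lock counters, and either the preallocated element area or, for
 * non-preallocated maps, the elements currently in the table including the
 * llist_node overhead added by bpf_mem_alloc.
 */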
2217 | static u64 htab_map_mem_usage(const struct bpf_map *map) |
2218 | { |
2219 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2220 | u32 value_size = round_up(htab->map.value_size, 8); |
2221 | bool prealloc = htab_is_prealloc(htab); |
2222 | bool percpu = htab_is_percpu(htab); |
2223 | bool lru = htab_is_lru(htab); |
2224 | u64 num_entries; |
2225 | u64 usage = sizeof(struct bpf_htab); |
2226 | |
2227 | usage += sizeof(struct bucket) * htab->n_buckets; |
2228 | usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT; |
2229 | if (prealloc) { |
2230 | num_entries = map->max_entries; |
2231 | if (htab_has_extra_elems(htab)) |
2232 | num_entries += num_possible_cpus(); |
2233 | |
2234 | usage += htab->elem_size * num_entries; |
2235 | |
2236 | if (percpu) |
2237 | usage += value_size * num_possible_cpus() * num_entries; |
2238 | else if (!lru) |
2239 | usage += sizeof(struct htab_elem *) * num_possible_cpus(); |
2240 | } else { |
2241 | #define LLIST_NODE_SZ sizeof(struct llist_node) |
2242 | |
2243 | num_entries = htab->use_percpu_counter ? |
			percpu_counter_sum(&htab->pcount) :
			atomic_read(&htab->count);
2246 | usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries; |
2247 | if (percpu) { |
2248 | usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries; |
2249 | usage += value_size * num_possible_cpus() * num_entries; |
2250 | } |
2251 | } |
2252 | return usage; |
2253 | } |
2254 | |
2255 | BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab) |
2256 | const struct bpf_map_ops htab_map_ops = { |
2257 | .map_meta_equal = bpf_map_meta_equal, |
2258 | .map_alloc_check = htab_map_alloc_check, |
2259 | .map_alloc = htab_map_alloc, |
2260 | .map_free = htab_map_free, |
2261 | .map_get_next_key = htab_map_get_next_key, |
2262 | .map_release_uref = htab_map_free_timers, |
2263 | .map_lookup_elem = htab_map_lookup_elem, |
2264 | .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, |
2265 | .map_update_elem = htab_map_update_elem, |
2266 | .map_delete_elem = htab_map_delete_elem, |
2267 | .map_gen_lookup = htab_map_gen_lookup, |
2268 | .map_seq_show_elem = htab_map_seq_show_elem, |
2269 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2270 | .map_for_each_callback = bpf_for_each_hash_elem, |
2271 | .map_mem_usage = htab_map_mem_usage, |
2272 | BATCH_OPS(htab), |
2273 | .map_btf_id = &htab_map_btf_ids[0], |
2274 | .iter_seq_info = &iter_seq_info, |
2275 | }; |
2276 | |
2277 | const struct bpf_map_ops htab_lru_map_ops = { |
2278 | .map_meta_equal = bpf_map_meta_equal, |
2279 | .map_alloc_check = htab_map_alloc_check, |
2280 | .map_alloc = htab_map_alloc, |
2281 | .map_free = htab_map_free, |
2282 | .map_get_next_key = htab_map_get_next_key, |
2283 | .map_release_uref = htab_map_free_timers, |
2284 | .map_lookup_elem = htab_lru_map_lookup_elem, |
2285 | .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, |
2286 | .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, |
2287 | .map_update_elem = htab_lru_map_update_elem, |
2288 | .map_delete_elem = htab_lru_map_delete_elem, |
2289 | .map_gen_lookup = htab_lru_map_gen_lookup, |
2290 | .map_seq_show_elem = htab_map_seq_show_elem, |
2291 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2292 | .map_for_each_callback = bpf_for_each_hash_elem, |
2293 | .map_mem_usage = htab_map_mem_usage, |
2294 | BATCH_OPS(htab_lru), |
2295 | .map_btf_id = &htab_map_btf_ids[0], |
2296 | .iter_seq_info = &iter_seq_info, |
2297 | }; |
2298 | |
2299 | /* Called from eBPF program */ |
2300 | static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2301 | { |
2302 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
2303 | |
2304 | if (l) |
2305 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
2306 | else |
2307 | return NULL; |
2308 | } |
2309 | |
2310 | static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2311 | { |
2312 | struct htab_elem *l; |
2313 | |
2314 | if (cpu >= nr_cpu_ids) |
2315 | return NULL; |
2316 | |
2317 | l = __htab_map_lookup_elem(map, key); |
2318 | if (l) |
2319 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
2320 | else |
2321 | return NULL; |
2322 | } |
2323 | |
2324 | static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) |
2325 | { |
2326 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
2327 | |
2328 | if (l) { |
		bpf_lru_node_set_ref(&l->lru_node);
2330 | return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); |
2331 | } |
2332 | |
2333 | return NULL; |
2334 | } |
2335 | |
2336 | static void *htab_lru_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) |
2337 | { |
2338 | struct htab_elem *l; |
2339 | |
2340 | if (cpu >= nr_cpu_ids) |
2341 | return NULL; |
2342 | |
2343 | l = __htab_map_lookup_elem(map, key); |
2344 | if (l) { |
		bpf_lru_node_set_ref(&l->lru_node);
2346 | return per_cpu_ptr(htab_elem_get_ptr(l, map->key_size), cpu); |
2347 | } |
2348 | |
2349 | return NULL; |
2350 | } |
2351 | |
2352 | int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) |
2353 | { |
2354 | struct htab_elem *l; |
2355 | void __percpu *pptr; |
2356 | int ret = -ENOENT; |
2357 | int cpu, off = 0; |
2358 | u32 size; |
2359 | |
2360 | /* per_cpu areas are zero-filled and bpf programs can only |
2361 | * access 'value_size' of them, so copying rounded areas |
2362 | * will not leak any kernel data |
2363 | */ |
2364 | size = round_up(map->value_size, 8); |
2365 | rcu_read_lock(); |
2366 | l = __htab_map_lookup_elem(map, key); |
2367 | if (!l) |
2368 | goto out; |
2369 | /* We do not mark LRU map element here in order to not mess up |
2370 | * eviction heuristics when user space does a map walk. |
2371 | */ |
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
		check_and_init_map_value(map, value + off);
2376 | off += size; |
2377 | } |
2378 | ret = 0; |
2379 | out: |
2380 | rcu_read_unlock(); |
2381 | return ret; |
2382 | } |
2383 | |
2384 | int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, |
2385 | u64 map_flags) |
2386 | { |
2387 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2388 | int ret; |
2389 | |
2390 | rcu_read_lock(); |
2391 | if (htab_is_lru(htab)) |
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
2397 | rcu_read_unlock(); |
2398 | |
2399 | return ret; |
2400 | } |
2401 | |
2402 | static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, |
2403 | struct seq_file *m) |
2404 | { |
2405 | struct htab_elem *l; |
2406 | void __percpu *pptr; |
2407 | int cpu; |
2408 | |
2409 | rcu_read_lock(); |
2410 | |
2411 | l = __htab_map_lookup_elem(map, key); |
2412 | if (!l) { |
2413 | rcu_read_unlock(); |
2414 | return; |
2415 | } |
2416 | |
	btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
	seq_puts(m, ": {\n");
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		seq_printf(m, "\tcpu%d: ", cpu);
		btf_type_seq_show(map->btf, map->btf_value_type_id,
				  per_cpu_ptr(pptr, cpu), m);
		seq_puts(m, "\n");
	}
	seq_puts(m, "}\n");
2427 | |
2428 | rcu_read_unlock(); |
2429 | } |
2430 | |
2431 | const struct bpf_map_ops htab_percpu_map_ops = { |
2432 | .map_meta_equal = bpf_map_meta_equal, |
2433 | .map_alloc_check = htab_map_alloc_check, |
2434 | .map_alloc = htab_map_alloc, |
2435 | .map_free = htab_map_free, |
2436 | .map_get_next_key = htab_map_get_next_key, |
2437 | .map_lookup_elem = htab_percpu_map_lookup_elem, |
2438 | .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, |
2439 | .map_update_elem = htab_percpu_map_update_elem, |
2440 | .map_delete_elem = htab_map_delete_elem, |
2441 | .map_lookup_percpu_elem = htab_percpu_map_lookup_percpu_elem, |
2442 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
2443 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2444 | .map_for_each_callback = bpf_for_each_hash_elem, |
2445 | .map_mem_usage = htab_map_mem_usage, |
2446 | BATCH_OPS(htab_percpu), |
2447 | .map_btf_id = &htab_map_btf_ids[0], |
2448 | .iter_seq_info = &iter_seq_info, |
2449 | }; |
2450 | |
2451 | const struct bpf_map_ops htab_lru_percpu_map_ops = { |
2452 | .map_meta_equal = bpf_map_meta_equal, |
2453 | .map_alloc_check = htab_map_alloc_check, |
2454 | .map_alloc = htab_map_alloc, |
2455 | .map_free = htab_map_free, |
2456 | .map_get_next_key = htab_map_get_next_key, |
2457 | .map_lookup_elem = htab_lru_percpu_map_lookup_elem, |
2458 | .map_lookup_and_delete_elem = htab_lru_percpu_map_lookup_and_delete_elem, |
2459 | .map_update_elem = htab_lru_percpu_map_update_elem, |
2460 | .map_delete_elem = htab_lru_map_delete_elem, |
2461 | .map_lookup_percpu_elem = htab_lru_percpu_map_lookup_percpu_elem, |
2462 | .map_seq_show_elem = htab_percpu_map_seq_show_elem, |
2463 | .map_set_for_each_callback_args = map_set_for_each_callback_args, |
2464 | .map_for_each_callback = bpf_for_each_hash_elem, |
2465 | .map_mem_usage = htab_map_mem_usage, |
2466 | BATCH_OPS(htab_lru_percpu), |
2467 | .map_btf_id = &htab_map_btf_ids[0], |
2468 | .iter_seq_info = &iter_seq_info, |
2469 | }; |
2470 | |
2471 | static int fd_htab_map_alloc_check(union bpf_attr *attr) |
2472 | { |
2473 | if (attr->value_size != sizeof(u32)) |
2474 | return -EINVAL; |
2475 | return htab_map_alloc_check(attr); |
2476 | } |
2477 | |
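/* fd-based htab maps (e.g. hash-of-maps) store pointers obtained via
 * map_fd_get_ptr(); drop those references before tearing the table down.
 */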
2478 | static void fd_htab_map_free(struct bpf_map *map) |
2479 | { |
2480 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); |
2481 | struct hlist_nulls_node *n; |
2482 | struct hlist_nulls_head *head; |
2483 | struct htab_elem *l; |
2484 | int i; |
2485 | |
2486 | for (i = 0; i < htab->n_buckets; i++) { |
		head = select_bucket(htab, i);
2488 | |
2489 | hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { |
2490 | void *ptr = fd_htab_map_get_ptr(map, l); |
2491 | |
2492 | map->ops->map_fd_put_ptr(map, ptr, false); |
2493 | } |
2494 | } |
2495 | |
2496 | htab_map_free(map); |
2497 | } |
2498 | |
2499 | /* only called from syscall */ |
2500 | int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) |
2501 | { |
2502 | void **ptr; |
2503 | int ret = 0; |
2504 | |
2505 | if (!map->ops->map_fd_sys_lookup_elem) |
2506 | return -ENOTSUPP; |
2507 | |
2508 | rcu_read_lock(); |
2509 | ptr = htab_map_lookup_elem(map, key); |
2510 | if (ptr) |
2511 | *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); |
2512 | else |
2513 | ret = -ENOENT; |
2514 | rcu_read_unlock(); |
2515 | |
2516 | return ret; |
2517 | } |
2518 | |
2519 | /* only called from syscall */ |
2520 | int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, |
2521 | void *key, void *value, u64 map_flags) |
2522 | { |
2523 | void *ptr; |
2524 | int ret; |
2525 | u32 ufd = *(u32 *)value; |
2526 | |
2527 | ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); |
2528 | if (IS_ERR(ptr)) |
2529 | return PTR_ERR(ptr); |
2530 | |
2531 | /* The htab bucket lock is always held during update operations in fd |
2532 | * htab map, and the following rcu_read_lock() is only used to avoid |
2533 | * the WARN_ON_ONCE in htab_map_update_elem(). |
2534 | */ |
2535 | rcu_read_lock(); |
	ret = htab_map_update_elem(map, key, &ptr, map_flags);
2537 | rcu_read_unlock(); |
2538 | if (ret) |
2539 | map->ops->map_fd_put_ptr(map, ptr, false); |
2540 | |
2541 | return ret; |
2542 | } |
2543 | |
2544 | static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) |
2545 | { |
2546 | struct bpf_map *map, *inner_map_meta; |
2547 | |
	inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd);
	if (IS_ERR(inner_map_meta))
		return inner_map_meta;

	map = htab_map_alloc(attr);
	if (IS_ERR(map)) {
		bpf_map_meta_free(inner_map_meta);
2555 | return map; |
2556 | } |
2557 | |
2558 | map->inner_map_meta = inner_map_meta; |
2559 | |
2560 | return map; |
2561 | } |
2562 | |
2563 | static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) |
2564 | { |
2565 | struct bpf_map **inner_map = htab_map_lookup_elem(map, key); |
2566 | |
2567 | if (!inner_map) |
2568 | return NULL; |
2569 | |
2570 | return READ_ONCE(*inner_map); |
2571 | } |
2572 | |
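/* Inline the hash-of-maps lookup for BPF programs: emit a call to
 * __htab_map_lookup_elem() and, on a hit, load the inner map pointer stored
 * in the element's value area.
 */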
2573 | static int htab_of_map_gen_lookup(struct bpf_map *map, |
2574 | struct bpf_insn *insn_buf) |
2575 | { |
2576 | struct bpf_insn *insn = insn_buf; |
2577 | const int ret = BPF_REG_0; |
2578 | |
2579 | BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, |
2580 | (void *(*)(struct bpf_map *map, void *key))NULL)); |
2581 | *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); |
2582 | *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); |
2583 | *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, |
2584 | offsetof(struct htab_elem, key) + |
2585 | round_up(map->key_size, 8)); |
2586 | *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); |
2587 | |
2588 | return insn - insn_buf; |
2589 | } |
2590 | |
2591 | static void htab_of_map_free(struct bpf_map *map) |
2592 | { |
	bpf_map_meta_free(map->inner_map_meta);
2594 | fd_htab_map_free(map); |
2595 | } |
2596 | |
2597 | const struct bpf_map_ops htab_of_maps_map_ops = { |
2598 | .map_alloc_check = fd_htab_map_alloc_check, |
2599 | .map_alloc = htab_of_map_alloc, |
2600 | .map_free = htab_of_map_free, |
2601 | .map_get_next_key = htab_map_get_next_key, |
2602 | .map_lookup_elem = htab_of_map_lookup_elem, |
2603 | .map_delete_elem = htab_map_delete_elem, |
2604 | .map_fd_get_ptr = bpf_map_fd_get_ptr, |
2605 | .map_fd_put_ptr = bpf_map_fd_put_ptr, |
2606 | .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, |
2607 | .map_gen_lookup = htab_of_map_gen_lookup, |
2608 | .map_check_btf = map_check_no_btf, |
2609 | .map_mem_usage = htab_map_mem_usage, |
2610 | BATCH_OPS(htab), |
2611 | .map_btf_id = &htab_map_btf_ids[0], |
2612 | }; |
2613 | |