1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> |
4 | * |
5 | * Development of this code funded by Astaro AG (http://www.astaro.com/) |
6 | */ |
7 | |
8 | #include <linux/kernel.h> |
9 | #include <linux/init.h> |
10 | #include <linux/module.h> |
11 | #include <linux/list.h> |
12 | #include <linux/rbtree.h> |
13 | #include <linux/netlink.h> |
14 | #include <linux/netfilter.h> |
15 | #include <linux/netfilter/nf_tables.h> |
16 | #include <net/netfilter/nf_tables_core.h> |
17 | |
18 | struct nft_rbtree { |
19 | struct rb_root root; |
20 | rwlock_t lock; |
21 | seqcount_rwlock_t count; |
22 | unsigned long last_gc; |
23 | }; |
24 | |
25 | struct nft_rbtree_elem { |
26 | struct nft_elem_priv priv; |
27 | struct rb_node node; |
28 | struct nft_set_ext ext; |
29 | }; |
30 | |
31 | static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) |
32 | { |
33 | return nft_set_ext_exists(ext: &rbe->ext, id: NFT_SET_EXT_FLAGS) && |
34 | (*nft_set_ext_flags(ext: &rbe->ext) & NFT_SET_ELEM_INTERVAL_END); |
35 | } |
36 | |
37 | static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) |
38 | { |
39 | return !nft_rbtree_interval_end(rbe); |
40 | } |
41 | |
42 | static int nft_rbtree_cmp(const struct nft_set *set, |
43 | const struct nft_rbtree_elem *e1, |
44 | const struct nft_rbtree_elem *e2) |
45 | { |
46 | return memcmp(p: nft_set_ext_key(ext: &e1->ext), q: nft_set_ext_key(ext: &e2->ext), |
47 | size: set->klen); |
48 | } |
49 | |
50 | static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) |
51 | { |
52 | return nft_set_elem_expired(ext: &rbe->ext); |
53 | } |
54 | |
55 | static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, |
56 | const u32 *key, const struct nft_set_ext **ext, |
57 | unsigned int seq) |
58 | { |
59 | struct nft_rbtree *priv = nft_set_priv(set); |
60 | const struct nft_rbtree_elem *rbe, *interval = NULL; |
61 | u8 genmask = nft_genmask_cur(net); |
62 | const struct rb_node *parent; |
63 | int d; |
64 | |
65 | parent = rcu_dereference_raw(priv->root.rb_node); |
66 | while (parent != NULL) { |
67 | if (read_seqcount_retry(&priv->count, seq)) |
68 | return false; |
69 | |
70 | rbe = rb_entry(parent, struct nft_rbtree_elem, node); |
71 | |
72 | d = memcmp(p: nft_set_ext_key(ext: &rbe->ext), q: key, size: set->klen); |
73 | if (d < 0) { |
74 | parent = rcu_dereference_raw(parent->rb_left); |
75 | if (interval && |
76 | !nft_rbtree_cmp(set, e1: rbe, e2: interval) && |
77 | nft_rbtree_interval_end(rbe) && |
78 | nft_rbtree_interval_start(rbe: interval)) |
79 | continue; |
80 | interval = rbe; |
81 | } else if (d > 0) |
82 | parent = rcu_dereference_raw(parent->rb_right); |
83 | else { |
84 | if (!nft_set_elem_active(ext: &rbe->ext, genmask)) { |
85 | parent = rcu_dereference_raw(parent->rb_left); |
86 | continue; |
87 | } |
88 | |
89 | if (nft_rbtree_elem_expired(rbe)) |
90 | return false; |
91 | |
92 | if (nft_rbtree_interval_end(rbe)) { |
93 | if (nft_set_is_anonymous(set)) |
94 | return false; |
95 | parent = rcu_dereference_raw(parent->rb_left); |
96 | interval = NULL; |
97 | continue; |
98 | } |
99 | |
100 | *ext = &rbe->ext; |
101 | return true; |
102 | } |
103 | } |
104 | |
105 | if (set->flags & NFT_SET_INTERVAL && interval != NULL && |
106 | nft_set_elem_active(ext: &interval->ext, genmask) && |
107 | !nft_rbtree_elem_expired(rbe: interval) && |
108 | nft_rbtree_interval_start(rbe: interval)) { |
109 | *ext = &interval->ext; |
110 | return true; |
111 | } |
112 | |
113 | return false; |
114 | } |
115 | |
116 | INDIRECT_CALLABLE_SCOPE |
117 | bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, |
118 | const u32 *key, const struct nft_set_ext **ext) |
119 | { |
120 | struct nft_rbtree *priv = nft_set_priv(set); |
121 | unsigned int seq = read_seqcount_begin(&priv->count); |
122 | bool ret; |
123 | |
124 | ret = __nft_rbtree_lookup(net, set, key, ext, seq); |
125 | if (ret || !read_seqcount_retry(&priv->count, seq)) |
126 | return ret; |
127 | |
128 | read_lock_bh(&priv->lock); |
129 | seq = read_seqcount_begin(&priv->count); |
130 | ret = __nft_rbtree_lookup(net, set, key, ext, seq); |
131 | read_unlock_bh(&priv->lock); |
132 | |
133 | return ret; |
134 | } |
135 | |
136 | static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, |
137 | const u32 *key, struct nft_rbtree_elem **elem, |
138 | unsigned int seq, unsigned int flags, u8 genmask) |
139 | { |
140 | struct nft_rbtree_elem *rbe, *interval = NULL; |
141 | struct nft_rbtree *priv = nft_set_priv(set); |
142 | const struct rb_node *parent; |
143 | const void *this; |
144 | int d; |
145 | |
146 | parent = rcu_dereference_raw(priv->root.rb_node); |
147 | while (parent != NULL) { |
148 | if (read_seqcount_retry(&priv->count, seq)) |
149 | return false; |
150 | |
151 | rbe = rb_entry(parent, struct nft_rbtree_elem, node); |
152 | |
153 | this = nft_set_ext_key(ext: &rbe->ext); |
154 | d = memcmp(p: this, q: key, size: set->klen); |
155 | if (d < 0) { |
156 | parent = rcu_dereference_raw(parent->rb_left); |
157 | if (!(flags & NFT_SET_ELEM_INTERVAL_END)) |
158 | interval = rbe; |
159 | } else if (d > 0) { |
160 | parent = rcu_dereference_raw(parent->rb_right); |
161 | if (flags & NFT_SET_ELEM_INTERVAL_END) |
162 | interval = rbe; |
163 | } else { |
164 | if (!nft_set_elem_active(ext: &rbe->ext, genmask)) { |
165 | parent = rcu_dereference_raw(parent->rb_left); |
166 | continue; |
167 | } |
168 | |
169 | if (nft_set_elem_expired(ext: &rbe->ext)) |
170 | return false; |
171 | |
172 | if (!nft_set_ext_exists(ext: &rbe->ext, id: NFT_SET_EXT_FLAGS) || |
173 | (*nft_set_ext_flags(ext: &rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == |
174 | (flags & NFT_SET_ELEM_INTERVAL_END)) { |
175 | *elem = rbe; |
176 | return true; |
177 | } |
178 | |
179 | if (nft_rbtree_interval_end(rbe)) |
180 | interval = NULL; |
181 | |
182 | parent = rcu_dereference_raw(parent->rb_left); |
183 | } |
184 | } |
185 | |
186 | if (set->flags & NFT_SET_INTERVAL && interval != NULL && |
187 | nft_set_elem_active(ext: &interval->ext, genmask) && |
188 | !nft_set_elem_expired(ext: &interval->ext) && |
189 | ((!nft_rbtree_interval_end(rbe: interval) && |
190 | !(flags & NFT_SET_ELEM_INTERVAL_END)) || |
191 | (nft_rbtree_interval_end(rbe: interval) && |
192 | (flags & NFT_SET_ELEM_INTERVAL_END)))) { |
193 | *elem = interval; |
194 | return true; |
195 | } |
196 | |
197 | return false; |
198 | } |
199 | |
200 | static struct nft_elem_priv * |
201 | nft_rbtree_get(const struct net *net, const struct nft_set *set, |
202 | const struct nft_set_elem *elem, unsigned int flags) |
203 | { |
204 | struct nft_rbtree *priv = nft_set_priv(set); |
205 | unsigned int seq = read_seqcount_begin(&priv->count); |
206 | struct nft_rbtree_elem *rbe = ERR_PTR(error: -ENOENT); |
207 | const u32 *key = (const u32 *)&elem->key.val; |
208 | u8 genmask = nft_genmask_cur(net); |
209 | bool ret; |
210 | |
211 | ret = __nft_rbtree_get(net, set, key, elem: &rbe, seq, flags, genmask); |
212 | if (ret || !read_seqcount_retry(&priv->count, seq)) |
213 | return &rbe->priv; |
214 | |
215 | read_lock_bh(&priv->lock); |
216 | seq = read_seqcount_begin(&priv->count); |
217 | ret = __nft_rbtree_get(net, set, key, elem: &rbe, seq, flags, genmask); |
218 | read_unlock_bh(&priv->lock); |
219 | |
220 | if (!ret) |
221 | return ERR_PTR(error: -ENOENT); |
222 | |
223 | return &rbe->priv; |
224 | } |
225 | |
226 | static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, |
227 | struct nft_rbtree *priv, |
228 | struct nft_rbtree_elem *rbe) |
229 | { |
230 | lockdep_assert_held_write(&priv->lock); |
231 | nft_setelem_data_deactivate(net, set, elem_priv: &rbe->priv); |
232 | rb_erase(&rbe->node, &priv->root); |
233 | } |
234 | |
235 | static const struct nft_rbtree_elem * |
236 | nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, |
237 | struct nft_rbtree_elem *rbe) |
238 | { |
239 | struct nft_set *set = (struct nft_set *)__set; |
240 | struct rb_node *prev = rb_prev(&rbe->node); |
241 | struct net *net = read_pnet(pnet: &set->net); |
242 | struct nft_rbtree_elem *rbe_prev; |
243 | struct nft_trans_gc *gc; |
244 | |
245 | gc = nft_trans_gc_alloc(set, gc_seq: 0, GFP_ATOMIC); |
246 | if (!gc) |
247 | return ERR_PTR(error: -ENOMEM); |
248 | |
249 | /* search for end interval coming before this element. |
250 | * end intervals don't carry a timeout extension, they |
251 | * are coupled with the interval start element. |
252 | */ |
253 | while (prev) { |
254 | rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); |
255 | if (nft_rbtree_interval_end(rbe: rbe_prev) && |
256 | nft_set_elem_active(ext: &rbe_prev->ext, NFT_GENMASK_ANY)) |
257 | break; |
258 | |
259 | prev = rb_prev(prev); |
260 | } |
261 | |
262 | rbe_prev = NULL; |
263 | if (prev) { |
264 | rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); |
265 | nft_rbtree_gc_elem_remove(net, set, priv, rbe: rbe_prev); |
266 | |
267 | /* There is always room in this trans gc for this element, |
268 | * memory allocation never actually happens, hence, the warning |
269 | * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, |
270 | * this is synchronous gc which never fails. |
271 | */ |
272 | gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); |
273 | if (WARN_ON_ONCE(!gc)) |
274 | return ERR_PTR(error: -ENOMEM); |
275 | |
276 | nft_trans_gc_elem_add(gc, priv: rbe_prev); |
277 | } |
278 | |
279 | nft_rbtree_gc_elem_remove(net, set, priv, rbe); |
280 | gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); |
281 | if (WARN_ON_ONCE(!gc)) |
282 | return ERR_PTR(error: -ENOMEM); |
283 | |
284 | nft_trans_gc_elem_add(gc, priv: rbe); |
285 | |
286 | nft_trans_gc_queue_sync_done(trans: gc); |
287 | |
288 | return rbe_prev; |
289 | } |
290 | |
291 | static bool nft_rbtree_update_first(const struct nft_set *set, |
292 | struct nft_rbtree_elem *rbe, |
293 | struct rb_node *first) |
294 | { |
295 | struct nft_rbtree_elem *first_elem; |
296 | |
297 | first_elem = rb_entry(first, struct nft_rbtree_elem, node); |
298 | /* this element is closest to where the new element is to be inserted: |
299 | * update the first element for the node list path. |
300 | */ |
301 | if (nft_rbtree_cmp(set, e1: rbe, e2: first_elem) < 0) |
302 | return true; |
303 | |
304 | return false; |
305 | } |
306 | |
307 | static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, |
308 | struct nft_rbtree_elem *new, |
309 | struct nft_elem_priv **elem_priv) |
310 | { |
311 | struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; |
312 | struct rb_node *node, *next, *parent, **p, *first = NULL; |
313 | struct nft_rbtree *priv = nft_set_priv(set); |
314 | u8 cur_genmask = nft_genmask_cur(net); |
315 | u8 genmask = nft_genmask_next(net); |
316 | u64 tstamp = nft_net_tstamp(net); |
317 | int d; |
318 | |
319 | /* Descend the tree to search for an existing element greater than the |
320 | * key value to insert that is greater than the new element. This is the |
321 | * first element to walk the ordered elements to find possible overlap. |
322 | */ |
323 | parent = NULL; |
324 | p = &priv->root.rb_node; |
325 | while (*p != NULL) { |
326 | parent = *p; |
327 | rbe = rb_entry(parent, struct nft_rbtree_elem, node); |
328 | d = nft_rbtree_cmp(set, e1: rbe, e2: new); |
329 | |
330 | if (d < 0) { |
331 | p = &parent->rb_left; |
332 | } else if (d > 0) { |
333 | if (!first || |
334 | nft_rbtree_update_first(set, rbe, first)) |
335 | first = &rbe->node; |
336 | |
337 | p = &parent->rb_right; |
338 | } else { |
339 | if (nft_rbtree_interval_end(rbe)) |
340 | p = &parent->rb_left; |
341 | else |
342 | p = &parent->rb_right; |
343 | } |
344 | } |
345 | |
346 | if (!first) |
347 | first = rb_first(&priv->root); |
348 | |
349 | /* Detect overlap by going through the list of valid tree nodes. |
350 | * Values stored in the tree are in reversed order, starting from |
351 | * highest to lowest value. |
352 | */ |
353 | for (node = first; node != NULL; node = next) { |
354 | next = rb_next(node); |
355 | |
356 | rbe = rb_entry(node, struct nft_rbtree_elem, node); |
357 | |
358 | if (!nft_set_elem_active(ext: &rbe->ext, genmask)) |
359 | continue; |
360 | |
361 | /* perform garbage collection to avoid bogus overlap reports |
362 | * but skip new elements in this transaction. |
363 | */ |
364 | if (__nft_set_elem_expired(ext: &rbe->ext, tstamp) && |
365 | nft_set_elem_active(ext: &rbe->ext, genmask: cur_genmask)) { |
366 | const struct nft_rbtree_elem *removed_end; |
367 | |
368 | removed_end = nft_rbtree_gc_elem(set: set, priv, rbe); |
369 | if (IS_ERR(ptr: removed_end)) |
370 | return PTR_ERR(ptr: removed_end); |
371 | |
372 | if (removed_end == rbe_le || removed_end == rbe_ge) |
373 | return -EAGAIN; |
374 | |
375 | continue; |
376 | } |
377 | |
378 | d = nft_rbtree_cmp(set, e1: rbe, e2: new); |
379 | if (d == 0) { |
380 | /* Matching end element: no need to look for an |
381 | * overlapping greater or equal element. |
382 | */ |
383 | if (nft_rbtree_interval_end(rbe)) { |
384 | rbe_le = rbe; |
385 | break; |
386 | } |
387 | |
388 | /* first element that is greater or equal to key value. */ |
389 | if (!rbe_ge) { |
390 | rbe_ge = rbe; |
391 | continue; |
392 | } |
393 | |
394 | /* this is a closer more or equal element, update it. */ |
395 | if (nft_rbtree_cmp(set, e1: rbe_ge, e2: new) != 0) { |
396 | rbe_ge = rbe; |
397 | continue; |
398 | } |
399 | |
400 | /* element is equal to key value, make sure flags are |
401 | * the same, an existing more or equal start element |
402 | * must not be replaced by more or equal end element. |
403 | */ |
404 | if ((nft_rbtree_interval_start(rbe: new) && |
405 | nft_rbtree_interval_start(rbe: rbe_ge)) || |
406 | (nft_rbtree_interval_end(rbe: new) && |
407 | nft_rbtree_interval_end(rbe: rbe_ge))) { |
408 | rbe_ge = rbe; |
409 | continue; |
410 | } |
411 | } else if (d > 0) { |
412 | /* annotate element greater than the new element. */ |
413 | rbe_ge = rbe; |
414 | continue; |
415 | } else if (d < 0) { |
416 | /* annotate element less than the new element. */ |
417 | rbe_le = rbe; |
418 | break; |
419 | } |
420 | } |
421 | |
422 | /* - new start element matching existing start element: full overlap |
423 | * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. |
424 | */ |
425 | if (rbe_ge && !nft_rbtree_cmp(set, e1: new, e2: rbe_ge) && |
426 | nft_rbtree_interval_start(rbe: rbe_ge) == nft_rbtree_interval_start(rbe: new)) { |
427 | *elem_priv = &rbe_ge->priv; |
428 | return -EEXIST; |
429 | } |
430 | |
431 | /* - new end element matching existing end element: full overlap |
432 | * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. |
433 | */ |
434 | if (rbe_le && !nft_rbtree_cmp(set, e1: new, e2: rbe_le) && |
435 | nft_rbtree_interval_end(rbe: rbe_le) == nft_rbtree_interval_end(rbe: new)) { |
436 | *elem_priv = &rbe_le->priv; |
437 | return -EEXIST; |
438 | } |
439 | |
440 | /* - new start element with existing closest, less or equal key value |
441 | * being a start element: partial overlap, reported as -ENOTEMPTY. |
442 | * Anonymous sets allow for two consecutive start element since they |
443 | * are constant, skip them to avoid bogus overlap reports. |
444 | */ |
445 | if (!nft_set_is_anonymous(set) && rbe_le && |
446 | nft_rbtree_interval_start(rbe: rbe_le) && nft_rbtree_interval_start(rbe: new)) |
447 | return -ENOTEMPTY; |
448 | |
449 | /* - new end element with existing closest, less or equal key value |
450 | * being a end element: partial overlap, reported as -ENOTEMPTY. |
451 | */ |
452 | if (rbe_le && |
453 | nft_rbtree_interval_end(rbe: rbe_le) && nft_rbtree_interval_end(rbe: new)) |
454 | return -ENOTEMPTY; |
455 | |
456 | /* - new end element with existing closest, greater or equal key value |
457 | * being an end element: partial overlap, reported as -ENOTEMPTY |
458 | */ |
459 | if (rbe_ge && |
460 | nft_rbtree_interval_end(rbe: rbe_ge) && nft_rbtree_interval_end(rbe: new)) |
461 | return -ENOTEMPTY; |
462 | |
463 | /* Accepted element: pick insertion point depending on key value */ |
464 | parent = NULL; |
465 | p = &priv->root.rb_node; |
466 | while (*p != NULL) { |
467 | parent = *p; |
468 | rbe = rb_entry(parent, struct nft_rbtree_elem, node); |
469 | d = nft_rbtree_cmp(set, e1: rbe, e2: new); |
470 | |
471 | if (d < 0) |
472 | p = &parent->rb_left; |
473 | else if (d > 0) |
474 | p = &parent->rb_right; |
475 | else if (nft_rbtree_interval_end(rbe)) |
476 | p = &parent->rb_left; |
477 | else |
478 | p = &parent->rb_right; |
479 | } |
480 | |
481 | rb_link_node_rcu(node: &new->node, parent, rb_link: p); |
482 | rb_insert_color(&new->node, &priv->root); |
483 | return 0; |
484 | } |
485 | |
486 | static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, |
487 | const struct nft_set_elem *elem, |
488 | struct nft_elem_priv **elem_priv) |
489 | { |
490 | struct nft_rbtree_elem *rbe = nft_elem_priv_cast(priv: elem->priv); |
491 | struct nft_rbtree *priv = nft_set_priv(set); |
492 | int err; |
493 | |
494 | do { |
495 | if (fatal_signal_pending(current)) |
496 | return -EINTR; |
497 | |
498 | cond_resched(); |
499 | |
500 | write_lock_bh(&priv->lock); |
501 | write_seqcount_begin(&priv->count); |
502 | err = __nft_rbtree_insert(net, set, new: rbe, elem_priv); |
503 | write_seqcount_end(&priv->count); |
504 | write_unlock_bh(&priv->lock); |
505 | } while (err == -EAGAIN); |
506 | |
507 | return err; |
508 | } |
509 | |
510 | static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) |
511 | { |
512 | write_lock_bh(&priv->lock); |
513 | write_seqcount_begin(&priv->count); |
514 | rb_erase(&rbe->node, &priv->root); |
515 | write_seqcount_end(&priv->count); |
516 | write_unlock_bh(&priv->lock); |
517 | } |
518 | |
519 | static void nft_rbtree_remove(const struct net *net, |
520 | const struct nft_set *set, |
521 | struct nft_elem_priv *elem_priv) |
522 | { |
523 | struct nft_rbtree_elem *rbe = nft_elem_priv_cast(priv: elem_priv); |
524 | struct nft_rbtree *priv = nft_set_priv(set); |
525 | |
526 | nft_rbtree_erase(priv, rbe); |
527 | } |
528 | |
529 | static void nft_rbtree_activate(const struct net *net, |
530 | const struct nft_set *set, |
531 | struct nft_elem_priv *elem_priv) |
532 | { |
533 | struct nft_rbtree_elem *rbe = nft_elem_priv_cast(priv: elem_priv); |
534 | |
535 | nft_clear(net, &rbe->ext); |
536 | } |
537 | |
538 | static void nft_rbtree_flush(const struct net *net, |
539 | const struct nft_set *set, |
540 | struct nft_elem_priv *elem_priv) |
541 | { |
542 | struct nft_rbtree_elem *rbe = nft_elem_priv_cast(priv: elem_priv); |
543 | |
544 | nft_set_elem_change_active(net, set, ext: &rbe->ext); |
545 | } |
546 | |
547 | static struct nft_elem_priv * |
548 | nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, |
549 | const struct nft_set_elem *elem) |
550 | { |
551 | struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(priv: elem->priv); |
552 | const struct nft_rbtree *priv = nft_set_priv(set); |
553 | const struct rb_node *parent = priv->root.rb_node; |
554 | u8 genmask = nft_genmask_next(net); |
555 | u64 tstamp = nft_net_tstamp(net); |
556 | int d; |
557 | |
558 | while (parent != NULL) { |
559 | rbe = rb_entry(parent, struct nft_rbtree_elem, node); |
560 | |
561 | d = memcmp(p: nft_set_ext_key(ext: &rbe->ext), q: &elem->key.val, |
562 | size: set->klen); |
563 | if (d < 0) |
564 | parent = parent->rb_left; |
565 | else if (d > 0) |
566 | parent = parent->rb_right; |
567 | else { |
568 | if (nft_rbtree_interval_end(rbe) && |
569 | nft_rbtree_interval_start(rbe: this)) { |
570 | parent = parent->rb_left; |
571 | continue; |
572 | } else if (nft_rbtree_interval_start(rbe) && |
573 | nft_rbtree_interval_end(rbe: this)) { |
574 | parent = parent->rb_right; |
575 | continue; |
576 | } else if (__nft_set_elem_expired(ext: &rbe->ext, tstamp)) { |
577 | break; |
578 | } else if (!nft_set_elem_active(ext: &rbe->ext, genmask)) { |
579 | parent = parent->rb_left; |
580 | continue; |
581 | } |
582 | nft_rbtree_flush(net, set, elem_priv: &rbe->priv); |
583 | return &rbe->priv; |
584 | } |
585 | } |
586 | return NULL; |
587 | } |
588 | |
589 | static void nft_rbtree_walk(const struct nft_ctx *ctx, |
590 | struct nft_set *set, |
591 | struct nft_set_iter *iter) |
592 | { |
593 | struct nft_rbtree *priv = nft_set_priv(set); |
594 | struct nft_rbtree_elem *rbe; |
595 | struct rb_node *node; |
596 | |
597 | read_lock_bh(&priv->lock); |
598 | for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { |
599 | rbe = rb_entry(node, struct nft_rbtree_elem, node); |
600 | |
601 | if (iter->count < iter->skip) |
602 | goto cont; |
603 | |
604 | iter->err = iter->fn(ctx, set, iter, &rbe->priv); |
605 | if (iter->err < 0) { |
606 | read_unlock_bh(&priv->lock); |
607 | return; |
608 | } |
609 | cont: |
610 | iter->count++; |
611 | } |
612 | read_unlock_bh(&priv->lock); |
613 | } |
614 | |
615 | static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, |
616 | struct nft_rbtree *priv, |
617 | struct nft_rbtree_elem *rbe) |
618 | { |
619 | nft_setelem_data_deactivate(net, set, elem_priv: &rbe->priv); |
620 | nft_rbtree_erase(priv, rbe); |
621 | } |
622 | |
623 | static void nft_rbtree_gc(struct nft_set *set) |
624 | { |
625 | struct nft_rbtree *priv = nft_set_priv(set); |
626 | struct nft_rbtree_elem *rbe, *rbe_end = NULL; |
627 | struct net *net = read_pnet(pnet: &set->net); |
628 | u64 tstamp = nft_net_tstamp(net); |
629 | struct rb_node *node, *next; |
630 | struct nft_trans_gc *gc; |
631 | |
632 | set = nft_set_container_of(priv); |
633 | net = read_pnet(pnet: &set->net); |
634 | |
635 | gc = nft_trans_gc_alloc(set, gc_seq: 0, GFP_KERNEL); |
636 | if (!gc) |
637 | return; |
638 | |
639 | for (node = rb_first(&priv->root); node ; node = next) { |
640 | next = rb_next(node); |
641 | |
642 | rbe = rb_entry(node, struct nft_rbtree_elem, node); |
643 | |
644 | /* elements are reversed in the rbtree for historical reasons, |
645 | * from highest to lowest value, that is why end element is |
646 | * always visited before the start element. |
647 | */ |
648 | if (nft_rbtree_interval_end(rbe)) { |
649 | rbe_end = rbe; |
650 | continue; |
651 | } |
652 | if (!__nft_set_elem_expired(ext: &rbe->ext, tstamp)) |
653 | continue; |
654 | |
655 | gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); |
656 | if (!gc) |
657 | goto try_later; |
658 | |
659 | /* end element needs to be removed first, it has |
660 | * no timeout extension. |
661 | */ |
662 | if (rbe_end) { |
663 | nft_rbtree_gc_remove(net, set, priv, rbe: rbe_end); |
664 | nft_trans_gc_elem_add(gc, priv: rbe_end); |
665 | rbe_end = NULL; |
666 | } |
667 | |
668 | gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); |
669 | if (!gc) |
670 | goto try_later; |
671 | |
672 | nft_rbtree_gc_remove(net, set, priv, rbe); |
673 | nft_trans_gc_elem_add(gc, priv: rbe); |
674 | } |
675 | |
676 | try_later: |
677 | |
678 | if (gc) { |
679 | gc = nft_trans_gc_catchall_sync(gc); |
680 | nft_trans_gc_queue_sync_done(trans: gc); |
681 | priv->last_gc = jiffies; |
682 | } |
683 | } |
684 | |
685 | static u64 nft_rbtree_privsize(const struct nlattr * const nla[], |
686 | const struct nft_set_desc *desc) |
687 | { |
688 | return sizeof(struct nft_rbtree); |
689 | } |
690 | |
691 | static int nft_rbtree_init(const struct nft_set *set, |
692 | const struct nft_set_desc *desc, |
693 | const struct nlattr * const nla[]) |
694 | { |
695 | struct nft_rbtree *priv = nft_set_priv(set); |
696 | |
697 | BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); |
698 | |
699 | rwlock_init(&priv->lock); |
700 | seqcount_rwlock_init(&priv->count, &priv->lock); |
701 | priv->root = RB_ROOT; |
702 | |
703 | return 0; |
704 | } |
705 | |
706 | static void nft_rbtree_destroy(const struct nft_ctx *ctx, |
707 | const struct nft_set *set) |
708 | { |
709 | struct nft_rbtree *priv = nft_set_priv(set); |
710 | struct nft_rbtree_elem *rbe; |
711 | struct rb_node *node; |
712 | |
713 | while ((node = priv->root.rb_node) != NULL) { |
714 | rb_erase(node, &priv->root); |
715 | rbe = rb_entry(node, struct nft_rbtree_elem, node); |
716 | nf_tables_set_elem_destroy(ctx, set, elem_priv: &rbe->priv); |
717 | } |
718 | } |
719 | |
720 | static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, |
721 | struct nft_set_estimate *est) |
722 | { |
723 | if (desc->field_count > 1) |
724 | return false; |
725 | |
726 | if (desc->size) |
727 | est->size = sizeof(struct nft_rbtree) + |
728 | desc->size * sizeof(struct nft_rbtree_elem); |
729 | else |
730 | est->size = ~0; |
731 | |
732 | est->lookup = NFT_SET_CLASS_O_LOG_N; |
733 | est->space = NFT_SET_CLASS_O_N; |
734 | |
735 | return true; |
736 | } |
737 | |
738 | static void nft_rbtree_commit(struct nft_set *set) |
739 | { |
740 | struct nft_rbtree *priv = nft_set_priv(set); |
741 | |
742 | if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) |
743 | nft_rbtree_gc(set); |
744 | } |
745 | |
746 | static void nft_rbtree_gc_init(const struct nft_set *set) |
747 | { |
748 | struct nft_rbtree *priv = nft_set_priv(set); |
749 | |
750 | priv->last_gc = jiffies; |
751 | } |
752 | |
753 | const struct nft_set_type nft_set_rbtree_type = { |
754 | .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, |
755 | .ops = { |
756 | .privsize = nft_rbtree_privsize, |
757 | .elemsize = offsetof(struct nft_rbtree_elem, ext), |
758 | .estimate = nft_rbtree_estimate, |
759 | .init = nft_rbtree_init, |
760 | .destroy = nft_rbtree_destroy, |
761 | .insert = nft_rbtree_insert, |
762 | .remove = nft_rbtree_remove, |
763 | .deactivate = nft_rbtree_deactivate, |
764 | .flush = nft_rbtree_flush, |
765 | .activate = nft_rbtree_activate, |
766 | .commit = nft_rbtree_commit, |
767 | .gc_init = nft_rbtree_gc_init, |
768 | .lookup = nft_rbtree_lookup, |
769 | .walk = nft_rbtree_walk, |
770 | .get = nft_rbtree_get, |
771 | }, |
772 | }; |
773 | |