1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | #include <linux/kernel.h> |
3 | #include <linux/module.h> |
4 | #include <linux/init.h> |
5 | #include <linux/netlink.h> |
6 | #include <linux/netfilter.h> |
7 | #include <linux/workqueue.h> |
8 | #include <linux/spinlock.h> |
9 | #include <linux/netfilter/nf_conntrack_common.h> |
10 | #include <linux/netfilter/nf_tables.h> |
11 | #include <net/ip.h> /* for ipv4 options. */ |
12 | #include <net/netfilter/nf_tables.h> |
13 | #include <net/netfilter/nf_tables_core.h> |
14 | #include <net/netfilter/nf_conntrack_core.h> |
15 | #include <net/netfilter/nf_conntrack_extend.h> |
16 | #include <net/netfilter/nf_flow_table.h> |
17 | |
/* Private data carried by each "flow_offload" expression instance. */
struct nft_flow_offload {
	/* Flowtable looked up in ->init(); ->eval() adds flows to its ->data. */
	struct nft_flowtable	*flowtable;
};
21 | |
22 | static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) |
23 | { |
24 | if (dst_xfrm(dst)) |
25 | return FLOW_OFFLOAD_XMIT_XFRM; |
26 | |
27 | return FLOW_OFFLOAD_XMIT_NEIGH; |
28 | } |
29 | |
30 | static void nft_default_forward_path(struct nf_flow_route *route, |
31 | struct dst_entry *dst_cache, |
32 | enum ip_conntrack_dir dir) |
33 | { |
34 | route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; |
35 | route->tuple[dir].dst = dst_cache; |
36 | route->tuple[dir].xmit_type = nft_xmit_type(dst: dst_cache); |
37 | } |
38 | |
39 | static bool nft_is_valid_ether_device(const struct net_device *dev) |
40 | { |
41 | if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || |
42 | dev->addr_len != ETH_ALEN || !is_valid_ether_addr(addr: dev->dev_addr)) |
43 | return false; |
44 | |
45 | return true; |
46 | } |
47 | |
48 | static int nft_dev_fill_forward_path(const struct nf_flow_route *route, |
49 | const struct dst_entry *dst_cache, |
50 | const struct nf_conn *ct, |
51 | enum ip_conntrack_dir dir, u8 *ha, |
52 | struct net_device_path_stack *stack) |
53 | { |
54 | const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; |
55 | struct net_device *dev = dst_cache->dev; |
56 | struct neighbour *n; |
57 | u8 nud_state; |
58 | |
59 | if (!nft_is_valid_ether_device(dev)) |
60 | goto out; |
61 | |
62 | n = dst_neigh_lookup(dst: dst_cache, daddr); |
63 | if (!n) |
64 | return -1; |
65 | |
66 | read_lock_bh(&n->lock); |
67 | nud_state = n->nud_state; |
68 | ether_addr_copy(dst: ha, src: n->ha); |
69 | read_unlock_bh(&n->lock); |
70 | neigh_release(neigh: n); |
71 | |
72 | if (!(nud_state & NUD_VALID)) |
73 | return -1; |
74 | |
75 | out: |
76 | return dev_fill_forward_path(dev, daddr: ha, stack); |
77 | } |
78 | |
79 | struct nft_forward_info { |
80 | const struct net_device *indev; |
81 | const struct net_device *outdev; |
82 | const struct net_device *hw_outdev; |
83 | struct id { |
84 | __u16 id; |
85 | __be16 proto; |
86 | } encap[NF_FLOW_TABLE_ENCAP_MAX]; |
87 | u8 num_encaps; |
88 | u8 ingress_vlans; |
89 | u8 h_source[ETH_ALEN]; |
90 | u8 h_dest[ETH_ALEN]; |
91 | enum flow_offload_xmit_type xmit_type; |
92 | }; |
93 | |
94 | static void nft_dev_path_info(const struct net_device_path_stack *stack, |
95 | struct nft_forward_info *info, |
96 | unsigned char *ha, struct nf_flowtable *flowtable) |
97 | { |
98 | const struct net_device_path *path; |
99 | int i; |
100 | |
101 | memcpy(info->h_dest, ha, ETH_ALEN); |
102 | |
103 | for (i = 0; i < stack->num_paths; i++) { |
104 | path = &stack->path[i]; |
105 | switch (path->type) { |
106 | case DEV_PATH_ETHERNET: |
107 | case DEV_PATH_DSA: |
108 | case DEV_PATH_VLAN: |
109 | case DEV_PATH_PPPOE: |
110 | info->indev = path->dev; |
111 | if (is_zero_ether_addr(addr: info->h_source)) |
112 | memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); |
113 | |
114 | if (path->type == DEV_PATH_ETHERNET) |
115 | break; |
116 | if (path->type == DEV_PATH_DSA) { |
117 | i = stack->num_paths; |
118 | break; |
119 | } |
120 | |
121 | /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ |
122 | if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { |
123 | info->indev = NULL; |
124 | break; |
125 | } |
126 | if (!info->outdev) |
127 | info->outdev = path->dev; |
128 | info->encap[info->num_encaps].id = path->encap.id; |
129 | info->encap[info->num_encaps].proto = path->encap.proto; |
130 | info->num_encaps++; |
131 | if (path->type == DEV_PATH_PPPOE) |
132 | memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); |
133 | break; |
134 | case DEV_PATH_BRIDGE: |
135 | if (is_zero_ether_addr(addr: info->h_source)) |
136 | memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); |
137 | |
138 | switch (path->bridge.vlan_mode) { |
139 | case DEV_PATH_BR_VLAN_UNTAG_HW: |
140 | info->ingress_vlans |= BIT(info->num_encaps - 1); |
141 | break; |
142 | case DEV_PATH_BR_VLAN_TAG: |
143 | info->encap[info->num_encaps].id = path->bridge.vlan_id; |
144 | info->encap[info->num_encaps].proto = path->bridge.vlan_proto; |
145 | info->num_encaps++; |
146 | break; |
147 | case DEV_PATH_BR_VLAN_UNTAG: |
148 | info->num_encaps--; |
149 | break; |
150 | case DEV_PATH_BR_VLAN_KEEP: |
151 | break; |
152 | } |
153 | info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; |
154 | break; |
155 | default: |
156 | info->indev = NULL; |
157 | break; |
158 | } |
159 | } |
160 | if (!info->outdev) |
161 | info->outdev = info->indev; |
162 | |
163 | info->hw_outdev = info->indev; |
164 | |
165 | if (nf_flowtable_hw_offload(flowtable) && |
166 | nft_is_valid_ether_device(dev: info->indev)) |
167 | info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; |
168 | } |
169 | |
170 | static bool nft_flowtable_find_dev(const struct net_device *dev, |
171 | struct nft_flowtable *ft) |
172 | { |
173 | struct nft_hook *hook; |
174 | bool found = false; |
175 | |
176 | list_for_each_entry_rcu(hook, &ft->hook_list, list) { |
177 | if (hook->ops.dev != dev) |
178 | continue; |
179 | |
180 | found = true; |
181 | break; |
182 | } |
183 | |
184 | return found; |
185 | } |
186 | |
187 | static void nft_dev_forward_path(struct nf_flow_route *route, |
188 | const struct nf_conn *ct, |
189 | enum ip_conntrack_dir dir, |
190 | struct nft_flowtable *ft) |
191 | { |
192 | const struct dst_entry *dst = route->tuple[dir].dst; |
193 | struct net_device_path_stack stack; |
194 | struct nft_forward_info info = {}; |
195 | unsigned char ha[ETH_ALEN]; |
196 | int i; |
197 | |
198 | if (nft_dev_fill_forward_path(route, dst_cache: dst, ct, dir, ha, stack: &stack) >= 0) |
199 | nft_dev_path_info(stack: &stack, info: &info, ha, flowtable: &ft->data); |
200 | |
201 | if (!info.indev || !nft_flowtable_find_dev(dev: info.indev, ft)) |
202 | return; |
203 | |
204 | route->tuple[!dir].in.ifindex = info.indev->ifindex; |
205 | for (i = 0; i < info.num_encaps; i++) { |
206 | route->tuple[!dir].in.encap[i].id = info.encap[i].id; |
207 | route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; |
208 | } |
209 | route->tuple[!dir].in.num_encaps = info.num_encaps; |
210 | route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; |
211 | |
212 | if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { |
213 | memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); |
214 | memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); |
215 | route->tuple[dir].out.ifindex = info.outdev->ifindex; |
216 | route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; |
217 | route->tuple[dir].xmit_type = info.xmit_type; |
218 | } |
219 | } |
220 | |
221 | static int nft_flow_route(const struct nft_pktinfo *pkt, |
222 | const struct nf_conn *ct, |
223 | struct nf_flow_route *route, |
224 | enum ip_conntrack_dir dir, |
225 | struct nft_flowtable *ft) |
226 | { |
227 | struct dst_entry *this_dst = skb_dst(skb: pkt->skb); |
228 | struct dst_entry *other_dst = NULL; |
229 | struct flowi fl; |
230 | |
231 | memset(&fl, 0, sizeof(fl)); |
232 | switch (nft_pf(pkt)) { |
233 | case NFPROTO_IPV4: |
234 | fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; |
235 | fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; |
236 | fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; |
237 | fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; |
238 | fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos); |
239 | fl.u.ip4.flowi4_mark = pkt->skb->mark; |
240 | fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; |
241 | break; |
242 | case NFPROTO_IPV6: |
243 | fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; |
244 | fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; |
245 | fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; |
246 | fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; |
247 | fl.u.ip6.flowlabel = ip6_flowinfo(hdr: ipv6_hdr(skb: pkt->skb)); |
248 | fl.u.ip6.flowi6_mark = pkt->skb->mark; |
249 | fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; |
250 | break; |
251 | } |
252 | |
253 | if (!dst_hold_safe(dst: this_dst)) |
254 | return -ENOENT; |
255 | |
256 | nf_route(net: nft_net(pkt), dst: &other_dst, fl: &fl, strict: false, family: nft_pf(pkt)); |
257 | if (!other_dst) { |
258 | dst_release(dst: this_dst); |
259 | return -ENOENT; |
260 | } |
261 | |
262 | nft_default_forward_path(route, dst_cache: this_dst, dir); |
263 | nft_default_forward_path(route, dst_cache: other_dst, dir: !dir); |
264 | |
265 | if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && |
266 | route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { |
267 | nft_dev_forward_path(route, ct, dir, ft); |
268 | nft_dev_forward_path(route, ct, dir: !dir, ft); |
269 | } |
270 | |
271 | return 0; |
272 | } |
273 | |
274 | static bool nft_flow_offload_skip(struct sk_buff *skb, int family) |
275 | { |
276 | if (skb_sec_path(skb)) |
277 | return true; |
278 | |
279 | if (family == NFPROTO_IPV4) { |
280 | const struct ip_options *opt; |
281 | |
282 | opt = &(IPCB(skb)->opt); |
283 | |
284 | if (unlikely(opt->optlen)) |
285 | return true; |
286 | } |
287 | |
288 | return false; |
289 | } |
290 | |
291 | static void nft_flow_offload_eval(const struct nft_expr *expr, |
292 | struct nft_regs *regs, |
293 | const struct nft_pktinfo *pkt) |
294 | { |
295 | struct nft_flow_offload *priv = nft_expr_priv(expr); |
296 | struct nf_flowtable *flowtable = &priv->flowtable->data; |
297 | struct tcphdr _tcph, *tcph = NULL; |
298 | struct nf_flow_route route = {}; |
299 | enum ip_conntrack_info ctinfo; |
300 | struct flow_offload *flow; |
301 | enum ip_conntrack_dir dir; |
302 | struct nf_conn *ct; |
303 | int ret; |
304 | |
305 | if (nft_flow_offload_skip(skb: pkt->skb, family: nft_pf(pkt))) |
306 | goto out; |
307 | |
308 | ct = nf_ct_get(skb: pkt->skb, ctinfo: &ctinfo); |
309 | if (!ct) |
310 | goto out; |
311 | |
312 | switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { |
313 | case IPPROTO_TCP: |
314 | tcph = skb_header_pointer(skb: pkt->skb, offset: nft_thoff(pkt), |
315 | len: sizeof(_tcph), buffer: &_tcph); |
316 | if (unlikely(!tcph || tcph->fin || tcph->rst || |
317 | !nf_conntrack_tcp_established(ct))) |
318 | goto out; |
319 | break; |
320 | case IPPROTO_UDP: |
321 | break; |
322 | #ifdef CONFIG_NF_CT_PROTO_GRE |
323 | case IPPROTO_GRE: { |
324 | struct nf_conntrack_tuple *tuple; |
325 | |
326 | if (ct->status & IPS_NAT_MASK) |
327 | goto out; |
328 | tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; |
329 | /* No support for GRE v1 */ |
330 | if (tuple->src.u.gre.key || tuple->dst.u.gre.key) |
331 | goto out; |
332 | break; |
333 | } |
334 | #endif |
335 | default: |
336 | goto out; |
337 | } |
338 | |
339 | if (nf_ct_ext_exist(ct, id: NF_CT_EXT_HELPER) || |
340 | ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH)) |
341 | goto out; |
342 | |
343 | if (!nf_ct_is_confirmed(ct)) |
344 | goto out; |
345 | |
346 | if (test_and_set_bit(nr: IPS_OFFLOAD_BIT, addr: &ct->status)) |
347 | goto out; |
348 | |
349 | dir = CTINFO2DIR(ctinfo); |
350 | if (nft_flow_route(pkt, ct, route: &route, dir, ft: priv->flowtable) < 0) |
351 | goto err_flow_route; |
352 | |
353 | flow = flow_offload_alloc(ct); |
354 | if (!flow) |
355 | goto err_flow_alloc; |
356 | |
357 | flow_offload_route_init(flow, route: &route); |
358 | |
359 | if (tcph) { |
360 | ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; |
361 | ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; |
362 | } |
363 | |
364 | __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); |
365 | ret = flow_offload_add(flow_table: flowtable, flow); |
366 | if (ret < 0) |
367 | goto err_flow_add; |
368 | |
369 | return; |
370 | |
371 | err_flow_add: |
372 | flow_offload_free(flow); |
373 | err_flow_alloc: |
374 | dst_release(dst: route.tuple[dir].dst); |
375 | dst_release(dst: route.tuple[!dir].dst); |
376 | err_flow_route: |
377 | clear_bit(nr: IPS_OFFLOAD_BIT, addr: &ct->status); |
378 | out: |
379 | regs->verdict.code = NFT_BREAK; |
380 | } |
381 | |
382 | static int nft_flow_offload_validate(const struct nft_ctx *ctx, |
383 | const struct nft_expr *expr, |
384 | const struct nft_data **data) |
385 | { |
386 | unsigned int hook_mask = (1 << NF_INET_FORWARD); |
387 | |
388 | if (ctx->family != NFPROTO_IPV4 && |
389 | ctx->family != NFPROTO_IPV6 && |
390 | ctx->family != NFPROTO_INET) |
391 | return -EOPNOTSUPP; |
392 | |
393 | return nft_chain_validate_hooks(chain: ctx->chain, hook_flags: hook_mask); |
394 | } |
395 | |
396 | static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = { |
397 | [NFTA_FLOW_TABLE_NAME] = { .type = NLA_STRING, |
398 | .len = NFT_NAME_MAXLEN - 1 }, |
399 | }; |
400 | |
401 | static int nft_flow_offload_init(const struct nft_ctx *ctx, |
402 | const struct nft_expr *expr, |
403 | const struct nlattr * const tb[]) |
404 | { |
405 | struct nft_flow_offload *priv = nft_expr_priv(expr); |
406 | u8 genmask = nft_genmask_next(net: ctx->net); |
407 | struct nft_flowtable *flowtable; |
408 | |
409 | if (!tb[NFTA_FLOW_TABLE_NAME]) |
410 | return -EINVAL; |
411 | |
412 | flowtable = nft_flowtable_lookup(table: ctx->table, nla: tb[NFTA_FLOW_TABLE_NAME], |
413 | genmask); |
414 | if (IS_ERR(ptr: flowtable)) |
415 | return PTR_ERR(ptr: flowtable); |
416 | |
417 | if (!nft_use_inc(use: &flowtable->use)) |
418 | return -EMFILE; |
419 | |
420 | priv->flowtable = flowtable; |
421 | |
422 | return nf_ct_netns_get(net: ctx->net, nfproto: ctx->family); |
423 | } |
424 | |
425 | static void nft_flow_offload_deactivate(const struct nft_ctx *ctx, |
426 | const struct nft_expr *expr, |
427 | enum nft_trans_phase phase) |
428 | { |
429 | struct nft_flow_offload *priv = nft_expr_priv(expr); |
430 | |
431 | nf_tables_deactivate_flowtable(ctx, flowtable: priv->flowtable, phase); |
432 | } |
433 | |
434 | static void nft_flow_offload_activate(const struct nft_ctx *ctx, |
435 | const struct nft_expr *expr) |
436 | { |
437 | struct nft_flow_offload *priv = nft_expr_priv(expr); |
438 | |
439 | nft_use_inc_restore(use: &priv->flowtable->use); |
440 | } |
441 | |
442 | static void nft_flow_offload_destroy(const struct nft_ctx *ctx, |
443 | const struct nft_expr *expr) |
444 | { |
445 | nf_ct_netns_put(net: ctx->net, nfproto: ctx->family); |
446 | } |
447 | |
448 | static int nft_flow_offload_dump(struct sk_buff *skb, |
449 | const struct nft_expr *expr, bool reset) |
450 | { |
451 | struct nft_flow_offload *priv = nft_expr_priv(expr); |
452 | |
453 | if (nla_put_string(skb, attrtype: NFTA_FLOW_TABLE_NAME, str: priv->flowtable->name)) |
454 | goto nla_put_failure; |
455 | |
456 | return 0; |
457 | |
458 | nla_put_failure: |
459 | return -1; |
460 | } |
461 | |
462 | static struct nft_expr_type nft_flow_offload_type; |
463 | static const struct nft_expr_ops nft_flow_offload_ops = { |
464 | .type = &nft_flow_offload_type, |
465 | .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)), |
466 | .eval = nft_flow_offload_eval, |
467 | .init = nft_flow_offload_init, |
468 | .activate = nft_flow_offload_activate, |
469 | .deactivate = nft_flow_offload_deactivate, |
470 | .destroy = nft_flow_offload_destroy, |
471 | .validate = nft_flow_offload_validate, |
472 | .dump = nft_flow_offload_dump, |
473 | .reduce = NFT_REDUCE_READONLY, |
474 | }; |
475 | |
476 | static struct nft_expr_type nft_flow_offload_type __read_mostly = { |
477 | .name = "flow_offload" , |
478 | .ops = &nft_flow_offload_ops, |
479 | .policy = nft_flow_offload_policy, |
480 | .maxattr = NFTA_FLOW_MAX, |
481 | .owner = THIS_MODULE, |
482 | }; |
483 | |
484 | static int flow_offload_netdev_event(struct notifier_block *this, |
485 | unsigned long event, void *ptr) |
486 | { |
487 | struct net_device *dev = netdev_notifier_info_to_dev(info: ptr); |
488 | |
489 | if (event != NETDEV_DOWN) |
490 | return NOTIFY_DONE; |
491 | |
492 | nf_flow_table_cleanup(dev); |
493 | |
494 | return NOTIFY_DONE; |
495 | } |
496 | |
497 | static struct notifier_block flow_offload_netdev_notifier = { |
498 | .notifier_call = flow_offload_netdev_event, |
499 | }; |
500 | |
501 | static int __init nft_flow_offload_module_init(void) |
502 | { |
503 | int err; |
504 | |
505 | err = register_netdevice_notifier(nb: &flow_offload_netdev_notifier); |
506 | if (err) |
507 | goto err; |
508 | |
509 | err = nft_register_expr(&nft_flow_offload_type); |
510 | if (err < 0) |
511 | goto register_expr; |
512 | |
513 | return 0; |
514 | |
515 | register_expr: |
516 | unregister_netdevice_notifier(nb: &flow_offload_netdev_notifier); |
517 | err: |
518 | return err; |
519 | } |
520 | |
521 | static void __exit nft_flow_offload_module_exit(void) |
522 | { |
523 | nft_unregister_expr(&nft_flow_offload_type); |
524 | unregister_netdevice_notifier(nb: &flow_offload_netdev_notifier); |
525 | } |
526 | |
527 | module_init(nft_flow_offload_module_init); |
528 | module_exit(nft_flow_offload_module_exit); |
529 | |
530 | MODULE_LICENSE("GPL" ); |
531 | MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>" ); |
532 | MODULE_ALIAS_NFT_EXPR("flow_offload" ); |
533 | MODULE_DESCRIPTION("nftables hardware flow offload module" ); |
534 | |