1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * ip_vs_xmit.c: various packet transmitters for IPVS |
4 | * |
5 | * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> |
6 | * Julian Anastasov <ja@ssi.bg> |
7 | * |
8 | * Changes: |
9 | * |
10 | * Description of forwarding methods: |
11 | * - all transmitters are called from LOCAL_IN (remote clients) and |
12 | * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD |
13 | * - not all connections have destination server, for example, |
14 | * connections in backup server when fwmark is used |
15 | * - bypass connections use daddr from packet |
16 | * - we can use dst without ref while sending in RCU section, we use |
17 | * ref when returning NF_ACCEPT for NAT-ed packet via loopback |
18 | * LOCAL_OUT rules: |
19 | * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) |
20 | * - skb->pkt_type is not set yet |
21 | * - the only place where we can see skb->sk != NULL |
22 | */ |
23 | |
24 | #define KMSG_COMPONENT "IPVS" |
25 | #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt |
26 | |
27 | #include <linux/kernel.h> |
28 | #include <linux/slab.h> |
29 | #include <linux/tcp.h> /* for tcphdr */ |
30 | #include <net/ip.h> |
31 | #include <net/gue.h> |
32 | #include <net/gre.h> |
33 | #include <net/tcp.h> /* for csum_tcpudp_magic */ |
34 | #include <net/udp.h> |
35 | #include <net/icmp.h> /* for icmp_send */ |
36 | #include <net/route.h> /* for ip_route_output */ |
37 | #include <net/ipv6.h> |
38 | #include <net/ip6_route.h> |
39 | #include <net/ip_tunnels.h> |
40 | #include <net/ip6_checksum.h> |
41 | #include <net/addrconf.h> |
42 | #include <linux/icmpv6.h> |
43 | #include <linux/netfilter.h> |
44 | #include <linux/netfilter_ipv4.h> |
45 | |
46 | #include <net/ip_vs.h> |
47 | |
/* Route lookup policy bits; callers OR them together into rt_mode */
enum {
	IP_VS_RT_MODE_LOCAL	= 1 << 0,	/* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 1 << 1,	/* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 1 << 2,	/* Allow redirect from remote
						 * daddr to local
						 */
	IP_VS_RT_MODE_CONNECT	= 1 << 3,	/* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 1 << 4,	/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 1 << 5,	/* Tunnel mode */
};
58 | |
59 | static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void) |
60 | { |
61 | return kmalloc(size: sizeof(struct ip_vs_dest_dst), GFP_ATOMIC); |
62 | } |
63 | |
/* Release a route cache entry obtained from ip_vs_dest_dst_alloc() */
static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}
68 | |
69 | /* |
70 | * Destination cache to speed up outgoing route lookup |
71 | */ |
72 | static inline void |
73 | __ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst, |
74 | struct dst_entry *dst, u32 dst_cookie) |
75 | { |
76 | struct ip_vs_dest_dst *old; |
77 | |
78 | old = rcu_dereference_protected(dest->dest_dst, |
79 | lockdep_is_held(&dest->dst_lock)); |
80 | |
81 | if (dest_dst) { |
82 | dest_dst->dst_cache = dst; |
83 | dest_dst->dst_cookie = dst_cookie; |
84 | } |
85 | rcu_assign_pointer(dest->dest_dst, dest_dst); |
86 | |
87 | if (old) |
88 | call_rcu(head: &old->rcu_head, func: ip_vs_dest_dst_rcu_free); |
89 | } |
90 | |
91 | static inline struct ip_vs_dest_dst * |
92 | __ip_vs_dst_check(struct ip_vs_dest *dest) |
93 | { |
94 | struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst); |
95 | struct dst_entry *dst; |
96 | |
97 | if (!dest_dst) |
98 | return NULL; |
99 | dst = dest_dst->dst_cache; |
100 | if (dst->obsolete && |
101 | dst->ops->check(dst, dest_dst->dst_cookie) == NULL) |
102 | return NULL; |
103 | return dest_dst; |
104 | } |
105 | |
106 | static inline bool |
107 | __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) |
108 | { |
109 | if (IP6CB(skb)->frag_max_size) { |
110 | /* frag_max_size tell us that, this packet have been |
111 | * defragmented by netfilter IPv6 conntrack module. |
112 | */ |
113 | if (IP6CB(skb)->frag_max_size > mtu) |
114 | return true; /* largest fragment violate MTU */ |
115 | } |
116 | else if (skb->len > mtu && !skb_is_gso(skb)) { |
117 | return true; /* Packet size violate MTU size */ |
118 | } |
119 | return false; |
120 | } |
121 | |
122 | /* Get route to daddr, update *saddr, optionally bind route to saddr */ |
123 | static struct rtable *do_output_route4(struct net *net, __be32 daddr, |
124 | int rt_mode, __be32 *saddr) |
125 | { |
126 | struct flowi4 fl4; |
127 | struct rtable *rt; |
128 | bool loop = false; |
129 | |
130 | memset(&fl4, 0, sizeof(fl4)); |
131 | fl4.daddr = daddr; |
132 | fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? |
133 | FLOWI_FLAG_KNOWN_NH : 0; |
134 | |
135 | retry: |
136 | rt = ip_route_output_key(net, flp: &fl4); |
137 | if (IS_ERR(ptr: rt)) { |
138 | /* Invalid saddr ? */ |
139 | if (PTR_ERR(ptr: rt) == -EINVAL && *saddr && |
140 | rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { |
141 | *saddr = 0; |
142 | flowi4_update_output(fl4: &fl4, oif: 0, daddr, saddr: 0); |
143 | goto retry; |
144 | } |
145 | IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n" , &daddr); |
146 | return NULL; |
147 | } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { |
148 | ip_rt_put(rt); |
149 | *saddr = fl4.saddr; |
150 | flowi4_update_output(fl4: &fl4, oif: 0, daddr, saddr: fl4.saddr); |
151 | loop = true; |
152 | goto retry; |
153 | } |
154 | *saddr = fl4.saddr; |
155 | return rt; |
156 | } |
157 | |
#ifdef CONFIG_IP_VS_IPV6
/* Non-zero when the route has an output device with IFF_LOOPBACK set */
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	return dev && dev->flags & IFF_LOOPBACK;
}
#endif
164 | |
165 | static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb, |
166 | int rt_mode, |
167 | bool new_rt_is_local) |
168 | { |
169 | bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL); |
170 | bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL); |
171 | bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR); |
172 | bool source_is_loopback; |
173 | bool old_rt_is_local; |
174 | |
175 | #ifdef CONFIG_IP_VS_IPV6 |
176 | if (skb_af == AF_INET6) { |
177 | int addr_type = ipv6_addr_type(addr: &ipv6_hdr(skb)->saddr); |
178 | |
179 | source_is_loopback = |
180 | (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && |
181 | (addr_type & IPV6_ADDR_LOOPBACK); |
182 | old_rt_is_local = __ip_vs_is_local_route6( |
183 | rt: (struct rt6_info *)skb_dst(skb)); |
184 | } else |
185 | #endif |
186 | { |
187 | source_is_loopback = ipv4_is_loopback(addr: ip_hdr(skb)->saddr); |
188 | old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL; |
189 | } |
190 | |
191 | if (unlikely(new_rt_is_local)) { |
192 | if (!rt_mode_allow_local) |
193 | return true; |
194 | if (!rt_mode_allow_redirect && !old_rt_is_local) |
195 | return true; |
196 | } else { |
197 | if (!rt_mode_allow_non_local) |
198 | return true; |
199 | if (source_is_loopback) |
200 | return true; |
201 | } |
202 | return false; |
203 | } |
204 | |
205 | static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu) |
206 | { |
207 | struct sock *sk = skb->sk; |
208 | struct rtable *ort = skb_rtable(skb); |
209 | |
210 | if (!skb->dev && sk && sk_fullsock(sk)) |
211 | ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true); |
212 | } |
213 | |
214 | static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af, |
215 | int rt_mode, |
216 | struct ip_vs_iphdr *ipvsh, |
217 | struct sk_buff *skb, int mtu) |
218 | { |
219 | #ifdef CONFIG_IP_VS_IPV6 |
220 | if (skb_af == AF_INET6) { |
221 | struct net *net = ipvs->net; |
222 | |
223 | if (unlikely(__mtu_check_toobig_v6(skb, mtu))) { |
224 | if (!skb->dev) |
225 | skb->dev = net->loopback_dev; |
226 | /* only send ICMP too big on first fragment */ |
227 | if (!ipvsh->fragoffs && !ip_vs_iph_icmp(iph: ipvsh)) |
228 | icmpv6_send(skb, ICMPV6_PKT_TOOBIG, code: 0, info: mtu); |
229 | IP_VS_DBG(1, "frag needed for %pI6c\n" , |
230 | &ipv6_hdr(skb)->saddr); |
231 | return false; |
232 | } |
233 | } else |
234 | #endif |
235 | { |
236 | /* If we're going to tunnel the packet and pmtu discovery |
237 | * is disabled, we'll just fragment it anyway |
238 | */ |
239 | if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs)) |
240 | return true; |
241 | |
242 | if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) && |
243 | skb->len > mtu && !skb_is_gso(skb) && |
244 | !ip_vs_iph_icmp(ipvsh))) { |
245 | icmp_send(skb_in: skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, |
246 | htonl(mtu)); |
247 | IP_VS_DBG(1, "frag needed for %pI4\n" , |
248 | &ip_hdr(skb)->saddr); |
249 | return false; |
250 | } |
251 | } |
252 | |
253 | return true; |
254 | } |
255 | |
256 | static inline bool decrement_ttl(struct netns_ipvs *ipvs, |
257 | int skb_af, |
258 | struct sk_buff *skb) |
259 | { |
260 | struct net *net = ipvs->net; |
261 | |
262 | #ifdef CONFIG_IP_VS_IPV6 |
263 | if (skb_af == AF_INET6) { |
264 | struct dst_entry *dst = skb_dst(skb); |
265 | |
266 | /* check and decrement ttl */ |
267 | if (ipv6_hdr(skb)->hop_limit <= 1) { |
268 | struct inet6_dev *idev = __in6_dev_get_safely(dev: skb->dev); |
269 | |
270 | /* Force OUTPUT device used as source address */ |
271 | skb->dev = dst->dev; |
272 | icmpv6_send(skb, ICMPV6_TIME_EXCEED, |
273 | ICMPV6_EXC_HOPLIMIT, info: 0); |
274 | IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); |
275 | |
276 | return false; |
277 | } |
278 | |
279 | /* don't propagate ttl change to cloned packets */ |
280 | if (skb_ensure_writable(skb, write_len: sizeof(struct ipv6hdr))) |
281 | return false; |
282 | |
283 | ipv6_hdr(skb)->hop_limit--; |
284 | } else |
285 | #endif |
286 | { |
287 | if (ip_hdr(skb)->ttl <= 1) { |
288 | /* Tell the sender its packet died... */ |
289 | IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); |
290 | icmp_send(skb_in: skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, info: 0); |
291 | return false; |
292 | } |
293 | |
294 | /* don't propagate ttl change to cloned packets */ |
295 | if (skb_ensure_writable(skb, write_len: sizeof(struct iphdr))) |
296 | return false; |
297 | |
298 | /* Decrease ttl */ |
299 | ip_decrease_ttl(iph: ip_hdr(skb)); |
300 | } |
301 | |
302 | return true; |
303 | } |
304 | |
305 | /* Get route to destination or remote server */ |
306 | static int |
307 | __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, |
308 | struct ip_vs_dest *dest, |
309 | __be32 daddr, int rt_mode, __be32 *ret_saddr, |
310 | struct ip_vs_iphdr *ipvsh) |
311 | { |
312 | struct net *net = ipvs->net; |
313 | struct ip_vs_dest_dst *dest_dst; |
314 | struct rtable *rt; /* Route to the other host */ |
315 | int mtu; |
316 | int local, noref = 1; |
317 | |
318 | if (dest) { |
319 | dest_dst = __ip_vs_dst_check(dest); |
320 | if (likely(dest_dst)) |
321 | rt = (struct rtable *) dest_dst->dst_cache; |
322 | else { |
323 | dest_dst = ip_vs_dest_dst_alloc(); |
324 | spin_lock_bh(lock: &dest->dst_lock); |
325 | if (!dest_dst) { |
326 | __ip_vs_dst_set(dest, NULL, NULL, dst_cookie: 0); |
327 | spin_unlock_bh(lock: &dest->dst_lock); |
328 | goto err_unreach; |
329 | } |
330 | rt = do_output_route4(net, daddr: dest->addr.ip, rt_mode, |
331 | saddr: &dest_dst->dst_saddr.ip); |
332 | if (!rt) { |
333 | __ip_vs_dst_set(dest, NULL, NULL, dst_cookie: 0); |
334 | spin_unlock_bh(lock: &dest->dst_lock); |
335 | ip_vs_dest_dst_free(dest_dst); |
336 | goto err_unreach; |
337 | } |
338 | __ip_vs_dst_set(dest, dest_dst, dst: &rt->dst, dst_cookie: 0); |
339 | spin_unlock_bh(lock: &dest->dst_lock); |
340 | IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n" , |
341 | &dest->addr.ip, &dest_dst->dst_saddr.ip, |
342 | rcuref_read(&rt->dst.__rcuref)); |
343 | } |
344 | if (ret_saddr) |
345 | *ret_saddr = dest_dst->dst_saddr.ip; |
346 | } else { |
347 | __be32 saddr = htonl(INADDR_ANY); |
348 | |
349 | noref = 0; |
350 | |
351 | /* For such unconfigured boxes avoid many route lookups |
352 | * for performance reasons because we do not remember saddr |
353 | */ |
354 | rt_mode &= ~IP_VS_RT_MODE_CONNECT; |
355 | rt = do_output_route4(net, daddr, rt_mode, saddr: &saddr); |
356 | if (!rt) |
357 | goto err_unreach; |
358 | if (ret_saddr) |
359 | *ret_saddr = saddr; |
360 | } |
361 | |
362 | local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; |
363 | if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, |
364 | local))) { |
365 | IP_VS_DBG_RL("We are crossing local and non-local addresses" |
366 | " daddr=%pI4\n" , &daddr); |
367 | goto err_put; |
368 | } |
369 | |
370 | if (unlikely(local)) { |
371 | /* skb to local stack, preserve old route */ |
372 | if (!noref) |
373 | ip_rt_put(rt); |
374 | return local; |
375 | } |
376 | |
377 | if (!decrement_ttl(ipvs, skb_af, skb)) |
378 | goto err_put; |
379 | |
380 | if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) { |
381 | mtu = dst_mtu(dst: &rt->dst); |
382 | } else { |
383 | mtu = dst_mtu(dst: &rt->dst) - sizeof(struct iphdr); |
384 | if (!dest) |
385 | goto err_put; |
386 | if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
387 | mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); |
388 | if ((dest->tun_flags & |
389 | IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && |
390 | skb->ip_summed == CHECKSUM_PARTIAL) |
391 | mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; |
392 | } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
393 | __be16 tflags = 0; |
394 | |
395 | if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) |
396 | tflags |= TUNNEL_CSUM; |
397 | mtu -= gre_calc_hlen(o_flags: tflags); |
398 | } |
399 | if (mtu < 68) { |
400 | IP_VS_DBG_RL("%s(): mtu less than 68\n" , __func__); |
401 | goto err_put; |
402 | } |
403 | maybe_update_pmtu(skb_af, skb, mtu); |
404 | } |
405 | |
406 | if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) |
407 | goto err_put; |
408 | |
409 | skb_dst_drop(skb); |
410 | if (noref) |
411 | skb_dst_set_noref(skb, dst: &rt->dst); |
412 | else |
413 | skb_dst_set(skb, dst: &rt->dst); |
414 | |
415 | return local; |
416 | |
417 | err_put: |
418 | if (!noref) |
419 | ip_rt_put(rt); |
420 | return -1; |
421 | |
422 | err_unreach: |
423 | dst_link_failure(skb); |
424 | return -1; |
425 | } |
426 | |
427 | #ifdef CONFIG_IP_VS_IPV6 |
428 | static struct dst_entry * |
429 | __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, |
430 | struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) |
431 | { |
432 | struct dst_entry *dst; |
433 | struct flowi6 fl6 = { |
434 | .daddr = *daddr, |
435 | }; |
436 | |
437 | if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) |
438 | fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; |
439 | |
440 | dst = ip6_route_output(net, NULL, fl6: &fl6); |
441 | if (dst->error) |
442 | goto out_err; |
443 | if (!ret_saddr) |
444 | return dst; |
445 | if (ipv6_addr_any(a: &fl6.saddr) && |
446 | ipv6_dev_get_saddr(net, dev: ip6_dst_idev(dst)->dev, |
447 | daddr: &fl6.daddr, srcprefs: 0, saddr: &fl6.saddr) < 0) |
448 | goto out_err; |
449 | if (do_xfrm) { |
450 | dst = xfrm_lookup(net, dst_orig: dst, fl: flowi6_to_flowi(fl6: &fl6), NULL, flags: 0); |
451 | if (IS_ERR(ptr: dst)) { |
452 | dst = NULL; |
453 | goto out_err; |
454 | } |
455 | } |
456 | *ret_saddr = fl6.saddr; |
457 | return dst; |
458 | |
459 | out_err: |
460 | dst_release(dst); |
461 | IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n" , daddr); |
462 | return NULL; |
463 | } |
464 | |
465 | /* |
466 | * Get route to destination or remote server |
467 | */ |
468 | static int |
469 | __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, |
470 | struct ip_vs_dest *dest, |
471 | struct in6_addr *daddr, struct in6_addr *ret_saddr, |
472 | struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode) |
473 | { |
474 | struct net *net = ipvs->net; |
475 | struct ip_vs_dest_dst *dest_dst; |
476 | struct rt6_info *rt; /* Route to the other host */ |
477 | struct dst_entry *dst; |
478 | int mtu; |
479 | int local, noref = 1; |
480 | |
481 | if (dest) { |
482 | dest_dst = __ip_vs_dst_check(dest); |
483 | if (likely(dest_dst)) |
484 | rt = (struct rt6_info *) dest_dst->dst_cache; |
485 | else { |
486 | u32 cookie; |
487 | |
488 | dest_dst = ip_vs_dest_dst_alloc(); |
489 | spin_lock_bh(lock: &dest->dst_lock); |
490 | if (!dest_dst) { |
491 | __ip_vs_dst_set(dest, NULL, NULL, dst_cookie: 0); |
492 | spin_unlock_bh(lock: &dest->dst_lock); |
493 | goto err_unreach; |
494 | } |
495 | dst = __ip_vs_route_output_v6(net, daddr: &dest->addr.in6, |
496 | ret_saddr: &dest_dst->dst_saddr.in6, |
497 | do_xfrm, rt_mode); |
498 | if (!dst) { |
499 | __ip_vs_dst_set(dest, NULL, NULL, dst_cookie: 0); |
500 | spin_unlock_bh(lock: &dest->dst_lock); |
501 | ip_vs_dest_dst_free(dest_dst); |
502 | goto err_unreach; |
503 | } |
504 | rt = (struct rt6_info *) dst; |
505 | cookie = rt6_get_cookie(rt); |
506 | __ip_vs_dst_set(dest, dest_dst, dst: &rt->dst, dst_cookie: cookie); |
507 | spin_unlock_bh(lock: &dest->dst_lock); |
508 | IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n" , |
509 | &dest->addr.in6, &dest_dst->dst_saddr.in6, |
510 | rcuref_read(&rt->dst.__rcuref)); |
511 | } |
512 | if (ret_saddr) |
513 | *ret_saddr = dest_dst->dst_saddr.in6; |
514 | } else { |
515 | noref = 0; |
516 | dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, |
517 | rt_mode); |
518 | if (!dst) |
519 | goto err_unreach; |
520 | rt = (struct rt6_info *) dst; |
521 | } |
522 | |
523 | local = __ip_vs_is_local_route6(rt); |
524 | |
525 | if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode, |
526 | local))) { |
527 | IP_VS_DBG_RL("We are crossing local and non-local addresses" |
528 | " daddr=%pI6\n" , daddr); |
529 | goto err_put; |
530 | } |
531 | |
532 | if (unlikely(local)) { |
533 | /* skb to local stack, preserve old route */ |
534 | if (!noref) |
535 | dst_release(dst: &rt->dst); |
536 | return local; |
537 | } |
538 | |
539 | if (!decrement_ttl(ipvs, skb_af, skb)) |
540 | goto err_put; |
541 | |
542 | /* MTU checking */ |
543 | if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) |
544 | mtu = dst_mtu(dst: &rt->dst); |
545 | else { |
546 | mtu = dst_mtu(dst: &rt->dst) - sizeof(struct ipv6hdr); |
547 | if (!dest) |
548 | goto err_put; |
549 | if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
550 | mtu -= sizeof(struct udphdr) + sizeof(struct guehdr); |
551 | if ((dest->tun_flags & |
552 | IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && |
553 | skb->ip_summed == CHECKSUM_PARTIAL) |
554 | mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV; |
555 | } else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
556 | __be16 tflags = 0; |
557 | |
558 | if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) |
559 | tflags |= TUNNEL_CSUM; |
560 | mtu -= gre_calc_hlen(o_flags: tflags); |
561 | } |
562 | if (mtu < IPV6_MIN_MTU) { |
563 | IP_VS_DBG_RL("%s(): mtu less than %d\n" , __func__, |
564 | IPV6_MIN_MTU); |
565 | goto err_put; |
566 | } |
567 | maybe_update_pmtu(skb_af, skb, mtu); |
568 | } |
569 | |
570 | if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu)) |
571 | goto err_put; |
572 | |
573 | skb_dst_drop(skb); |
574 | if (noref) |
575 | skb_dst_set_noref(skb, dst: &rt->dst); |
576 | else |
577 | skb_dst_set(skb, dst: &rt->dst); |
578 | |
579 | return local; |
580 | |
581 | err_put: |
582 | if (!noref) |
583 | dst_release(dst: &rt->dst); |
584 | return -1; |
585 | |
586 | err_unreach: |
587 | /* The ip6_link_failure function requires the dev field to be set |
588 | * in order to get the net (further for the sake of fwmark |
589 | * reflection). |
590 | */ |
591 | if (!skb->dev) |
592 | skb->dev = skb_dst(skb)->dev; |
593 | |
594 | dst_link_failure(skb); |
595 | return -1; |
596 | } |
597 | #endif |
598 | |
599 | |
600 | /* return NF_ACCEPT to allow forwarding or other NF_xxx on error */ |
601 | static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, |
602 | struct ip_vs_conn *cp) |
603 | { |
604 | int ret = NF_ACCEPT; |
605 | |
606 | skb->ipvs_property = 1; |
607 | if (unlikely(cp->flags & IP_VS_CONN_F_NFCT)) |
608 | ret = ip_vs_confirm_conntrack(skb); |
609 | if (ret == NF_ACCEPT) { |
610 | nf_reset_ct(skb); |
611 | skb_forward_csum(skb); |
612 | if (skb->dev) |
613 | skb_clear_tstamp(skb); |
614 | } |
615 | return ret; |
616 | } |
617 | |
618 | /* In the event of a remote destination, it's possible that we would have |
619 | * matches against an old socket (particularly a TIME-WAIT socket). This |
620 | * causes havoc down the line (ip_local_out et. al. expect regular sockets |
621 | * and invalid memory accesses will happen) so simply drop the association |
622 | * in this case. |
623 | */ |
624 | static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) |
625 | { |
626 | /* If dev is set, the packet came from the LOCAL_IN callback and |
627 | * not from a local TCP socket. |
628 | */ |
629 | if (skb->dev) |
630 | skb_orphan(skb); |
631 | } |
632 | |
633 | /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ |
634 | static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, |
635 | struct ip_vs_conn *cp, int local) |
636 | { |
637 | int ret = NF_STOLEN; |
638 | |
639 | skb->ipvs_property = 1; |
640 | if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) |
641 | ip_vs_notrack(skb); |
642 | else |
643 | ip_vs_update_conntrack(skb, cp, outin: 1); |
644 | |
645 | /* Remove the early_demux association unless it's bound for the |
646 | * exact same port and address on this host after translation. |
647 | */ |
648 | if (!local || cp->vport != cp->dport || |
649 | !ip_vs_addr_equal(af: cp->af, a: &cp->vaddr, b: &cp->daddr)) |
650 | ip_vs_drop_early_demux_sk(skb); |
651 | |
652 | if (!local) { |
653 | skb_forward_csum(skb); |
654 | if (skb->dev) |
655 | skb_clear_tstamp(skb); |
656 | NF_HOOK(pf, hook: NF_INET_LOCAL_OUT, net: cp->ipvs->net, NULL, skb, |
657 | NULL, out: skb_dst(skb)->dev, okfn: dst_output); |
658 | } else |
659 | ret = NF_ACCEPT; |
660 | |
661 | return ret; |
662 | } |
663 | |
664 | /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ |
665 | static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, |
666 | struct ip_vs_conn *cp, int local) |
667 | { |
668 | int ret = NF_STOLEN; |
669 | |
670 | skb->ipvs_property = 1; |
671 | if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) |
672 | ip_vs_notrack(skb); |
673 | if (!local) { |
674 | ip_vs_drop_early_demux_sk(skb); |
675 | skb_forward_csum(skb); |
676 | if (skb->dev) |
677 | skb_clear_tstamp(skb); |
678 | NF_HOOK(pf, hook: NF_INET_LOCAL_OUT, net: cp->ipvs->net, NULL, skb, |
679 | NULL, out: skb_dst(skb)->dev, okfn: dst_output); |
680 | } else |
681 | ret = NF_ACCEPT; |
682 | return ret; |
683 | } |
684 | |
685 | |
686 | /* |
687 | * NULL transmitter (do nothing except return NF_ACCEPT) |
688 | */ |
689 | int |
690 | ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, |
691 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
692 | { |
693 | /* we do not touch skb and do not need pskb ptr */ |
694 | return ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: 1); |
695 | } |
696 | |
697 | |
698 | /* |
699 | * Bypass transmitter |
700 | * Let packets bypass the destination when the destination is not |
701 | * available, it may be only used in transparent cache cluster. |
702 | */ |
703 | int |
704 | ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, |
705 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
706 | { |
707 | struct iphdr *iph = ip_hdr(skb); |
708 | |
709 | if (__ip_vs_get_out_rt(ipvs: cp->ipvs, skb_af: cp->af, skb, NULL, daddr: iph->daddr, |
710 | rt_mode: IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0) |
711 | goto tx_error; |
712 | |
713 | ip_send_check(ip: iph); |
714 | |
715 | /* Another hack: avoid icmp_send in ip_fragment */ |
716 | skb->ignore_df = 1; |
717 | |
718 | ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: 0); |
719 | |
720 | return NF_STOLEN; |
721 | |
722 | tx_error: |
723 | kfree_skb(skb); |
724 | return NF_STOLEN; |
725 | } |
726 | |
727 | #ifdef CONFIG_IP_VS_IPV6 |
728 | int |
729 | ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, |
730 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
731 | { |
732 | struct ipv6hdr *iph = ipv6_hdr(skb); |
733 | |
734 | if (__ip_vs_get_out_rt_v6(ipvs: cp->ipvs, skb_af: cp->af, skb, NULL, |
735 | daddr: &iph->daddr, NULL, |
736 | ipvsh, do_xfrm: 0, rt_mode: IP_VS_RT_MODE_NON_LOCAL) < 0) |
737 | goto tx_error; |
738 | |
739 | /* Another hack: avoid icmp_send in ip_fragment */ |
740 | skb->ignore_df = 1; |
741 | |
742 | ip_vs_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local: 0); |
743 | |
744 | return NF_STOLEN; |
745 | |
746 | tx_error: |
747 | kfree_skb(skb); |
748 | return NF_STOLEN; |
749 | } |
750 | #endif |
751 | |
752 | /* |
753 | * NAT transmitter (only for outside-to-inside nat forwarding) |
754 | * Not used for related ICMP |
755 | */ |
756 | int |
757 | ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, |
758 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
759 | { |
760 | struct rtable *rt; /* Route to the other host */ |
761 | int local, rc, was_input; |
762 | |
763 | /* check if it is a connection of no-client-port */ |
764 | if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { |
765 | __be16 _pt, *p; |
766 | |
767 | p = skb_header_pointer(skb, offset: ipvsh->len, len: sizeof(_pt), buffer: &_pt); |
768 | if (p == NULL) |
769 | goto tx_error; |
770 | ip_vs_conn_fill_cport(cp, cport: *p); |
771 | IP_VS_DBG(10, "filled cport=%d\n" , ntohs(*p)); |
772 | } |
773 | |
774 | was_input = rt_is_input_route(rt: skb_rtable(skb)); |
775 | local = __ip_vs_get_out_rt(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, daddr: cp->daddr.ip, |
776 | rt_mode: IP_VS_RT_MODE_LOCAL | |
777 | IP_VS_RT_MODE_NON_LOCAL | |
778 | IP_VS_RT_MODE_RDR, NULL, ipvsh); |
779 | if (local < 0) |
780 | goto tx_error; |
781 | rt = skb_rtable(skb); |
782 | /* |
783 | * Avoid duplicate tuple in reply direction for NAT traffic |
784 | * to local address when connection is sync-ed |
785 | */ |
786 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
787 | if (cp->flags & IP_VS_CONN_F_SYNC && local) { |
788 | enum ip_conntrack_info ctinfo; |
789 | struct nf_conn *ct = nf_ct_get(skb, ctinfo: &ctinfo); |
790 | |
791 | if (ct) { |
792 | IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off, |
793 | "ip_vs_nat_xmit(): " |
794 | "stopping DNAT to local address" ); |
795 | goto tx_error; |
796 | } |
797 | } |
798 | #endif |
799 | |
800 | /* From world but DNAT to loopback address? */ |
801 | if (local && ipv4_is_loopback(addr: cp->daddr.ip) && was_input) { |
802 | IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off, |
803 | "ip_vs_nat_xmit(): stopping DNAT to loopback " |
804 | "address" ); |
805 | goto tx_error; |
806 | } |
807 | |
808 | /* copy-on-write the packet before mangling it */ |
809 | if (skb_ensure_writable(skb, write_len: sizeof(struct iphdr))) |
810 | goto tx_error; |
811 | |
812 | if (skb_cow(skb, headroom: rt->dst.dev->hard_header_len)) |
813 | goto tx_error; |
814 | |
815 | /* mangle the packet */ |
816 | if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) |
817 | goto tx_error; |
818 | ip_hdr(skb)->daddr = cp->daddr.ip; |
819 | ip_send_check(ip: ip_hdr(skb)); |
820 | |
821 | IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT" ); |
822 | |
823 | /* FIXME: when application helper enlarges the packet and the length |
824 | is larger than the MTU of outgoing device, there will be still |
825 | MTU problem. */ |
826 | |
827 | /* Another hack: avoid icmp_send in ip_fragment */ |
828 | skb->ignore_df = 1; |
829 | |
830 | rc = ip_vs_nat_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local); |
831 | |
832 | return rc; |
833 | |
834 | tx_error: |
835 | kfree_skb(skb); |
836 | return NF_STOLEN; |
837 | } |
838 | |
839 | #ifdef CONFIG_IP_VS_IPV6 |
840 | int |
841 | ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, |
842 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
843 | { |
844 | struct rt6_info *rt; /* Route to the other host */ |
845 | int local, rc; |
846 | |
847 | /* check if it is a connection of no-client-port */ |
848 | if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) { |
849 | __be16 _pt, *p; |
850 | p = skb_header_pointer(skb, offset: ipvsh->len, len: sizeof(_pt), buffer: &_pt); |
851 | if (p == NULL) |
852 | goto tx_error; |
853 | ip_vs_conn_fill_cport(cp, cport: *p); |
854 | IP_VS_DBG(10, "filled cport=%d\n" , ntohs(*p)); |
855 | } |
856 | |
857 | local = __ip_vs_get_out_rt_v6(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, |
858 | daddr: &cp->daddr.in6, |
859 | NULL, ipvsh, do_xfrm: 0, |
860 | rt_mode: IP_VS_RT_MODE_LOCAL | |
861 | IP_VS_RT_MODE_NON_LOCAL | |
862 | IP_VS_RT_MODE_RDR); |
863 | if (local < 0) |
864 | goto tx_error; |
865 | rt = (struct rt6_info *) skb_dst(skb); |
866 | /* |
867 | * Avoid duplicate tuple in reply direction for NAT traffic |
868 | * to local address when connection is sync-ed |
869 | */ |
870 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
871 | if (cp->flags & IP_VS_CONN_F_SYNC && local) { |
872 | enum ip_conntrack_info ctinfo; |
873 | struct nf_conn *ct = nf_ct_get(skb, ctinfo: &ctinfo); |
874 | |
875 | if (ct) { |
876 | IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off, |
877 | "ip_vs_nat_xmit_v6(): " |
878 | "stopping DNAT to local address" ); |
879 | goto tx_error; |
880 | } |
881 | } |
882 | #endif |
883 | |
884 | /* From world but DNAT to loopback address? */ |
885 | if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && |
886 | ipv6_addr_type(addr: &cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { |
887 | IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off, |
888 | "ip_vs_nat_xmit_v6(): " |
889 | "stopping DNAT to loopback address" ); |
890 | goto tx_error; |
891 | } |
892 | |
893 | /* copy-on-write the packet before mangling it */ |
894 | if (skb_ensure_writable(skb, write_len: sizeof(struct ipv6hdr))) |
895 | goto tx_error; |
896 | |
897 | if (skb_cow(skb, headroom: rt->dst.dev->hard_header_len)) |
898 | goto tx_error; |
899 | |
900 | /* mangle the packet */ |
901 | if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh)) |
902 | goto tx_error; |
903 | ipv6_hdr(skb)->daddr = cp->daddr.in6; |
904 | |
905 | IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT" ); |
906 | |
907 | /* FIXME: when application helper enlarges the packet and the length |
908 | is larger than the MTU of outgoing device, there will be still |
909 | MTU problem. */ |
910 | |
911 | /* Another hack: avoid icmp_send in ip_fragment */ |
912 | skb->ignore_df = 1; |
913 | |
914 | rc = ip_vs_nat_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local); |
915 | |
916 | return rc; |
917 | |
918 | tx_error: |
919 | kfree_skb(skb); |
920 | return NF_STOLEN; |
921 | } |
922 | #endif |
923 | |
924 | /* When forwarding a packet, we must ensure that we've got enough headroom |
925 | * for the encapsulation packet in the skb. This also gives us an |
926 | * opportunity to figure out what the payload_len, dsfield, ttl, and df |
927 | * values should be, so that we won't need to look at the old ip header |
928 | * again |
929 | */ |
930 | static struct sk_buff * |
931 | ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, |
932 | unsigned int max_headroom, __u8 *next_protocol, |
933 | __u32 *payload_len, __u8 *dsfield, __u8 *ttl, |
934 | __be16 *df) |
935 | { |
936 | struct sk_buff *new_skb = NULL; |
937 | struct iphdr *old_iph = NULL; |
938 | __u8 old_dsfield; |
939 | #ifdef CONFIG_IP_VS_IPV6 |
940 | struct ipv6hdr *old_ipv6h = NULL; |
941 | #endif |
942 | |
943 | ip_vs_drop_early_demux_sk(skb); |
944 | |
945 | if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { |
946 | new_skb = skb_realloc_headroom(skb, headroom: max_headroom); |
947 | if (!new_skb) |
948 | goto error; |
949 | if (skb->sk) |
950 | skb_set_owner_w(skb: new_skb, sk: skb->sk); |
951 | consume_skb(skb); |
952 | skb = new_skb; |
953 | } |
954 | |
955 | #ifdef CONFIG_IP_VS_IPV6 |
956 | if (skb_af == AF_INET6) { |
957 | old_ipv6h = ipv6_hdr(skb); |
958 | *next_protocol = IPPROTO_IPV6; |
959 | if (payload_len) |
960 | *payload_len = |
961 | ntohs(old_ipv6h->payload_len) + |
962 | sizeof(*old_ipv6h); |
963 | old_dsfield = ipv6_get_dsfield(ipv6h: old_ipv6h); |
964 | *ttl = old_ipv6h->hop_limit; |
965 | if (df) |
966 | *df = 0; |
967 | } else |
968 | #endif |
969 | { |
970 | old_iph = ip_hdr(skb); |
971 | /* Copy DF, reset fragment offset and MF */ |
972 | if (df) |
973 | *df = (old_iph->frag_off & htons(IP_DF)); |
974 | *next_protocol = IPPROTO_IPIP; |
975 | |
976 | /* fix old IP header checksum */ |
977 | ip_send_check(ip: old_iph); |
978 | old_dsfield = ipv4_get_dsfield(iph: old_iph); |
979 | *ttl = old_iph->ttl; |
980 | if (payload_len) |
981 | *payload_len = skb_ip_totlen(skb); |
982 | } |
983 | |
984 | /* Implement full-functionality option for ECN encapsulation */ |
985 | *dsfield = INET_ECN_encapsulate(outer: old_dsfield, inner: old_dsfield); |
986 | |
987 | return skb; |
988 | error: |
989 | kfree_skb(skb); |
990 | return ERR_PTR(error: -ENOMEM); |
991 | } |
992 | |
993 | static inline int __tun_gso_type_mask(int encaps_af, int orig_af) |
994 | { |
995 | switch (encaps_af) { |
996 | case AF_INET: |
997 | return SKB_GSO_IPXIP4; |
998 | case AF_INET6: |
999 | return SKB_GSO_IPXIP6; |
1000 | default: |
1001 | return 0; |
1002 | } |
1003 | } |
1004 | |
1005 | static int |
1006 | ipvs_gue_encap(struct net *net, struct sk_buff *skb, |
1007 | struct ip_vs_conn *cp, __u8 *next_protocol) |
1008 | { |
1009 | __be16 dport; |
1010 | __be16 sport = udp_flow_src_port(net, skb, min: 0, max: 0, use_eth: false); |
1011 | struct udphdr *udph; /* Our new UDP header */ |
1012 | struct guehdr *gueh; /* Our new GUE header */ |
1013 | size_t hdrlen, optlen = 0; |
1014 | void *data; |
1015 | bool need_priv = false; |
1016 | |
1017 | if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && |
1018 | skb->ip_summed == CHECKSUM_PARTIAL) { |
1019 | optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; |
1020 | need_priv = true; |
1021 | } |
1022 | |
1023 | hdrlen = sizeof(struct guehdr) + optlen; |
1024 | |
1025 | skb_push(skb, len: hdrlen); |
1026 | |
1027 | gueh = (struct guehdr *)skb->data; |
1028 | |
1029 | gueh->control = 0; |
1030 | gueh->version = 0; |
1031 | gueh->hlen = optlen >> 2; |
1032 | gueh->flags = 0; |
1033 | gueh->proto_ctype = *next_protocol; |
1034 | |
1035 | data = &gueh[1]; |
1036 | |
1037 | if (need_priv) { |
1038 | __be32 *flags = data; |
1039 | u16 csum_start = skb_checksum_start_offset(skb); |
1040 | __be16 *pd; |
1041 | |
1042 | gueh->flags |= GUE_FLAG_PRIV; |
1043 | *flags = 0; |
1044 | data += GUE_LEN_PRIV; |
1045 | |
1046 | if (csum_start < hdrlen) |
1047 | return -EINVAL; |
1048 | |
1049 | csum_start -= hdrlen; |
1050 | pd = data; |
1051 | pd[0] = htons(csum_start); |
1052 | pd[1] = htons(csum_start + skb->csum_offset); |
1053 | |
1054 | if (!skb_is_gso(skb)) { |
1055 | skb->ip_summed = CHECKSUM_NONE; |
1056 | skb->encapsulation = 0; |
1057 | } |
1058 | |
1059 | *flags |= GUE_PFLAG_REMCSUM; |
1060 | data += GUE_PLEN_REMCSUM; |
1061 | } |
1062 | |
1063 | skb_push(skb, len: sizeof(struct udphdr)); |
1064 | skb_reset_transport_header(skb); |
1065 | |
1066 | udph = udp_hdr(skb); |
1067 | |
1068 | dport = cp->dest->tun_port; |
1069 | udph->dest = dport; |
1070 | udph->source = sport; |
1071 | udph->len = htons(skb->len); |
1072 | udph->check = 0; |
1073 | |
1074 | *next_protocol = IPPROTO_UDP; |
1075 | |
1076 | return 0; |
1077 | } |
1078 | |
1079 | static void |
1080 | ipvs_gre_encap(struct net *net, struct sk_buff *skb, |
1081 | struct ip_vs_conn *cp, __u8 *next_protocol) |
1082 | { |
1083 | __be16 proto = *next_protocol == IPPROTO_IPIP ? |
1084 | htons(ETH_P_IP) : htons(ETH_P_IPV6); |
1085 | __be16 tflags = 0; |
1086 | size_t hdrlen; |
1087 | |
1088 | if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) |
1089 | tflags |= TUNNEL_CSUM; |
1090 | |
1091 | hdrlen = gre_calc_hlen(o_flags: tflags); |
1092 | gre_build_header(skb, hdr_len: hdrlen, flags: tflags, proto, key: 0, seq: 0); |
1093 | |
1094 | *next_protocol = IPPROTO_GRE; |
1095 | } |
1096 | |
1097 | /* |
1098 | * IP Tunneling transmitter |
1099 | * |
1100 | * This function encapsulates the packet in a new IP packet, its |
1101 | * destination will be set to cp->daddr. Most code of this function |
1102 | * is taken from ipip.c. |
1103 | * |
1104 | * It is used in VS/TUN cluster. The load balancer selects a real |
1105 | * server from a cluster based on a scheduling algorithm, |
1106 | * encapsulates the request packet and forwards it to the selected |
1107 | * server. For example, all real servers are configured with |
1108 | * "ifconfig tunl0 <Virtual IP Address> up". When the server receives |
1109 | * the encapsulated packet, it will decapsulate the packet, processe |
1110 | * the request and return the response packets directly to the client |
1111 | * without passing the load balancer. This can greatly increase the |
1112 | * scalability of virtual server. |
1113 | * |
1114 | * Used for ANY protocol |
1115 | */ |
1116 | int |
1117 | ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, |
1118 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1119 | { |
1120 | struct netns_ipvs *ipvs = cp->ipvs; |
1121 | struct net *net = ipvs->net; |
1122 | struct rtable *rt; /* Route to the other host */ |
1123 | __be32 saddr; /* Source for tunnel */ |
1124 | struct net_device *tdev; /* Device to other host */ |
1125 | __u8 next_protocol = 0; |
1126 | __u8 dsfield = 0; |
1127 | __u8 ttl = 0; |
1128 | __be16 df = 0; |
1129 | __be16 *dfp = NULL; |
1130 | struct iphdr *iph; /* Our new IP header */ |
1131 | unsigned int max_headroom; /* The extra header space needed */ |
1132 | int ret, local; |
1133 | int tun_type, gso_type; |
1134 | int tun_flags; |
1135 | |
1136 | local = __ip_vs_get_out_rt(ipvs, skb_af: cp->af, skb, dest: cp->dest, daddr: cp->daddr.ip, |
1137 | rt_mode: IP_VS_RT_MODE_LOCAL | |
1138 | IP_VS_RT_MODE_NON_LOCAL | |
1139 | IP_VS_RT_MODE_CONNECT | |
1140 | IP_VS_RT_MODE_TUNNEL, ret_saddr: &saddr, ipvsh); |
1141 | if (local < 0) |
1142 | goto tx_error; |
1143 | if (local) |
1144 | return ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: 1); |
1145 | |
1146 | rt = skb_rtable(skb); |
1147 | tdev = rt->dst.dev; |
1148 | |
1149 | /* |
1150 | * Okay, now see if we can stuff it in the buffer as-is. |
1151 | */ |
1152 | max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); |
1153 | |
1154 | tun_type = cp->dest->tun_type; |
1155 | tun_flags = cp->dest->tun_flags; |
1156 | |
1157 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1158 | size_t gue_hdrlen, gue_optlen = 0; |
1159 | |
1160 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && |
1161 | skb->ip_summed == CHECKSUM_PARTIAL) { |
1162 | gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; |
1163 | } |
1164 | gue_hdrlen = sizeof(struct guehdr) + gue_optlen; |
1165 | |
1166 | max_headroom += sizeof(struct udphdr) + gue_hdrlen; |
1167 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
1168 | size_t gre_hdrlen; |
1169 | __be16 tflags = 0; |
1170 | |
1171 | if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) |
1172 | tflags |= TUNNEL_CSUM; |
1173 | gre_hdrlen = gre_calc_hlen(o_flags: tflags); |
1174 | |
1175 | max_headroom += gre_hdrlen; |
1176 | } |
1177 | |
1178 | /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */ |
1179 | dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL; |
1180 | skb = ip_vs_prepare_tunneled_skb(skb, skb_af: cp->af, max_headroom, |
1181 | next_protocol: &next_protocol, NULL, dsfield: &dsfield, |
1182 | ttl: &ttl, df: dfp); |
1183 | if (IS_ERR(ptr: skb)) |
1184 | return NF_STOLEN; |
1185 | |
1186 | gso_type = __tun_gso_type_mask(AF_INET, orig_af: cp->af); |
1187 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1188 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || |
1189 | (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) |
1190 | gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; |
1191 | else |
1192 | gso_type |= SKB_GSO_UDP_TUNNEL; |
1193 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && |
1194 | skb->ip_summed == CHECKSUM_PARTIAL) { |
1195 | gso_type |= SKB_GSO_TUNNEL_REMCSUM; |
1196 | } |
1197 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
1198 | if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) |
1199 | gso_type |= SKB_GSO_GRE_CSUM; |
1200 | else |
1201 | gso_type |= SKB_GSO_GRE; |
1202 | } |
1203 | |
1204 | if (iptunnel_handle_offloads(skb, gso_type_mask: gso_type)) |
1205 | goto tx_error; |
1206 | |
1207 | skb->transport_header = skb->network_header; |
1208 | |
1209 | skb_set_inner_ipproto(skb, ipproto: next_protocol); |
1210 | skb_set_inner_mac_header(skb, offset: skb_inner_network_offset(skb)); |
1211 | |
1212 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1213 | bool check = false; |
1214 | |
1215 | if (ipvs_gue_encap(net, skb, cp, next_protocol: &next_protocol)) |
1216 | goto tx_error; |
1217 | |
1218 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || |
1219 | (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) |
1220 | check = true; |
1221 | |
1222 | udp_set_csum(nocheck: !check, skb, saddr, daddr: cp->daddr.ip, len: skb->len); |
1223 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) |
1224 | ipvs_gre_encap(net, skb, cp, next_protocol: &next_protocol); |
1225 | |
1226 | skb_push(skb, len: sizeof(struct iphdr)); |
1227 | skb_reset_network_header(skb); |
1228 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); |
1229 | |
1230 | /* |
1231 | * Push down and install the IPIP header. |
1232 | */ |
1233 | iph = ip_hdr(skb); |
1234 | iph->version = 4; |
1235 | iph->ihl = sizeof(struct iphdr)>>2; |
1236 | iph->frag_off = df; |
1237 | iph->protocol = next_protocol; |
1238 | iph->tos = dsfield; |
1239 | iph->daddr = cp->daddr.ip; |
1240 | iph->saddr = saddr; |
1241 | iph->ttl = ttl; |
1242 | ip_select_ident(net, skb, NULL); |
1243 | |
1244 | /* Another hack: avoid icmp_send in ip_fragment */ |
1245 | skb->ignore_df = 1; |
1246 | |
1247 | ret = ip_vs_tunnel_xmit_prepare(skb, cp); |
1248 | if (ret == NF_ACCEPT) |
1249 | ip_local_out(net, sk: skb->sk, skb); |
1250 | else if (ret == NF_DROP) |
1251 | kfree_skb(skb); |
1252 | |
1253 | return NF_STOLEN; |
1254 | |
1255 | tx_error: |
1256 | kfree_skb(skb); |
1257 | return NF_STOLEN; |
1258 | } |
1259 | |
1260 | #ifdef CONFIG_IP_VS_IPV6 |
1261 | int |
1262 | ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, |
1263 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1264 | { |
1265 | struct netns_ipvs *ipvs = cp->ipvs; |
1266 | struct net *net = ipvs->net; |
1267 | struct rt6_info *rt; /* Route to the other host */ |
1268 | struct in6_addr saddr; /* Source for tunnel */ |
1269 | struct net_device *tdev; /* Device to other host */ |
1270 | __u8 next_protocol = 0; |
1271 | __u32 payload_len = 0; |
1272 | __u8 dsfield = 0; |
1273 | __u8 ttl = 0; |
1274 | struct ipv6hdr *iph; /* Our new IP header */ |
1275 | unsigned int max_headroom; /* The extra header space needed */ |
1276 | int ret, local; |
1277 | int tun_type, gso_type; |
1278 | int tun_flags; |
1279 | |
1280 | local = __ip_vs_get_out_rt_v6(ipvs, skb_af: cp->af, skb, dest: cp->dest, |
1281 | daddr: &cp->daddr.in6, |
1282 | ret_saddr: &saddr, ipvsh, do_xfrm: 1, |
1283 | rt_mode: IP_VS_RT_MODE_LOCAL | |
1284 | IP_VS_RT_MODE_NON_LOCAL | |
1285 | IP_VS_RT_MODE_TUNNEL); |
1286 | if (local < 0) |
1287 | goto tx_error; |
1288 | if (local) |
1289 | return ip_vs_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local: 1); |
1290 | |
1291 | rt = (struct rt6_info *) skb_dst(skb); |
1292 | tdev = rt->dst.dev; |
1293 | |
1294 | /* |
1295 | * Okay, now see if we can stuff it in the buffer as-is. |
1296 | */ |
1297 | max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); |
1298 | |
1299 | tun_type = cp->dest->tun_type; |
1300 | tun_flags = cp->dest->tun_flags; |
1301 | |
1302 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1303 | size_t gue_hdrlen, gue_optlen = 0; |
1304 | |
1305 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && |
1306 | skb->ip_summed == CHECKSUM_PARTIAL) { |
1307 | gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV; |
1308 | } |
1309 | gue_hdrlen = sizeof(struct guehdr) + gue_optlen; |
1310 | |
1311 | max_headroom += sizeof(struct udphdr) + gue_hdrlen; |
1312 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
1313 | size_t gre_hdrlen; |
1314 | __be16 tflags = 0; |
1315 | |
1316 | if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) |
1317 | tflags |= TUNNEL_CSUM; |
1318 | gre_hdrlen = gre_calc_hlen(o_flags: tflags); |
1319 | |
1320 | max_headroom += gre_hdrlen; |
1321 | } |
1322 | |
1323 | skb = ip_vs_prepare_tunneled_skb(skb, skb_af: cp->af, max_headroom, |
1324 | next_protocol: &next_protocol, payload_len: &payload_len, |
1325 | dsfield: &dsfield, ttl: &ttl, NULL); |
1326 | if (IS_ERR(ptr: skb)) |
1327 | return NF_STOLEN; |
1328 | |
1329 | gso_type = __tun_gso_type_mask(AF_INET6, orig_af: cp->af); |
1330 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1331 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || |
1332 | (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) |
1333 | gso_type |= SKB_GSO_UDP_TUNNEL_CSUM; |
1334 | else |
1335 | gso_type |= SKB_GSO_UDP_TUNNEL; |
1336 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) && |
1337 | skb->ip_summed == CHECKSUM_PARTIAL) { |
1338 | gso_type |= SKB_GSO_TUNNEL_REMCSUM; |
1339 | } |
1340 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) { |
1341 | if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) |
1342 | gso_type |= SKB_GSO_GRE_CSUM; |
1343 | else |
1344 | gso_type |= SKB_GSO_GRE; |
1345 | } |
1346 | |
1347 | if (iptunnel_handle_offloads(skb, gso_type_mask: gso_type)) |
1348 | goto tx_error; |
1349 | |
1350 | skb->transport_header = skb->network_header; |
1351 | |
1352 | skb_set_inner_ipproto(skb, ipproto: next_protocol); |
1353 | skb_set_inner_mac_header(skb, offset: skb_inner_network_offset(skb)); |
1354 | |
1355 | if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) { |
1356 | bool check = false; |
1357 | |
1358 | if (ipvs_gue_encap(net, skb, cp, next_protocol: &next_protocol)) |
1359 | goto tx_error; |
1360 | |
1361 | if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) || |
1362 | (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM)) |
1363 | check = true; |
1364 | |
1365 | udp6_set_csum(nocheck: !check, skb, saddr: &saddr, daddr: &cp->daddr.in6, len: skb->len); |
1366 | } else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) |
1367 | ipvs_gre_encap(net, skb, cp, next_protocol: &next_protocol); |
1368 | |
1369 | skb_push(skb, len: sizeof(struct ipv6hdr)); |
1370 | skb_reset_network_header(skb); |
1371 | memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); |
1372 | |
1373 | /* |
1374 | * Push down and install the IPIP header. |
1375 | */ |
1376 | iph = ipv6_hdr(skb); |
1377 | iph->version = 6; |
1378 | iph->nexthdr = next_protocol; |
1379 | iph->payload_len = htons(payload_len); |
1380 | memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); |
1381 | ipv6_change_dsfield(ipv6h: iph, mask: 0, value: dsfield); |
1382 | iph->daddr = cp->daddr.in6; |
1383 | iph->saddr = saddr; |
1384 | iph->hop_limit = ttl; |
1385 | |
1386 | /* Another hack: avoid icmp_send in ip_fragment */ |
1387 | skb->ignore_df = 1; |
1388 | |
1389 | ret = ip_vs_tunnel_xmit_prepare(skb, cp); |
1390 | if (ret == NF_ACCEPT) |
1391 | ip6_local_out(net, sk: skb->sk, skb); |
1392 | else if (ret == NF_DROP) |
1393 | kfree_skb(skb); |
1394 | |
1395 | return NF_STOLEN; |
1396 | |
1397 | tx_error: |
1398 | kfree_skb(skb); |
1399 | return NF_STOLEN; |
1400 | } |
1401 | #endif |
1402 | |
1403 | |
1404 | /* |
1405 | * Direct Routing transmitter |
1406 | * Used for ANY protocol |
1407 | */ |
1408 | int |
1409 | ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, |
1410 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1411 | { |
1412 | int local; |
1413 | |
1414 | local = __ip_vs_get_out_rt(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, daddr: cp->daddr.ip, |
1415 | rt_mode: IP_VS_RT_MODE_LOCAL | |
1416 | IP_VS_RT_MODE_NON_LOCAL | |
1417 | IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh); |
1418 | if (local < 0) |
1419 | goto tx_error; |
1420 | if (local) |
1421 | return ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: 1); |
1422 | |
1423 | ip_send_check(ip: ip_hdr(skb)); |
1424 | |
1425 | /* Another hack: avoid icmp_send in ip_fragment */ |
1426 | skb->ignore_df = 1; |
1427 | |
1428 | ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: 0); |
1429 | |
1430 | return NF_STOLEN; |
1431 | |
1432 | tx_error: |
1433 | kfree_skb(skb); |
1434 | return NF_STOLEN; |
1435 | } |
1436 | |
1437 | #ifdef CONFIG_IP_VS_IPV6 |
1438 | int |
1439 | ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, |
1440 | struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh) |
1441 | { |
1442 | int local; |
1443 | |
1444 | local = __ip_vs_get_out_rt_v6(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, |
1445 | daddr: &cp->daddr.in6, |
1446 | NULL, ipvsh, do_xfrm: 0, |
1447 | rt_mode: IP_VS_RT_MODE_LOCAL | |
1448 | IP_VS_RT_MODE_NON_LOCAL | |
1449 | IP_VS_RT_MODE_KNOWN_NH); |
1450 | if (local < 0) |
1451 | goto tx_error; |
1452 | if (local) |
1453 | return ip_vs_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local: 1); |
1454 | |
1455 | /* Another hack: avoid icmp_send in ip_fragment */ |
1456 | skb->ignore_df = 1; |
1457 | |
1458 | ip_vs_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local: 0); |
1459 | |
1460 | return NF_STOLEN; |
1461 | |
1462 | tx_error: |
1463 | kfree_skb(skb); |
1464 | return NF_STOLEN; |
1465 | } |
1466 | #endif |
1467 | |
1468 | |
1469 | /* |
1470 | * ICMP packet transmitter |
1471 | * called by the ip_vs_in_icmp |
1472 | */ |
1473 | int |
1474 | ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, |
1475 | struct ip_vs_protocol *pp, int offset, unsigned int hooknum, |
1476 | struct ip_vs_iphdr *iph) |
1477 | { |
1478 | struct rtable *rt; /* Route to the other host */ |
1479 | int rc; |
1480 | int local; |
1481 | int rt_mode, was_input; |
1482 | |
1483 | /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be |
1484 | forwarded directly here, because there is no need to |
1485 | translate address/port back */ |
1486 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { |
1487 | if (cp->packet_xmit) |
1488 | rc = cp->packet_xmit(skb, cp, pp, iph); |
1489 | else |
1490 | rc = NF_ACCEPT; |
1491 | /* do not touch skb anymore */ |
1492 | atomic_inc(v: &cp->in_pkts); |
1493 | return rc; |
1494 | } |
1495 | |
1496 | /* |
1497 | * mangle and send the packet here (only for VS/NAT) |
1498 | */ |
1499 | was_input = rt_is_input_route(rt: skb_rtable(skb)); |
1500 | |
1501 | /* LOCALNODE from FORWARD hook is not supported */ |
1502 | rt_mode = (hooknum != NF_INET_FORWARD) ? |
1503 | IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | |
1504 | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; |
1505 | local = __ip_vs_get_out_rt(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, daddr: cp->daddr.ip, rt_mode, |
1506 | NULL, ipvsh: iph); |
1507 | if (local < 0) |
1508 | goto tx_error; |
1509 | rt = skb_rtable(skb); |
1510 | |
1511 | /* |
1512 | * Avoid duplicate tuple in reply direction for NAT traffic |
1513 | * to local address when connection is sync-ed |
1514 | */ |
1515 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
1516 | if (cp->flags & IP_VS_CONN_F_SYNC && local) { |
1517 | enum ip_conntrack_info ctinfo; |
1518 | struct nf_conn *ct = nf_ct_get(skb, ctinfo: &ctinfo); |
1519 | |
1520 | if (ct) { |
1521 | IP_VS_DBG(10, "%s(): " |
1522 | "stopping DNAT to local address %pI4\n" , |
1523 | __func__, &cp->daddr.ip); |
1524 | goto tx_error; |
1525 | } |
1526 | } |
1527 | #endif |
1528 | |
1529 | /* From world but DNAT to loopback address? */ |
1530 | if (local && ipv4_is_loopback(addr: cp->daddr.ip) && was_input) { |
1531 | IP_VS_DBG(1, "%s(): " |
1532 | "stopping DNAT to loopback %pI4\n" , |
1533 | __func__, &cp->daddr.ip); |
1534 | goto tx_error; |
1535 | } |
1536 | |
1537 | /* copy-on-write the packet before mangling it */ |
1538 | if (skb_ensure_writable(skb, write_len: offset)) |
1539 | goto tx_error; |
1540 | |
1541 | if (skb_cow(skb, headroom: rt->dst.dev->hard_header_len)) |
1542 | goto tx_error; |
1543 | |
1544 | ip_vs_nat_icmp(skb, pp, cp, dir: 0); |
1545 | |
1546 | /* Another hack: avoid icmp_send in ip_fragment */ |
1547 | skb->ignore_df = 1; |
1548 | |
1549 | return ip_vs_nat_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local); |
1550 | |
1551 | tx_error: |
1552 | kfree_skb(skb); |
1553 | rc = NF_STOLEN; |
1554 | return rc; |
1555 | } |
1556 | |
1557 | #ifdef CONFIG_IP_VS_IPV6 |
1558 | int |
1559 | ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, |
1560 | struct ip_vs_protocol *pp, int offset, unsigned int hooknum, |
1561 | struct ip_vs_iphdr *ipvsh) |
1562 | { |
1563 | struct rt6_info *rt; /* Route to the other host */ |
1564 | int rc; |
1565 | int local; |
1566 | int rt_mode; |
1567 | |
1568 | /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be |
1569 | forwarded directly here, because there is no need to |
1570 | translate address/port back */ |
1571 | if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { |
1572 | if (cp->packet_xmit) |
1573 | rc = cp->packet_xmit(skb, cp, pp, ipvsh); |
1574 | else |
1575 | rc = NF_ACCEPT; |
1576 | /* do not touch skb anymore */ |
1577 | atomic_inc(v: &cp->in_pkts); |
1578 | return rc; |
1579 | } |
1580 | |
1581 | /* |
1582 | * mangle and send the packet here (only for VS/NAT) |
1583 | */ |
1584 | |
1585 | /* LOCALNODE from FORWARD hook is not supported */ |
1586 | rt_mode = (hooknum != NF_INET_FORWARD) ? |
1587 | IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL | |
1588 | IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL; |
1589 | local = __ip_vs_get_out_rt_v6(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, |
1590 | daddr: &cp->daddr.in6, NULL, ipvsh, do_xfrm: 0, rt_mode); |
1591 | if (local < 0) |
1592 | goto tx_error; |
1593 | rt = (struct rt6_info *) skb_dst(skb); |
1594 | /* |
1595 | * Avoid duplicate tuple in reply direction for NAT traffic |
1596 | * to local address when connection is sync-ed |
1597 | */ |
1598 | #if IS_ENABLED(CONFIG_NF_CONNTRACK) |
1599 | if (cp->flags & IP_VS_CONN_F_SYNC && local) { |
1600 | enum ip_conntrack_info ctinfo; |
1601 | struct nf_conn *ct = nf_ct_get(skb, ctinfo: &ctinfo); |
1602 | |
1603 | if (ct) { |
1604 | IP_VS_DBG(10, "%s(): " |
1605 | "stopping DNAT to local address %pI6\n" , |
1606 | __func__, &cp->daddr.in6); |
1607 | goto tx_error; |
1608 | } |
1609 | } |
1610 | #endif |
1611 | |
1612 | /* From world but DNAT to loopback address? */ |
1613 | if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && |
1614 | ipv6_addr_type(addr: &cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { |
1615 | IP_VS_DBG(1, "%s(): " |
1616 | "stopping DNAT to loopback %pI6\n" , |
1617 | __func__, &cp->daddr.in6); |
1618 | goto tx_error; |
1619 | } |
1620 | |
1621 | /* copy-on-write the packet before mangling it */ |
1622 | if (skb_ensure_writable(skb, write_len: offset)) |
1623 | goto tx_error; |
1624 | |
1625 | if (skb_cow(skb, headroom: rt->dst.dev->hard_header_len)) |
1626 | goto tx_error; |
1627 | |
1628 | ip_vs_nat_icmp_v6(skb, pp, cp, dir: 0); |
1629 | |
1630 | /* Another hack: avoid icmp_send in ip_fragment */ |
1631 | skb->ignore_df = 1; |
1632 | |
1633 | return ip_vs_nat_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local); |
1634 | |
1635 | tx_error: |
1636 | kfree_skb(skb); |
1637 | rc = NF_STOLEN; |
1638 | return rc; |
1639 | } |
1640 | #endif |
1641 | |