ip_vs_xmit.c source code [linux/net/netfilter/ipvs/ip_vs_xmit.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* ip_vs_xmit.c: various packet transmitters for IPVS
4	*
5	* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
6	* Julian Anastasov <ja@ssi.bg>
7	*
8	* Changes:
9	*
10	* Description of forwarding methods:
11	* - all transmitters are called from LOCAL_IN (remote clients) and
12	* LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
13	* - not all connections have destination server, for example,
14	* connections in backup server when fwmark is used
15	* - bypass connections use daddr from packet
16	* - we can use dst without ref while sending in RCU section, we use
17	* ref when returning NF_ACCEPT for NAT-ed packet via loopback
18	* LOCAL_OUT rules:
19	* - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
20	* - skb->pkt_type is not set yet
21	* - the only place where we can see skb->sk != NULL
22	*/
23
24	#define KMSG_COMPONENT "IPVS"
25	#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
26
27	#include <linux/kernel.h>
28	#include <linux/slab.h>
29	#include <linux/tcp.h> /* for tcphdr */
30	#include <net/ip.h>
31	#include <net/gue.h>
32	#include <net/gre.h>
33	#include <net/tcp.h> /* for csum_tcpudp_magic */
34	#include <net/udp.h>
35	#include <net/icmp.h> /* for icmp_send */
36	#include <net/route.h> /* for ip_route_output */
37	#include <net/ipv6.h>
38	#include <net/ip6_route.h>
39	#include <net/ip_tunnels.h>
40	#include <net/ip6_checksum.h>
41	#include <net/addrconf.h>
42	#include <linux/icmpv6.h>
43	#include <linux/netfilter.h>
44	#include <linux/netfilter_ipv4.h>
45
46	#include <net/ip_vs.h>
47
/* Routing mode bits, OR-ed together and passed as rt_mode to the route
 * lookup helpers in this file.
 */
enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
};
58
59	static inline struct ip_vs_dest_dst ip_vs_dest_dst_alloc(void*)
60	{
61	return kmalloc(size: sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
62	}
63
/* Release a destination cache entry with kfree(); only the entry itself
 * is freed here, nothing it points to is touched.
 */
static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}
68
69	/*
70	* Destination cache to speed up outgoing route lookup
71	*/
72	static inline void
73	__ip_vs_dst_set(struct ip_vs_dest dest, struct* ip_vs_dest_dst *dest_dst,
74	struct dst_entry *dst, u32 dst_cookie)
75	{
76	struct ip_vs_dest_dst *old;
77
78	old = rcu_dereference_protected(dest->dest_dst,
79	lockdep_is_held(&dest->dst_lock));
80
81	if (dest_dst) {
82	dest_dst->dst_cache = dst;
83	dest_dst->dst_cookie = dst_cookie;
84	}
85	rcu_assign_pointer(dest->dest_dst, dest_dst);
86
87	if (old)
88	call_rcu(head: &old->rcu_head, func: ip_vs_dest_dst_rcu_free);
89	}
90
91	static inline struct ip_vs_dest_dst *
92	__ip_vs_dst_check(struct ip_vs_dest *dest)
93	{
94	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
95	struct dst_entry *dst;
96
97	if (!dest_dst)
98	return NULL;
99	dst = dest_dst->dst_cache;
100	if (dst->obsolete &&
101	dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
102	return NULL;
103	return dest_dst;
104	}
105
106	static inline bool
107	__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
108	{
109	if (IP6CB(skb)->frag_max_size) {
110	/ frag_max_size tell us that, this packet have been*
111	* defragmented by netfilter IPv6 conntrack module.
112	*/
113	if (IP6CB(skb)->frag_max_size > mtu)
114	return true; / largest fragment violate MTU /
115	}
116	else if (skb->len > mtu && !skb_is_gso(skb)) {
117	return true; / Packet size violate MTU size /
118	}
119	return false;
120	}
121
122	/ Get route to daddr, update saddr, optionally bind route to saddr /*
123	static struct rtable do_output_route4(struct* net *net, __be32 daddr,
124	int rt_mode, __be32 *saddr)
125	{
126	struct flowi4 fl4;
127	struct rtable *rt;
128	bool loop = false;
129
130	memset(&fl4, `0`, sizeof(fl4));
131	fl4.daddr = daddr;
132	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
133	FLOWI_FLAG_KNOWN_NH : `0`;
134
135	retry:
136	rt = ip_route_output_key(net, flp: &fl4);
137	if (IS_ERR(ptr: rt)) {
138	/ Invalid saddr ? /
139	if (PTR_ERR(ptr: rt) == -EINVAL && *saddr &&
140	rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
141	*saddr = `0`;
142	flowi4_update_output(fl4: &fl4, oif: `0`, daddr, saddr: `0`);
143	goto retry;
144	}
145	IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
146	return NULL;
147	} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
148	ip_rt_put(rt);
149	*saddr = fl4.saddr;
150	flowi4_update_output(fl4: &fl4, oif: `0`, daddr, saddr: fl4.saddr);
151	loop = true;
152	goto retry;
153	}
154	*saddr = fl4.saddr;
155	return rt;
156	}
157
#ifdef CONFIG_IP_VS_IPV6
/* Non-zero when the IPv6 route has an output device and that device has
 * IFF_LOOPBACK set.
 */
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	struct net_device *out_dev = rt->dst.dev;

	return out_dev && out_dev->flags & IFF_LOOPBACK;
}
#endif
164
165	static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
166	int rt_mode,
167	bool new_rt_is_local)
168	{
169	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
170	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
171	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
172	bool source_is_loopback;
173	bool old_rt_is_local;
174
175	#ifdef CONFIG_IP_VS_IPV6
176	if (skb_af == AF_INET6) {
177	int addr_type = ipv6_addr_type(addr: &ipv6_hdr(skb)->saddr);
178
179	source_is_loopback =
180	(!skb->dev \|\| skb->dev->flags & IFF_LOOPBACK) &&
181	(addr_type & IPV6_ADDR_LOOPBACK);
182	old_rt_is_local = __ip_vs_is_local_route6(
183	rt: (struct rt6_info *)skb_dst(skb));
184	} else
185	#endif
186	{
187	source_is_loopback = ipv4_is_loopback(addr: ip_hdr(skb)->saddr);
188	old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
189	}
190
191	if (unlikely(new_rt_is_local)) {
192	if (!rt_mode_allow_local)
193	return true;
194	if (!rt_mode_allow_redirect && !old_rt_is_local)
195	return true;
196	} else {
197	if (!rt_mode_allow_non_local)
198	return true;
199	if (source_is_loopback)
200	return true;
201	}
202	return false;
203	}
204
205	static inline void maybe_update_pmtu(int skb_af, struct sk_buff skb, int* mtu)
206	{
207	struct sock *sk = skb->sk;
208	struct rtable *ort = skb_rtable(skb);
209
210	if (!skb->dev && sk && sk_fullsock(sk))
211	ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
212	}
213
214	static inline bool ensure_mtu_is_adequate(struct netns_ipvs ipvs, int* skb_af,
215	int rt_mode,
216	struct ip_vs_iphdr *ipvsh,
217	struct sk_buff skb, int* mtu)
218	{
219	#ifdef CONFIG_IP_VS_IPV6
220	if (skb_af == AF_INET6) {
221	struct net *net = ipvs->net;
222
223	if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
224	if (!skb->dev)
225	skb->dev = net->loopback_dev;
226	/ only send ICMP too big on first fragment /
227	if (!ipvsh->fragoffs && !ip_vs_iph_icmp(iph: ipvsh))
228	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, code: `0`, info: mtu);
229	IP_VS_DBG(`1`, "frag needed for %pI6c\n",
230	&ipv6_hdr(skb)->saddr);
231	return false;
232	}
233	} else
234	#endif
235	{
236	/ If we're going to tunnel the packet and pmtu discovery*
237	* is disabled, we'll just fragment it anyway
238	*/
239	if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
240	return true;
241
242	if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
243	skb->len > mtu && !skb_is_gso(skb) &&
244	!ip_vs_iph_icmp(ipvsh))) {
245	icmp_send(skb_in: skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
246	htonl(mtu));
247	IP_VS_DBG(`1`, "frag needed for %pI4\n",
248	&ip_hdr(skb)->saddr);
249	return false;
250	}
251	}
252
253	return true;
254	}
255
256	static inline bool decrement_ttl(struct netns_ipvs *ipvs,
257	int skb_af,
258	struct sk_buff *skb)
259	{
260	struct net *net = ipvs->net;
261
262	#ifdef CONFIG_IP_VS_IPV6
263	if (skb_af == AF_INET6) {
264	struct dst_entry *dst = skb_dst(skb);
265
266	/ check and decrement ttl /
267	if (ipv6_hdr(skb)->hop_limit <= `1`) {
268	struct inet6_dev *idev = __in6_dev_get_safely(dev: skb->dev);
269
270	/ Force OUTPUT device used as source address /
271	skb->dev = dst->dev;
272	icmpv6_send(skb, ICMPV6_TIME_EXCEED,
273	ICMPV6_EXC_HOPLIMIT, info: `0`);
274	IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
275
276	return false;
277	}
278
279	/ don't propagate ttl change to cloned packets /
280	if (skb_ensure_writable(skb, write_len: sizeof(struct ipv6hdr)))
281	return false;
282
283	ipv6_hdr(skb)->hop_limit--;
284	} else
285	#endif
286	{
287	if (ip_hdr(skb)->ttl <= `1`) {
288	/ Tell the sender its packet died... /
289	IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
290	icmp_send(skb_in: skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, info: `0`);
291	return false;
292	}
293
294	/ don't propagate ttl change to cloned packets /
295	if (skb_ensure_writable(skb, write_len: sizeof(struct iphdr)))
296	return false;
297
298	/ Decrease ttl /
299	ip_decrease_ttl(iph: ip_hdr(skb));
300	}
301
302	return true;
303	}
304
305	/ Get route to destination or remote server /
306	static int
307	__ip_vs_get_out_rt(struct netns_ipvs ipvs, int* skb_af, struct sk_buff *skb,
308	struct ip_vs_dest *dest,
309	__be32 daddr, int rt_mode, __be32 *ret_saddr,
310	struct ip_vs_iphdr *ipvsh)
311	{
312	struct net *net = ipvs->net;
313	struct ip_vs_dest_dst *dest_dst;
314	struct rtable rt; /* Route to the other host /
315	int mtu;
316	int local, noref = `1`;
317
318	if (dest) {
319	dest_dst = __ip_vs_dst_check(dest);
320	if (likely(dest_dst))
321	rt = (struct rtable *) dest_dst->dst_cache;
322	else {
323	dest_dst = ip_vs_dest_dst_alloc();
324	spin_lock_bh(lock: &dest->dst_lock);
325	if (!dest_dst) {
326	__ip_vs_dst_set(dest, NULL, NULL, dst_cookie: `0`);
327	spin_unlock_bh(lock: &dest->dst_lock);
328	goto err_unreach;
329	}
330	rt = do_output_route4(net, daddr: dest->addr.ip, rt_mode,
331	saddr: &dest_dst->dst_saddr.ip);
332	if (!rt) {
333	__ip_vs_dst_set(dest, NULL, NULL, dst_cookie: `0`);
334	spin_unlock_bh(lock: &dest->dst_lock);
335	ip_vs_dest_dst_free(dest_dst);
336	goto err_unreach;
337	}
338	__ip_vs_dst_set(dest, dest_dst, dst: &rt->dst, dst_cookie: `0`);
339	spin_unlock_bh(lock: &dest->dst_lock);
340	IP_VS_DBG(`10`, "new dst %pI4, src %pI4, refcnt=%d\n",
341	&dest->addr.ip, &dest_dst->dst_saddr.ip,
342	rcuref_read(&rt->dst.__rcuref));
343	}
344	if (ret_saddr)
345	*ret_saddr = dest_dst->dst_saddr.ip;
346	} else {
347	__be32 saddr = htonl(INADDR_ANY);
348
349	noref = `0`;
350
351	/ For such unconfigured boxes avoid many route lookups*
352	* for performance reasons because we do not remember saddr
353	*/
354	rt_mode &= ~IP_VS_RT_MODE_CONNECT;
355	rt = do_output_route4(net, daddr, rt_mode, saddr: &saddr);
356	if (!rt)
357	goto err_unreach;
358	if (ret_saddr)
359	*ret_saddr = saddr;
360	}
361
362	local = (rt->rt_flags & RTCF_LOCAL) ? `1` : `0`;
363	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
364	local))) {
365	IP_VS_DBG_RL("We are crossing local and non-local addresses"
366	" daddr=%pI4\n", &daddr);
367	goto err_put;
368	}
369
370	if (unlikely(local)) {
371	/ skb to local stack, preserve old route /
372	if (!noref)
373	ip_rt_put(rt);
374	return local;
375	}
376
377	if (!decrement_ttl(ipvs, skb_af, skb))
378	goto err_put;
379
380	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
381	mtu = dst_mtu(dst: &rt->dst);
382	} else {
383	mtu = dst_mtu(dst: &rt->dst) - sizeof(struct iphdr);
384	if (!dest)
385	goto err_put;
386	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
387	mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
388	if ((dest->tun_flags &
389	IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
390	skb->ip_summed == CHECKSUM_PARTIAL)
391	mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
392	} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
393	__be16 tflags = `0`;
394
395	if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
396	tflags \|= TUNNEL_CSUM;
397	mtu -= gre_calc_hlen(o_flags: tflags);
398	}
399	if (mtu < `68`) {
400	IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
401	goto err_put;
402	}
403	maybe_update_pmtu(skb_af, skb, mtu);
404	}
405
406	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
407	goto err_put;
408
409	skb_dst_drop(skb);
410	if (noref)
411	skb_dst_set_noref(skb, dst: &rt->dst);
412	else
413	skb_dst_set(skb, dst: &rt->dst);
414
415	return local;
416
417	err_put:
418	if (!noref)
419	ip_rt_put(rt);
420	return -`1`;
421
422	err_unreach:
423	dst_link_failure(skb);
424	return -`1`;
425	}
426
427	#ifdef CONFIG_IP_VS_IPV6
428	static struct dst_entry *
429	__ip_vs_route_output_v6(struct net net, struct* in6_addr *daddr,
430	struct in6_addr ret_saddr, int* do_xfrm, int rt_mode)
431	{
432	struct dst_entry *dst;
433	struct flowi6 fl6 = {
434	.daddr = *daddr,
435	};
436
437	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
438	fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
439
440	dst = ip6_route_output(net, NULL, fl6: &fl6);
441	if (dst->error)
442	goto out_err;
443	if (!ret_saddr)
444	return dst;
445	if (ipv6_addr_any(a: &fl6.saddr) &&
446	ipv6_dev_get_saddr(net, dev: ip6_dst_idev(dst)->dev,
447	daddr: &fl6.daddr, srcprefs: `0`, saddr: &fl6.saddr) < `0`)
448	goto out_err;
449	if (do_xfrm) {
450	dst = xfrm_lookup(net, dst_orig: dst, fl: flowi6_to_flowi(fl6: &fl6), NULL, flags: `0`);
451	if (IS_ERR(ptr: dst)) {
452	dst = NULL;
453	goto out_err;
454	}
455	}
456	*ret_saddr = fl6.saddr;
457	return dst;
458
459	out_err:
460	dst_release(dst);
461	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
462	return NULL;
463	}
464
465	/*
466	* Get route to destination or remote server
467	*/
468	static int
469	__ip_vs_get_out_rt_v6(struct netns_ipvs ipvs, int* skb_af, struct sk_buff *skb,
470	struct ip_vs_dest *dest,
471	struct in6_addr daddr, struct* in6_addr *ret_saddr,
472	struct ip_vs_iphdr ipvsh, int* do_xfrm, int rt_mode)
473	{
474	struct net *net = ipvs->net;
475	struct ip_vs_dest_dst *dest_dst;
476	struct rt6_info rt; /* Route to the other host /
477	struct dst_entry *dst;
478	int mtu;
479	int local, noref = `1`;
480
481	if (dest) {
482	dest_dst = __ip_vs_dst_check(dest);
483	if (likely(dest_dst))
484	rt = (struct rt6_info *) dest_dst->dst_cache;
485	else {
486	u32 cookie;
487
488	dest_dst = ip_vs_dest_dst_alloc();
489	spin_lock_bh(lock: &dest->dst_lock);
490	if (!dest_dst) {
491	__ip_vs_dst_set(dest, NULL, NULL, dst_cookie: `0`);
492	spin_unlock_bh(lock: &dest->dst_lock);
493	goto err_unreach;
494	}
495	dst = __ip_vs_route_output_v6(net, daddr: &dest->addr.in6,
496	ret_saddr: &dest_dst->dst_saddr.in6,
497	do_xfrm, rt_mode);
498	if (!dst) {
499	__ip_vs_dst_set(dest, NULL, NULL, dst_cookie: `0`);
500	spin_unlock_bh(lock: &dest->dst_lock);
501	ip_vs_dest_dst_free(dest_dst);
502	goto err_unreach;
503	}
504	rt = (struct rt6_info *) dst;
505	cookie = rt6_get_cookie(rt);
506	__ip_vs_dst_set(dest, dest_dst, dst: &rt->dst, dst_cookie: cookie);
507	spin_unlock_bh(lock: &dest->dst_lock);
508	IP_VS_DBG(`10`, "new dst %pI6, src %pI6, refcnt=%d\n",
509	&dest->addr.in6, &dest_dst->dst_saddr.in6,
510	rcuref_read(&rt->dst.__rcuref));
511	}
512	if (ret_saddr)
513	*ret_saddr = dest_dst->dst_saddr.in6;
514	} else {
515	noref = `0`;
516	dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
517	rt_mode);
518	if (!dst)
519	goto err_unreach;
520	rt = (struct rt6_info *) dst;
521	}
522
523	local = __ip_vs_is_local_route6(rt);
524
525	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
526	local))) {
527	IP_VS_DBG_RL("We are crossing local and non-local addresses"
528	" daddr=%pI6\n", daddr);
529	goto err_put;
530	}
531
532	if (unlikely(local)) {
533	/ skb to local stack, preserve old route /
534	if (!noref)
535	dst_release(dst: &rt->dst);
536	return local;
537	}
538
539	if (!decrement_ttl(ipvs, skb_af, skb))
540	goto err_put;
541
542	/ MTU checking /
543	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
544	mtu = dst_mtu(dst: &rt->dst);
545	else {
546	mtu = dst_mtu(dst: &rt->dst) - sizeof(struct ipv6hdr);
547	if (!dest)
548	goto err_put;
549	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
550	mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
551	if ((dest->tun_flags &
552	IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
553	skb->ip_summed == CHECKSUM_PARTIAL)
554	mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
555	} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
556	__be16 tflags = `0`;
557
558	if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
559	tflags \|= TUNNEL_CSUM;
560	mtu -= gre_calc_hlen(o_flags: tflags);
561	}
562	if (mtu < IPV6_MIN_MTU) {
563	IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
564	IPV6_MIN_MTU);
565	goto err_put;
566	}
567	maybe_update_pmtu(skb_af, skb, mtu);
568	}
569
570	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
571	goto err_put;
572
573	skb_dst_drop(skb);
574	if (noref)
575	skb_dst_set_noref(skb, dst: &rt->dst);
576	else
577	skb_dst_set(skb, dst: &rt->dst);
578
579	return local;
580
581	err_put:
582	if (!noref)
583	dst_release(dst: &rt->dst);
584	return -`1`;
585
586	err_unreach:
587	/ The ip6_link_failure function requires the dev field to be set*
588	* in order to get the net (further for the sake of fwmark
589	* reflection).
590	*/
591	if (!skb->dev)
592	skb->dev = skb_dst(skb)->dev;
593
594	dst_link_failure(skb);
595	return -`1`;
596	}
597	#endif
598
599
600	/ return NF_ACCEPT to allow forwarding or other NF_xxx on error /
601	static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
602	struct ip_vs_conn *cp)
603	{
604	int ret = NF_ACCEPT;
605
606	skb->ipvs_property = `1`;
607	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
608	ret = ip_vs_confirm_conntrack(skb);
609	if (ret == NF_ACCEPT) {
610	nf_reset_ct(skb);
611	skb_forward_csum(skb);
612	if (skb->dev)
613	skb_clear_tstamp(skb);
614	}
615	return ret;
616	}
617
618	/ In the event of a remote destination, it's possible that we would have*
619	* matches against an old socket (particularly a TIME-WAIT socket). This
620	* causes havoc down the line (ip_local_out et. al. expect regular sockets
621	* and invalid memory accesses will happen) so simply drop the association
622	* in this case.
623	*/
624	static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
625	{
626	/ If dev is set, the packet came from the LOCAL_IN callback and*
627	* not from a local TCP socket.
628	*/
629	if (skb->dev)
630	skb_orphan(skb);
631	}
632
633	/ return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) /
634	static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
635	struct ip_vs_conn cp, int* local)
636	{
637	int ret = NF_STOLEN;
638
639	skb->ipvs_property = `1`;
640	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
641	ip_vs_notrack(skb);
642	else
643	ip_vs_update_conntrack(skb, cp, outin: `1`);
644
645	/ Remove the early_demux association unless it's bound for the*
646	* exact same port and address on this host after translation.
647	*/
648	if (!local \|\| cp->vport != cp->dport \|\|
649	!ip_vs_addr_equal(af: cp->af, a: &cp->vaddr, b: &cp->daddr))
650	ip_vs_drop_early_demux_sk(skb);
651
652	if (!local) {
653	skb_forward_csum(skb);
654	if (skb->dev)
655	skb_clear_tstamp(skb);
656	NF_HOOK(pf, hook: NF_INET_LOCAL_OUT, net: cp->ipvs->net, NULL, skb,
657	NULL, out: skb_dst(skb)->dev, okfn: dst_output);
658	} else
659	ret = NF_ACCEPT;
660
661	return ret;
662	}
663
664	/ return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) /
665	static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
666	struct ip_vs_conn cp, int* local)
667	{
668	int ret = NF_STOLEN;
669
670	skb->ipvs_property = `1`;
671	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
672	ip_vs_notrack(skb);
673	if (!local) {
674	ip_vs_drop_early_demux_sk(skb);
675	skb_forward_csum(skb);
676	if (skb->dev)
677	skb_clear_tstamp(skb);
678	NF_HOOK(pf, hook: NF_INET_LOCAL_OUT, net: cp->ipvs->net, NULL, skb,
679	NULL, out: skb_dst(skb)->dev, okfn: dst_output);
680	} else
681	ret = NF_ACCEPT;
682	return ret;
683	}
684
685
686	/*
687	* NULL transmitter (do nothing except return NF_ACCEPT)
688	*/
689	int
690	ip_vs_null_xmit(struct sk_buff skb, struct* ip_vs_conn *cp,
691	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
692	{
693	/ we do not touch skb and do not need pskb ptr /
694	return ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: `1`);
695	}
696
697
698	/*
699	* Bypass transmitter
700	* Let packets bypass the destination when the destination is not
701	* available, it may be only used in transparent cache cluster.
702	*/
703	int
704	ip_vs_bypass_xmit(struct sk_buff skb, struct* ip_vs_conn *cp,
705	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
706	{
707	struct iphdr *iph = ip_hdr(skb);
708
709	if (__ip_vs_get_out_rt(ipvs: cp->ipvs, skb_af: cp->af, skb, NULL, daddr: iph->daddr,
710	rt_mode: IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < `0`)
711	goto tx_error;
712
713	ip_send_check(ip: iph);
714
715	/ Another hack: avoid icmp_send in ip_fragment /
716	skb->ignore_df = `1`;
717
718	ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: `0`);
719
720	return NF_STOLEN;
721
722	tx_error:
723	kfree_skb(skb);
724	return NF_STOLEN;
725	}
726
727	#ifdef CONFIG_IP_VS_IPV6
728	int
729	ip_vs_bypass_xmit_v6(struct sk_buff skb, struct* ip_vs_conn *cp,
730	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
731	{
732	struct ipv6hdr *iph = ipv6_hdr(skb);
733
734	if (__ip_vs_get_out_rt_v6(ipvs: cp->ipvs, skb_af: cp->af, skb, NULL,
735	daddr: &iph->daddr, NULL,
736	ipvsh, do_xfrm: `0`, rt_mode: IP_VS_RT_MODE_NON_LOCAL) < `0`)
737	goto tx_error;
738
739	/ Another hack: avoid icmp_send in ip_fragment /
740	skb->ignore_df = `1`;
741
742	ip_vs_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local: `0`);
743
744	return NF_STOLEN;
745
746	tx_error:
747	kfree_skb(skb);
748	return NF_STOLEN;
749	}
750	#endif
751
752	/*
753	* NAT transmitter (only for outside-to-inside nat forwarding)
754	* Not used for related ICMP
755	*/
756	int
757	ip_vs_nat_xmit(struct sk_buff skb, struct* ip_vs_conn *cp,
758	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
759	{
760	struct rtable rt; /* Route to the other host /
761	int local, rc, was_input;
762
763	/ check if it is a connection of no-client-port /
764	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
765	__be16 _pt, *p;
766
767	p = skb_header_pointer(skb, offset: ipvsh->len, len: sizeof(_pt), buffer: &_pt);
768	if (p == NULL)
769	goto tx_error;
770	ip_vs_conn_fill_cport(cp, cport: *p);
771	IP_VS_DBG(`10`, "filled cport=%d\n", ntohs(*p));
772	}
773
774	was_input = rt_is_input_route(rt: skb_rtable(skb));
775	local = __ip_vs_get_out_rt(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, daddr: cp->daddr.ip,
776	rt_mode: IP_VS_RT_MODE_LOCAL \|
777	IP_VS_RT_MODE_NON_LOCAL \|
778	IP_VS_RT_MODE_RDR, NULL, ipvsh);
779	if (local < `0`)
780	goto tx_error;
781	rt = skb_rtable(skb);
782	/*
783	* Avoid duplicate tuple in reply direction for NAT traffic
784	* to local address when connection is sync-ed
785	*/
786	#if IS_ENABLED(CONFIG_NF_CONNTRACK)
787	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
788	enum ip_conntrack_info ctinfo;
789	struct nf_conn *ct = nf_ct_get(skb, ctinfo: &ctinfo);
790
791	if (ct) {
792	IP_VS_DBG_RL_PKT(`10`, AF_INET, pp, skb, ipvsh->off,
793	"ip_vs_nat_xmit(): "
794	"stopping DNAT to local address");
795	goto tx_error;
796	}
797	}
798	#endif
799
800	/ From world but DNAT to loopback address? /
801	if (local && ipv4_is_loopback(addr: cp->daddr.ip) && was_input) {
802	IP_VS_DBG_RL_PKT(`1`, AF_INET, pp, skb, ipvsh->off,
803	"ip_vs_nat_xmit(): stopping DNAT to loopback "
804	"address");
805	goto tx_error;
806	}
807
808	/ copy-on-write the packet before mangling it /
809	if (skb_ensure_writable(skb, write_len: sizeof(struct iphdr)))
810	goto tx_error;
811
812	if (skb_cow(skb, headroom: rt->dst.dev->hard_header_len))
813	goto tx_error;
814
815	/ mangle the packet /
816	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
817	goto tx_error;
818	ip_hdr(skb)->daddr = cp->daddr.ip;
819	ip_send_check(ip: ip_hdr(skb));
820
821	IP_VS_DBG_PKT(`10`, AF_INET, pp, skb, ipvsh->off, "After DNAT");
822
823	/ FIXME: when application helper enlarges the packet and the length*
824	is larger than the MTU of outgoing device, there will be still
825	MTU problem. /*
826
827	/ Another hack: avoid icmp_send in ip_fragment /
828	skb->ignore_df = `1`;
829
830	rc = ip_vs_nat_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local);
831
832	return rc;
833
834	tx_error:
835	kfree_skb(skb);
836	return NF_STOLEN;
837	}
838
839	#ifdef CONFIG_IP_VS_IPV6
840	int
841	ip_vs_nat_xmit_v6(struct sk_buff skb, struct* ip_vs_conn *cp,
842	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
843	{
844	struct rt6_info rt; /* Route to the other host /
845	int local, rc;
846
847	/ check if it is a connection of no-client-port /
848	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
849	__be16 _pt, *p;
850	p = skb_header_pointer(skb, offset: ipvsh->len, len: sizeof(_pt), buffer: &_pt);
851	if (p == NULL)
852	goto tx_error;
853	ip_vs_conn_fill_cport(cp, cport: *p);
854	IP_VS_DBG(`10`, "filled cport=%d\n", ntohs(*p));
855	}
856
857	local = __ip_vs_get_out_rt_v6(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest,
858	daddr: &cp->daddr.in6,
859	NULL, ipvsh, do_xfrm: `0`,
860	rt_mode: IP_VS_RT_MODE_LOCAL \|
861	IP_VS_RT_MODE_NON_LOCAL \|
862	IP_VS_RT_MODE_RDR);
863	if (local < `0`)
864	goto tx_error;
865	rt = (struct rt6_info *) skb_dst(skb);
866	/*
867	* Avoid duplicate tuple in reply direction for NAT traffic
868	* to local address when connection is sync-ed
869	*/
870	#if IS_ENABLED(CONFIG_NF_CONNTRACK)
871	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
872	enum ip_conntrack_info ctinfo;
873	struct nf_conn *ct = nf_ct_get(skb, ctinfo: &ctinfo);
874
875	if (ct) {
876	IP_VS_DBG_RL_PKT(`10`, AF_INET6, pp, skb, ipvsh->off,
877	"ip_vs_nat_xmit_v6(): "
878	"stopping DNAT to local address");
879	goto tx_error;
880	}
881	}
882	#endif
883
884	/ From world but DNAT to loopback address? /
885	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
886	ipv6_addr_type(addr: &cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
887	IP_VS_DBG_RL_PKT(`1`, AF_INET6, pp, skb, ipvsh->off,
888	"ip_vs_nat_xmit_v6(): "
889	"stopping DNAT to loopback address");
890	goto tx_error;
891	}
892
893	/ copy-on-write the packet before mangling it /
894	if (skb_ensure_writable(skb, write_len: sizeof(struct ipv6hdr)))
895	goto tx_error;
896
897	if (skb_cow(skb, headroom: rt->dst.dev->hard_header_len))
898	goto tx_error;
899
900	/ mangle the packet /
901	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
902	goto tx_error;
903	ipv6_hdr(skb)->daddr = cp->daddr.in6;
904
905	IP_VS_DBG_PKT(`10`, AF_INET6, pp, skb, ipvsh->off, "After DNAT");
906
907	/ FIXME: when application helper enlarges the packet and the length*
908	is larger than the MTU of outgoing device, there will be still
909	MTU problem. /*
910
911	/ Another hack: avoid icmp_send in ip_fragment /
912	skb->ignore_df = `1`;
913
914	rc = ip_vs_nat_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local);
915
916	return rc;
917
918	tx_error:
919	kfree_skb(skb);
920	return NF_STOLEN;
921	}
922	#endif
923
924	/ When forwarding a packet, we must ensure that we've got enough headroom*
925	* for the encapsulation packet in the skb. This also gives us an
926	* opportunity to figure out what the payload_len, dsfield, ttl, and df
927	* values should be, so that we won't need to look at the old ip header
928	* again
929	*/
930	static struct sk_buff *
931	ip_vs_prepare_tunneled_skb(struct sk_buff skb, int* skb_af,
932	unsigned int max_headroom, __u8 *next_protocol,
933	__u32 payload_len, __u8 dsfield, __u8 *ttl,
934	__be16 *df)
935	{
936	struct sk_buff *new_skb = NULL;
937	struct iphdr *old_iph = NULL;
938	__u8 old_dsfield;
939	#ifdef CONFIG_IP_VS_IPV6
940	struct ipv6hdr *old_ipv6h = NULL;
941	#endif
942
943	ip_vs_drop_early_demux_sk(skb);
944
945	if (skb_headroom(skb) < max_headroom \|\| skb_cloned(skb)) {
946	new_skb = skb_realloc_headroom(skb, headroom: max_headroom);
947	if (!new_skb)
948	goto error;
949	if (skb->sk)
950	skb_set_owner_w(skb: new_skb, sk: skb->sk);
951	consume_skb(skb);
952	skb = new_skb;
953	}
954
955	#ifdef CONFIG_IP_VS_IPV6
956	if (skb_af == AF_INET6) {
957	old_ipv6h = ipv6_hdr(skb);
958	*next_protocol = IPPROTO_IPV6;
959	if (payload_len)
960	*payload_len =
961	ntohs(old_ipv6h->payload_len) +
962	sizeof(*old_ipv6h);
963	old_dsfield = ipv6_get_dsfield(ipv6h: old_ipv6h);
964	*ttl = old_ipv6h->hop_limit;
965	if (df)
966	*df = `0`;
967	} else
968	#endif
969	{
970	old_iph = ip_hdr(skb);
971	/ Copy DF, reset fragment offset and MF /
972	if (df)
973	*df = (old_iph->frag_off & htons(IP_DF));
974	*next_protocol = IPPROTO_IPIP;
975
976	/ fix old IP header checksum /
977	ip_send_check(ip: old_iph);
978	old_dsfield = ipv4_get_dsfield(iph: old_iph);
979	*ttl = old_iph->ttl;
980	if (payload_len)
981	*payload_len = skb_ip_totlen(skb);
982	}
983
984	/ Implement full-functionality option for ECN encapsulation /
985	*dsfield = INET_ECN_encapsulate(outer: old_dsfield, inner: old_dsfield);
986
987	return skb;
988	error:
989	kfree_skb(skb);
990	return ERR_PTR(error: -ENOMEM);
991	}
992
993	static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
994	{
995	switch (encaps_af) {
996	case AF_INET:
997	return SKB_GSO_IPXIP4;
998	case AF_INET6:
999	return SKB_GSO_IPXIP6;
1000	default:
1001	return `0`;
1002	}
1003	}
1004
1005	static int
1006	ipvs_gue_encap(struct net net, struct* sk_buff *skb,
1007	struct ip_vs_conn cp, __u8 next_protocol)
1008	{
1009	__be16 dport;
1010	__be16 sport = udp_flow_src_port(net, skb, min: `0`, max: `0`, use_eth: false);
1011	struct udphdr udph; /* Our new UDP header /
1012	struct guehdr gueh; /* Our new GUE header /
1013	size_t hdrlen, optlen = `0`;
1014	void *data;
1015	bool need_priv = false;
1016
1017	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1018	skb->ip_summed == CHECKSUM_PARTIAL) {
1019	optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1020	need_priv = true;
1021	}
1022
1023	hdrlen = sizeof(struct guehdr) + optlen;
1024
1025	skb_push(skb, len: hdrlen);
1026
1027	gueh = (struct guehdr *)skb->data;
1028
1029	gueh->control = `0`;
1030	gueh->version = `0`;
1031	gueh->hlen = optlen >> `2`;
1032	gueh->flags = `0`;
1033	gueh->proto_ctype = *next_protocol;
1034
1035	data = &gueh[`1`];
1036
1037	if (need_priv) {
1038	__be32 *flags = data;
1039	u16 csum_start = skb_checksum_start_offset(skb);
1040	__be16 *pd;
1041
1042	gueh->flags \|= GUE_FLAG_PRIV;
1043	*flags = `0`;
1044	data += GUE_LEN_PRIV;
1045
1046	if (csum_start < hdrlen)
1047	return -EINVAL;
1048
1049	csum_start -= hdrlen;
1050	pd = data;
1051	pd[`0`] = htons(csum_start);
1052	pd[`1`] = htons(csum_start + skb->csum_offset);
1053
1054	if (!skb_is_gso(skb)) {
1055	skb->ip_summed = CHECKSUM_NONE;
1056	skb->encapsulation = `0`;
1057	}
1058
1059	*flags \|= GUE_PFLAG_REMCSUM;
1060	data += GUE_PLEN_REMCSUM;
1061	}
1062
1063	skb_push(skb, len: sizeof(struct udphdr));
1064	skb_reset_transport_header(skb);
1065
1066	udph = udp_hdr(skb);
1067
1068	dport = cp->dest->tun_port;
1069	udph->dest = dport;
1070	udph->source = sport;
1071	udph->len = htons(skb->len);
1072	udph->check = `0`;
1073
1074	*next_protocol = IPPROTO_UDP;
1075
1076	return `0`;
1077	}
1078
1079	static void
1080	ipvs_gre_encap(struct net net, struct* sk_buff *skb,
1081	struct ip_vs_conn cp, __u8 next_protocol)
1082	{
1083	__be16 proto = *next_protocol == IPPROTO_IPIP ?
1084	htons(ETH_P_IP) : htons(ETH_P_IPV6);
1085	__be16 tflags = `0`;
1086	size_t hdrlen;
1087
1088	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1089	tflags \|= TUNNEL_CSUM;
1090
1091	hdrlen = gre_calc_hlen(o_flags: tflags);
1092	gre_build_header(skb, hdr_len: hdrlen, flags: tflags, proto, key: `0`, seq: `0`);
1093
1094	*next_protocol = IPPROTO_GRE;
1095	}
1096
1097	/*
1098	* IP Tunneling transmitter
1099	*
1100	* This function encapsulates the packet in a new IP packet, its
1101	* destination will be set to cp->daddr. Most code of this function
1102	* is taken from ipip.c.
1103	*
1104	* It is used in VS/TUN cluster. The load balancer selects a real
1105	* server from a cluster based on a scheduling algorithm,
1106	* encapsulates the request packet and forwards it to the selected
1107	* server. For example, all real servers are configured with
1108	* "ifconfig tunl0 <Virtual IP Address> up". When the server receives
1109	* the encapsulated packet, it will decapsulate the packet, process
1110	* the request and return the response packets directly to the client
1111	* without passing the load balancer. This can greatly increase the
1112	* scalability of virtual server.
1113	*
1114	* Used for ANY protocol
1115	*/
1116	int
1117	ip_vs_tunnel_xmit(struct sk_buff skb, struct* ip_vs_conn *cp,
1118	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
1119	{
1120	struct netns_ipvs *ipvs = cp->ipvs;
1121	struct net *net = ipvs->net;
1122	struct rtable rt; /* Route to the other host /
1123	__be32 saddr; / Source for tunnel /
1124	struct net_device tdev; /* Device to other host /
1125	__u8 next_protocol = `0`;
1126	__u8 dsfield = `0`;
1127	__u8 ttl = `0`;
1128	__be16 df = `0`;
1129	__be16 *dfp = NULL;
1130	struct iphdr iph; /* Our new IP header /
1131	unsigned int max_headroom; / The extra header space needed /
1132	int ret, local;
1133	int tun_type, gso_type;
1134	int tun_flags;
1135
1136	local = __ip_vs_get_out_rt(ipvs, skb_af: cp->af, skb, dest: cp->dest, daddr: cp->daddr.ip,
1137	rt_mode: IP_VS_RT_MODE_LOCAL \|
1138	IP_VS_RT_MODE_NON_LOCAL \|
1139	IP_VS_RT_MODE_CONNECT \|
1140	IP_VS_RT_MODE_TUNNEL, ret_saddr: &saddr, ipvsh);
1141	if (local < `0`)
1142	goto tx_error;
1143	if (local)
1144	return ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: `1`);
1145
1146	rt = skb_rtable(skb);
1147	tdev = rt->dst.dev;
1148
1149	/*
1150	* Okay, now see if we can stuff it in the buffer as-is.
1151	*/
1152	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
1153
1154	tun_type = cp->dest->tun_type;
1155	tun_flags = cp->dest->tun_flags;
1156
1157	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1158	size_t gue_hdrlen, gue_optlen = `0`;
1159
1160	if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1161	skb->ip_summed == CHECKSUM_PARTIAL) {
1162	gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1163	}
1164	gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
1165
1166	max_headroom += sizeof(struct udphdr) + gue_hdrlen;
1167	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1168	size_t gre_hdrlen;
1169	__be16 tflags = `0`;
1170
1171	if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1172	tflags \|= TUNNEL_CSUM;
1173	gre_hdrlen = gre_calc_hlen(o_flags: tflags);
1174
1175	max_headroom += gre_hdrlen;
1176	}
1177
1178	/ We only care about the df field if sysctl_pmtu_disc(ipvs) is set /
1179	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
1180	skb = ip_vs_prepare_tunneled_skb(skb, skb_af: cp->af, max_headroom,
1181	next_protocol: &next_protocol, NULL, dsfield: &dsfield,
1182	ttl: &ttl, df: dfp);
1183	if (IS_ERR(ptr: skb))
1184	return NF_STOLEN;
1185
1186	gso_type = __tun_gso_type_mask(AF_INET, orig_af: cp->af);
1187	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1188	if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) \|\|
1189	(tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1190	gso_type \|= SKB_GSO_UDP_TUNNEL_CSUM;
1191	else
1192	gso_type \|= SKB_GSO_UDP_TUNNEL;
1193	if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1194	skb->ip_summed == CHECKSUM_PARTIAL) {
1195	gso_type \|= SKB_GSO_TUNNEL_REMCSUM;
1196	}
1197	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1198	if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1199	gso_type \|= SKB_GSO_GRE_CSUM;
1200	else
1201	gso_type \|= SKB_GSO_GRE;
1202	}
1203
1204	if (iptunnel_handle_offloads(skb, gso_type_mask: gso_type))
1205	goto tx_error;
1206
1207	skb->transport_header = skb->network_header;
1208
1209	skb_set_inner_ipproto(skb, ipproto: next_protocol);
1210	skb_set_inner_mac_header(skb, offset: skb_inner_network_offset(skb));
1211
1212	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1213	bool check = false;
1214
1215	if (ipvs_gue_encap(net, skb, cp, next_protocol: &next_protocol))
1216	goto tx_error;
1217
1218	if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) \|\|
1219	(tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1220	check = true;
1221
1222	udp_set_csum(nocheck: !check, skb, saddr, daddr: cp->daddr.ip, len: skb->len);
1223	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
1224	ipvs_gre_encap(net, skb, cp, next_protocol: &next_protocol);
1225
1226	skb_push(skb, len: sizeof(struct iphdr));
1227	skb_reset_network_header(skb);
1228	memset(&(IPCB(skb)->opt), `0`, sizeof(IPCB(skb)->opt));
1229
1230	/*
1231	* Push down and install the IPIP header.
1232	*/
1233	iph = ip_hdr(skb);
1234	iph->version = `4`;
1235	iph->ihl = sizeof(struct iphdr)>>`2`;
1236	iph->frag_off = df;
1237	iph->protocol = next_protocol;
1238	iph->tos = dsfield;
1239	iph->daddr = cp->daddr.ip;
1240	iph->saddr = saddr;
1241	iph->ttl = ttl;
1242	ip_select_ident(net, skb, NULL);
1243
1244	/ Another hack: avoid icmp_send in ip_fragment /
1245	skb->ignore_df = `1`;
1246
1247	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
1248	if (ret == NF_ACCEPT)
1249	ip_local_out(net, sk: skb->sk, skb);
1250	else if (ret == NF_DROP)
1251	kfree_skb(skb);
1252
1253	return NF_STOLEN;
1254
1255	tx_error:
1256	kfree_skb(skb);
1257	return NF_STOLEN;
1258	}
1259
1260	#ifdef CONFIG_IP_VS_IPV6
1261	int
1262	ip_vs_tunnel_xmit_v6(struct sk_buff skb, struct* ip_vs_conn *cp,
1263	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
1264	{
1265	struct netns_ipvs *ipvs = cp->ipvs;
1266	struct net *net = ipvs->net;
1267	struct rt6_info rt; /* Route to the other host /
1268	struct in6_addr saddr; / Source for tunnel /
1269	struct net_device tdev; /* Device to other host /
1270	__u8 next_protocol = `0`;
1271	__u32 payload_len = `0`;
1272	__u8 dsfield = `0`;
1273	__u8 ttl = `0`;
1274	struct ipv6hdr iph; /* Our new IP header /
1275	unsigned int max_headroom; / The extra header space needed /
1276	int ret, local;
1277	int tun_type, gso_type;
1278	int tun_flags;
1279
1280	local = __ip_vs_get_out_rt_v6(ipvs, skb_af: cp->af, skb, dest: cp->dest,
1281	daddr: &cp->daddr.in6,
1282	ret_saddr: &saddr, ipvsh, do_xfrm: `1`,
1283	rt_mode: IP_VS_RT_MODE_LOCAL \|
1284	IP_VS_RT_MODE_NON_LOCAL \|
1285	IP_VS_RT_MODE_TUNNEL);
1286	if (local < `0`)
1287	goto tx_error;
1288	if (local)
1289	return ip_vs_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local: `1`);
1290
1291	rt = (struct rt6_info *) skb_dst(skb);
1292	tdev = rt->dst.dev;
1293
1294	/*
1295	* Okay, now see if we can stuff it in the buffer as-is.
1296	*/
1297	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
1298
1299	tun_type = cp->dest->tun_type;
1300	tun_flags = cp->dest->tun_flags;
1301
1302	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1303	size_t gue_hdrlen, gue_optlen = `0`;
1304
1305	if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1306	skb->ip_summed == CHECKSUM_PARTIAL) {
1307	gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
1308	}
1309	gue_hdrlen = sizeof(struct guehdr) + gue_optlen;
1310
1311	max_headroom += sizeof(struct udphdr) + gue_hdrlen;
1312	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1313	size_t gre_hdrlen;
1314	__be16 tflags = `0`;
1315
1316	if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1317	tflags \|= TUNNEL_CSUM;
1318	gre_hdrlen = gre_calc_hlen(o_flags: tflags);
1319
1320	max_headroom += gre_hdrlen;
1321	}
1322
1323	skb = ip_vs_prepare_tunneled_skb(skb, skb_af: cp->af, max_headroom,
1324	next_protocol: &next_protocol, payload_len: &payload_len,
1325	dsfield: &dsfield, ttl: &ttl, NULL);
1326	if (IS_ERR(ptr: skb))
1327	return NF_STOLEN;
1328
1329	gso_type = __tun_gso_type_mask(AF_INET6, orig_af: cp->af);
1330	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1331	if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) \|\|
1332	(tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1333	gso_type \|= SKB_GSO_UDP_TUNNEL_CSUM;
1334	else
1335	gso_type \|= SKB_GSO_UDP_TUNNEL;
1336	if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
1337	skb->ip_summed == CHECKSUM_PARTIAL) {
1338	gso_type \|= SKB_GSO_TUNNEL_REMCSUM;
1339	}
1340	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1341	if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
1342	gso_type \|= SKB_GSO_GRE_CSUM;
1343	else
1344	gso_type \|= SKB_GSO_GRE;
1345	}
1346
1347	if (iptunnel_handle_offloads(skb, gso_type_mask: gso_type))
1348	goto tx_error;
1349
1350	skb->transport_header = skb->network_header;
1351
1352	skb_set_inner_ipproto(skb, ipproto: next_protocol);
1353	skb_set_inner_mac_header(skb, offset: skb_inner_network_offset(skb));
1354
1355	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1356	bool check = false;
1357
1358	if (ipvs_gue_encap(net, skb, cp, next_protocol: &next_protocol))
1359	goto tx_error;
1360
1361	if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) \|\|
1362	(tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
1363	check = true;
1364
1365	udp6_set_csum(nocheck: !check, skb, saddr: &saddr, daddr: &cp->daddr.in6, len: skb->len);
1366	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
1367	ipvs_gre_encap(net, skb, cp, next_protocol: &next_protocol);
1368
1369	skb_push(skb, len: sizeof(struct ipv6hdr));
1370	skb_reset_network_header(skb);
1371	memset(&(IPCB(skb)->opt), `0`, sizeof(IPCB(skb)->opt));
1372
1373	/*
1374	* Push down and install the IPIP header.
1375	*/
1376	iph = ipv6_hdr(skb);
1377	iph->version = `6`;
1378	iph->nexthdr = next_protocol;
1379	iph->payload_len = htons(payload_len);
1380	memset(&iph->flow_lbl, `0`, sizeof(iph->flow_lbl));
1381	ipv6_change_dsfield(ipv6h: iph, mask: `0`, value: dsfield);
1382	iph->daddr = cp->daddr.in6;
1383	iph->saddr = saddr;
1384	iph->hop_limit = ttl;
1385
1386	/ Another hack: avoid icmp_send in ip_fragment /
1387	skb->ignore_df = `1`;
1388
1389	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
1390	if (ret == NF_ACCEPT)
1391	ip6_local_out(net, sk: skb->sk, skb);
1392	else if (ret == NF_DROP)
1393	kfree_skb(skb);
1394
1395	return NF_STOLEN;
1396
1397	tx_error:
1398	kfree_skb(skb);
1399	return NF_STOLEN;
1400	}
1401	#endif
1402
1403
1404	/*
1405	* Direct Routing transmitter
1406	* Used for ANY protocol
1407	*/
1408	int
1409	ip_vs_dr_xmit(struct sk_buff skb, struct* ip_vs_conn *cp,
1410	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
1411	{
1412	int local;
1413
1414	local = __ip_vs_get_out_rt(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, daddr: cp->daddr.ip,
1415	rt_mode: IP_VS_RT_MODE_LOCAL \|
1416	IP_VS_RT_MODE_NON_LOCAL \|
1417	IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
1418	if (local < `0`)
1419	goto tx_error;
1420	if (local)
1421	return ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: `1`);
1422
1423	ip_send_check(ip: ip_hdr(skb));
1424
1425	/ Another hack: avoid icmp_send in ip_fragment /
1426	skb->ignore_df = `1`;
1427
1428	ip_vs_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local: `0`);
1429
1430	return NF_STOLEN;
1431
1432	tx_error:
1433	kfree_skb(skb);
1434	return NF_STOLEN;
1435	}
1436
1437	#ifdef CONFIG_IP_VS_IPV6
1438	int
1439	ip_vs_dr_xmit_v6(struct sk_buff skb, struct* ip_vs_conn *cp,
1440	struct ip_vs_protocol pp, struct* ip_vs_iphdr *ipvsh)
1441	{
1442	int local;
1443
1444	local = __ip_vs_get_out_rt_v6(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest,
1445	daddr: &cp->daddr.in6,
1446	NULL, ipvsh, do_xfrm: `0`,
1447	rt_mode: IP_VS_RT_MODE_LOCAL \|
1448	IP_VS_RT_MODE_NON_LOCAL \|
1449	IP_VS_RT_MODE_KNOWN_NH);
1450	if (local < `0`)
1451	goto tx_error;
1452	if (local)
1453	return ip_vs_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local: `1`);
1454
1455	/ Another hack: avoid icmp_send in ip_fragment /
1456	skb->ignore_df = `1`;
1457
1458	ip_vs_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local: `0`);
1459
1460	return NF_STOLEN;
1461
1462	tx_error:
1463	kfree_skb(skb);
1464	return NF_STOLEN;
1465	}
1466	#endif
1467
1468
1469	/*
1470	* ICMP packet transmitter
1471	* called by the ip_vs_in_icmp
1472	*/
1473	int
1474	ip_vs_icmp_xmit(struct sk_buff skb, struct* ip_vs_conn *cp,
1475	struct ip_vs_protocol pp, int* offset, unsigned int hooknum,
1476	struct ip_vs_iphdr *iph)
1477	{
1478	struct rtable rt; /* Route to the other host /
1479	int rc;
1480	int local;
1481	int rt_mode, was_input;
1482
1483	/ The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be*
1484	forwarded directly here, because there is no need to
1485	translate address/port back /*
1486	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
1487	if (cp->packet_xmit)
1488	rc = cp->packet_xmit(skb, cp, pp, iph);
1489	else
1490	rc = NF_ACCEPT;
1491	/ do not touch skb anymore /
1492	atomic_inc(v: &cp->in_pkts);
1493	return rc;
1494	}
1495
1496	/*
1497	* mangle and send the packet here (only for VS/NAT)
1498	*/
1499	was_input = rt_is_input_route(rt: skb_rtable(skb));
1500
1501	/ LOCALNODE from FORWARD hook is not supported /
1502	rt_mode = (hooknum != NF_INET_FORWARD) ?
1503	IP_VS_RT_MODE_LOCAL \| IP_VS_RT_MODE_NON_LOCAL \|
1504	IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
1505	local = __ip_vs_get_out_rt(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest, daddr: cp->daddr.ip, rt_mode,
1506	NULL, ipvsh: iph);
1507	if (local < `0`)
1508	goto tx_error;
1509	rt = skb_rtable(skb);
1510
1511	/*
1512	* Avoid duplicate tuple in reply direction for NAT traffic
1513	* to local address when connection is sync-ed
1514	*/
1515	#if IS_ENABLED(CONFIG_NF_CONNTRACK)
1516	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1517	enum ip_conntrack_info ctinfo;
1518	struct nf_conn *ct = nf_ct_get(skb, ctinfo: &ctinfo);
1519
1520	if (ct) {
1521	IP_VS_DBG(`10`, "%s(): "
1522	"stopping DNAT to local address %pI4\n",
1523	__func__, &cp->daddr.ip);
1524	goto tx_error;
1525	}
1526	}
1527	#endif
1528
1529	/ From world but DNAT to loopback address? /
1530	if (local && ipv4_is_loopback(addr: cp->daddr.ip) && was_input) {
1531	IP_VS_DBG(`1`, "%s(): "
1532	"stopping DNAT to loopback %pI4\n",
1533	__func__, &cp->daddr.ip);
1534	goto tx_error;
1535	}
1536
1537	/ copy-on-write the packet before mangling it /
1538	if (skb_ensure_writable(skb, write_len: offset))
1539	goto tx_error;
1540
1541	if (skb_cow(skb, headroom: rt->dst.dev->hard_header_len))
1542	goto tx_error;
1543
1544	ip_vs_nat_icmp(skb, pp, cp, dir: `0`);
1545
1546	/ Another hack: avoid icmp_send in ip_fragment /
1547	skb->ignore_df = `1`;
1548
1549	return ip_vs_nat_send_or_cont(pf: NFPROTO_IPV4, skb, cp, local);
1550
1551	tx_error:
1552	kfree_skb(skb);
1553	rc = NF_STOLEN;
1554	return rc;
1555	}
1556
1557	#ifdef CONFIG_IP_VS_IPV6
1558	int
1559	ip_vs_icmp_xmit_v6(struct sk_buff skb, struct* ip_vs_conn *cp,
1560	struct ip_vs_protocol pp, int* offset, unsigned int hooknum,
1561	struct ip_vs_iphdr *ipvsh)
1562	{
1563	struct rt6_info rt; /* Route to the other host /
1564	int rc;
1565	int local;
1566	int rt_mode;
1567
1568	/ The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be*
1569	forwarded directly here, because there is no need to
1570	translate address/port back /*
1571	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
1572	if (cp->packet_xmit)
1573	rc = cp->packet_xmit(skb, cp, pp, ipvsh);
1574	else
1575	rc = NF_ACCEPT;
1576	/ do not touch skb anymore /
1577	atomic_inc(v: &cp->in_pkts);
1578	return rc;
1579	}
1580
1581	/*
1582	* mangle and send the packet here (only for VS/NAT)
1583	*/
1584
1585	/ LOCALNODE from FORWARD hook is not supported /
1586	rt_mode = (hooknum != NF_INET_FORWARD) ?
1587	IP_VS_RT_MODE_LOCAL \| IP_VS_RT_MODE_NON_LOCAL \|
1588	IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
1589	local = __ip_vs_get_out_rt_v6(ipvs: cp->ipvs, skb_af: cp->af, skb, dest: cp->dest,
1590	daddr: &cp->daddr.in6, NULL, ipvsh, do_xfrm: `0`, rt_mode);
1591	if (local < `0`)
1592	goto tx_error;
1593	rt = (struct rt6_info *) skb_dst(skb);
1594	/*
1595	* Avoid duplicate tuple in reply direction for NAT traffic
1596	* to local address when connection is sync-ed
1597	*/
1598	#if IS_ENABLED(CONFIG_NF_CONNTRACK)
1599	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1600	enum ip_conntrack_info ctinfo;
1601	struct nf_conn *ct = nf_ct_get(skb, ctinfo: &ctinfo);
1602
1603	if (ct) {
1604	IP_VS_DBG(`10`, "%s(): "
1605	"stopping DNAT to local address %pI6\n",
1606	__func__, &cp->daddr.in6);
1607	goto tx_error;
1608	}
1609	}
1610	#endif
1611
1612	/ From world but DNAT to loopback address? /
1613	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
1614	ipv6_addr_type(addr: &cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
1615	IP_VS_DBG(`1`, "%s(): "
1616	"stopping DNAT to loopback %pI6\n",
1617	__func__, &cp->daddr.in6);
1618	goto tx_error;
1619	}
1620
1621	/ copy-on-write the packet before mangling it /
1622	if (skb_ensure_writable(skb, write_len: offset))
1623	goto tx_error;
1624
1625	if (skb_cow(skb, headroom: rt->dst.dev->hard_header_len))
1626	goto tx_error;
1627
1628	ip_vs_nat_icmp_v6(skb, pp, cp, dir: `0`);
1629
1630	/ Another hack: avoid icmp_send in ip_fragment /
1631	skb->ignore_df = `1`;
1632
1633	return ip_vs_nat_send_or_cont(pf: NFPROTO_IPV6, skb, cp, local);
1634
1635	tx_error:
1636	kfree_skb(skb);
1637	rc = NF_STOLEN;
1638	return rc;
1639	}
1640	#endif
1641

source code of linux/net/netfilter/ipvs/ip_vs_xmit.c