ip_vs_core.c source code [linux/net/netfilter/ipvs/ip_vs_core.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* IPVS An implementation of the IP virtual server support for the
4	* LINUX operating system. IPVS is now implemented as a module
5	* over the Netfilter framework. IPVS can be used to build a
6	* high-performance and highly available server based on a
7	* cluster of servers.
8	*
9	* Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
10	* Peter Kese <peter.kese@ijs.si>
11	* Julian Anastasov <ja@ssi.bg>
12	*
13	* The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
14	* with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
15	* and others.
16	*
17	* Changes:
18	* Paul `Rusty' Russell properly handle non-linear skbs
19	* Harald Welte don't use nfcache
20	*/
21
22	#define KMSG_COMPONENT "IPVS"
23	#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24
25	#include <linux/module.h>
26	#include <linux/kernel.h>
27	#include <linux/ip.h>
28	#include <linux/tcp.h>
29	#include <linux/sctp.h>
30	#include <linux/icmp.h>
31	#include <linux/slab.h>
32
33	#include <net/ip.h>
34	#include <net/tcp.h>
35	#include <net/udp.h>
36	#include <net/icmp.h> /* for icmp_send */
37	#include <net/gue.h>
38	#include <net/gre.h>
39	#include <net/route.h>
40	#include <net/ip6_checksum.h>
41	#include <net/netns/generic.h> /* net_generic() */
42
43	#include <linux/netfilter.h>
44	#include <linux/netfilter_ipv4.h>
45
46	#ifdef CONFIG_IP_VS_IPV6
47	#include <net/ipv6.h>
48	#include <linux/netfilter_ipv6.h>
49	#include <net/ip6_route.h>
50	#endif
51
52	#include <net/ip_vs.h>
53	#include <linux/indirect_call_wrapper.h>
54
55
56	EXPORT_SYMBOL(register_ip_vs_scheduler);
57	EXPORT_SYMBOL(unregister_ip_vs_scheduler);
58	EXPORT_SYMBOL(ip_vs_proto_name);
59	EXPORT_SYMBOL(ip_vs_conn_new);
60	EXPORT_SYMBOL(ip_vs_conn_in_get);
61	EXPORT_SYMBOL(ip_vs_conn_out_get);
62	#ifdef CONFIG_IP_VS_PROTO_TCP
63	EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
64	#endif
65	EXPORT_SYMBOL(ip_vs_conn_put);
66	#ifdef CONFIG_IP_VS_DEBUG
67	EXPORT_SYMBOL(ip_vs_get_debug_level);
68	#endif
69	EXPORT_SYMBOL(ip_vs_new_conn_out);
70
71	#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
72	#define SNAT_CALL(f, ...) \
73	INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
74	#elif defined(CONFIG_IP_VS_PROTO_TCP)
75	#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__)
76	#elif defined(CONFIG_IP_VS_PROTO_UDP)
77	#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__)
78	#else
79	#define SNAT_CALL(f, ...) f(__VA_ARGS__)
80	#endif
81
82	static unsigned int ip_vs_net_id __read_mostly;
83	/ netns cnt used for uniqueness /
84	static atomic_t ipvs_netns_cnt = ATOMIC_INIT(`0`);
85
86	/ ID used in ICMP lookups /
87	#define icmp_id(icmph) (((icmph)->un).echo.id)
88	#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
89
/*
 *	Return a printable name for an IP protocol number.
 *
 *	Well-known protocols map to string literals. Any other value is
 *	formatted as "IP_<number>" into a single static buffer, so that
 *	result is overwritten by the next call that hits the default case
 *	and must not be relied upon across calls or from concurrent callers.
 */
const char *ip_vs_proto_name(unsigned int proto)
{
	static char buf[20];

	switch (proto) {
	case IPPROTO_IP:
		return "IP";
	case IPPROTO_UDP:
		return "UDP";
	case IPPROTO_TCP:
		return "TCP";
	case IPPROTO_SCTP:
		return "SCTP";
	case IPPROTO_ICMP:
		return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
	case IPPROTO_ICMPV6:
		return "ICMPv6";
#endif
	default:
		/* bounded write: never run past buf, whatever proto is */
		snprintf(buf, sizeof(buf), "IP_%u", proto);
		return buf;
	}
}
114
115	void ip_vs_init_hash_table(struct list_head table, int* rows)
116	{
117	while (--rows >= `0`)
118	INIT_LIST_HEAD(list: &table[rows]);
119	}
120
121	static inline void
122	ip_vs_in_stats(struct ip_vs_conn cp, struct* sk_buff *skb)
123	{
124	struct ip_vs_dest *dest = cp->dest;
125	struct netns_ipvs *ipvs = cp->ipvs;
126
127	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
128	struct ip_vs_cpu_stats *s;
129	struct ip_vs_service *svc;
130
131	local_bh_disable();
132
133	s = this_cpu_ptr(dest->stats.cpustats);
134	u64_stats_update_begin(syncp: &s->syncp);
135	u64_stats_inc(p: &s->cnt.inpkts);
136	u64_stats_add(p: &s->cnt.inbytes, val: skb->len);
137	u64_stats_update_end(syncp: &s->syncp);
138
139	svc = rcu_dereference(dest->svc);
140	s = this_cpu_ptr(svc->stats.cpustats);
141	u64_stats_update_begin(syncp: &s->syncp);
142	u64_stats_inc(p: &s->cnt.inpkts);
143	u64_stats_add(p: &s->cnt.inbytes, val: skb->len);
144	u64_stats_update_end(syncp: &s->syncp);
145
146	s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
147	u64_stats_update_begin(syncp: &s->syncp);
148	u64_stats_inc(p: &s->cnt.inpkts);
149	u64_stats_add(p: &s->cnt.inbytes, val: skb->len);
150	u64_stats_update_end(syncp: &s->syncp);
151
152	local_bh_enable();
153	}
154	}
155
156
157	static inline void
158	ip_vs_out_stats(struct ip_vs_conn cp, struct* sk_buff *skb)
159	{
160	struct ip_vs_dest *dest = cp->dest;
161	struct netns_ipvs *ipvs = cp->ipvs;
162
163	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
164	struct ip_vs_cpu_stats *s;
165	struct ip_vs_service *svc;
166
167	local_bh_disable();
168
169	s = this_cpu_ptr(dest->stats.cpustats);
170	u64_stats_update_begin(syncp: &s->syncp);
171	u64_stats_inc(p: &s->cnt.outpkts);
172	u64_stats_add(p: &s->cnt.outbytes, val: skb->len);
173	u64_stats_update_end(syncp: &s->syncp);
174
175	svc = rcu_dereference(dest->svc);
176	s = this_cpu_ptr(svc->stats.cpustats);
177	u64_stats_update_begin(syncp: &s->syncp);
178	u64_stats_inc(p: &s->cnt.outpkts);
179	u64_stats_add(p: &s->cnt.outbytes, val: skb->len);
180	u64_stats_update_end(syncp: &s->syncp);
181
182	s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
183	u64_stats_update_begin(syncp: &s->syncp);
184	u64_stats_inc(p: &s->cnt.outpkts);
185	u64_stats_add(p: &s->cnt.outbytes, val: skb->len);
186	u64_stats_update_end(syncp: &s->syncp);
187
188	local_bh_enable();
189	}
190	}
191
192
193	static inline void
194	ip_vs_conn_stats(struct ip_vs_conn cp, struct* ip_vs_service *svc)
195	{
196	struct netns_ipvs *ipvs = svc->ipvs;
197	struct ip_vs_cpu_stats *s;
198
199	local_bh_disable();
200
201	s = this_cpu_ptr(cp->dest->stats.cpustats);
202	u64_stats_update_begin(syncp: &s->syncp);
203	u64_stats_inc(p: &s->cnt.conns);
204	u64_stats_update_end(syncp: &s->syncp);
205
206	s = this_cpu_ptr(svc->stats.cpustats);
207	u64_stats_update_begin(syncp: &s->syncp);
208	u64_stats_inc(p: &s->cnt.conns);
209	u64_stats_update_end(syncp: &s->syncp);
210
211	s = this_cpu_ptr(ipvs->tot_stats->s.cpustats);
212	u64_stats_update_begin(syncp: &s->syncp);
213	u64_stats_inc(p: &s->cnt.conns);
214	u64_stats_update_end(syncp: &s->syncp);
215
216	local_bh_enable();
217	}
218
219
220	static inline void
221	ip_vs_set_state(struct ip_vs_conn cp, int* direction,
222	const struct sk_buff *skb,
223	struct ip_vs_proto_data *pd)
224	{
225	if (likely(pd->pp->state_transition))
226	pd->pp->state_transition(cp, direction, skb, pd);
227	}
228
229	static inline int
230	ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
231	struct sk_buff skb, int* protocol,
232	const union nf_inet_addr *caddr, __be16 cport,
233	const union nf_inet_addr *vaddr, __be16 vport,
234	struct ip_vs_conn_param *p)
235	{
236	ip_vs_conn_fill_param(ipvs: svc->ipvs, af: svc->af, protocol, caddr, cport, vaddr,
237	vport, p);
238	p->pe = rcu_dereference(svc->pe);
239	if (p->pe && p->pe->fill_param)
240	return p->pe->fill_param(p, skb);
241
242	return `0`;
243	}
244
245	/*
246	* IPVS persistent scheduling function
247	* It creates a connection entry according to its template if exists,
248	* or selects a server and creates a connection entry plus a template.
249	* Locking: we are svc user (svc->refcnt), so we hold all dests too
250	* Protocols supported: TCP, UDP
251	*/
252	static struct ip_vs_conn *
253	ip_vs_sched_persist(struct ip_vs_service *svc,
254	struct sk_buff *skb, __be16 src_port, __be16 dst_port,
255	int ignored, struct* ip_vs_iphdr *iph)
256	{
257	struct ip_vs_conn *cp = NULL;
258	struct ip_vs_dest *dest;
259	struct ip_vs_conn *ct;
260	__be16 dport = `0`; / destination port to forward /
261	unsigned int flags;
262	struct ip_vs_conn_param param;
263	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
264	union nf_inet_addr snet; / source network of the client,*
265	after masking /*
266	const union nf_inet_addr src_addr, dst_addr;
267
268	if (likely(!ip_vs_iph_inverse(iph))) {
269	src_addr = &iph->saddr;
270	dst_addr = &iph->daddr;
271	} else {
272	src_addr = &iph->daddr;
273	dst_addr = &iph->saddr;
274	}
275
276
277	/ Mask saddr with the netmask to adjust template granularity /
278	#ifdef CONFIG_IP_VS_IPV6
279	if (svc->af == AF_INET6)
280	ipv6_addr_prefix(pfx: &snet.in6, addr: &src_addr->in6,
281	plen: (__force __u32) svc->netmask);
282	else
283	#endif
284	snet.ip = src_addr->ip & svc->netmask;
285
286	IP_VS_DBG_BUF(`6`, "p-schedule: src %s:%u dest %s:%u "
287	"mnet %s\n",
288	IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port),
289	IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port),
290	IP_VS_DBG_ADDR(svc->af, &snet));
291
292	/*
293	* As far as we know, FTP is a very complicated network protocol, and
294	* it uses control connection and data connections. For active FTP,
295	* FTP server initialize data connection to the client, its source port
296	* is often 20. For passive FTP, FTP server tells the clients the port
297	* that it passively listens to, and the client issues the data
298	* connection. In the tunneling or direct routing mode, the load
299	* balancer is on the client-to-server half of connection, the port
300	* number is unknown to the load balancer. So, a conn template like
301	* <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
302	* service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
303	* is created for other persistent services.
304	*/
305	{
306	int protocol = iph->protocol;
307	const union nf_inet_addr *vaddr = dst_addr;
308	__be16 vport = `0`;
309
310	if (dst_port == svc->port) {
311	/ non-FTP template:*
312	* <protocol, caddr, 0, vaddr, vport, daddr, dport>
313	* FTP template:
314	* <protocol, caddr, 0, vaddr, 0, daddr, 0>
315	*/
316	if (svc->port != FTPPORT)
317	vport = dst_port;
318	} else {
319	/ Note: persistent fwmark-based services and*
320	* persistent port zero service are handled here.
321	* fwmark template:
322	* <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
323	* port zero template:
324	* <protocol,caddr,0,vaddr,0,daddr,0>
325	*/
326	if (svc->fwmark) {
327	protocol = IPPROTO_IP;
328	vaddr = &fwmark;
329	}
330	}
331	/ return ignored = -1 so NF_DROP can be used /*
332	if (ip_vs_conn_fill_param_persist(svc, skb, protocol, caddr: &snet, cport: `0`,
333	vaddr, vport, p: &param) < `0`) {
334	*ignored = -`1`;
335	return NULL;
336	}
337	}
338
339	/ Check if a template already exists /
340	ct = ip_vs_ct_in_get(p: &param);
341	if (!ct \|\| !ip_vs_check_template(ct, NULL)) {
342	struct ip_vs_scheduler *sched;
343
344	/*
345	* No template found or the dest of the connection
346	* template is not available.
347	* return *ignored=0 i.e. ICMP and NF_DROP
348	*/
349	sched = rcu_dereference(svc->scheduler);
350	if (sched) {
351	/ read svc->sched_data after svc->scheduler /
352	smp_rmb();
353	dest = sched->schedule(svc, skb, iph);
354	} else {
355	dest = NULL;
356	}
357	if (!dest) {
358	IP_VS_DBG(`1`, "p-schedule: no dest found.\n");
359	kfree(objp: param.pe_data);
360	*ignored = `0`;
361	return NULL;
362	}
363
364	if (dst_port == svc->port && svc->port != FTPPORT)
365	dport = dest->port;
366
367	/ Create a template*
368	* This adds param.pe_data to the template,
369	* and thus param.pe_data will be destroyed
370	* when the template expires */
371	ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport,
372	IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
373	if (ct == NULL) {
374	kfree(objp: param.pe_data);
375	*ignored = -`1`;
376	return NULL;
377	}
378
379	ct->timeout = svc->timeout;
380	} else {
381	/ set destination with the found template /
382	dest = ct->dest;
383	kfree(objp: param.pe_data);
384	}
385
386	dport = dst_port;
387	if (dport == svc->port && dest->port)
388	dport = dest->port;
389
390	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
391	&& iph->protocol == IPPROTO_UDP) ?
392	IP_VS_CONN_F_ONE_PACKET : `0`;
393
394	/*
395	* Create a new connection according to the template
396	*/
397	ip_vs_conn_fill_param(ipvs: svc->ipvs, af: svc->af, protocol: iph->protocol, caddr: src_addr,
398	cport: src_port, vaddr: dst_addr, vport: dst_port, p: &param);
399
400	cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
401	skb->mark);
402	if (cp == NULL) {
403	ip_vs_conn_put(ct);
404	*ignored = -`1`;
405	return NULL;
406	}
407
408	/*
409	* Add its control
410	*/
411	ip_vs_control_add(cp, ctl_cp: ct);
412	ip_vs_conn_put(ct);
413
414	ip_vs_conn_stats(cp, svc);
415	return cp;
416	}
417
418
419	/*
420	* IPVS main scheduling function
421	* It selects a server according to the virtual service, and
422	* creates a connection entry.
423	* Protocols supported: TCP, UDP
424	*
425	* Usage of *ignored
426	*
427	* 1 : protocol tried to schedule (eg. on SYN), found svc but the
428	* svc/scheduler decides that this packet should be accepted with
429	* NF_ACCEPT because it must not be scheduled.
430	*
431	* 0 : scheduler can not find destination, so try bypass or
432	* return ICMP and then NF_DROP (ip_vs_leave).
433	*
434	* -1 : scheduler tried to schedule but fatal error occurred, eg.
435	* ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
436	* failure such as missing Call-ID, ENOMEM on skb_linearize
437	* or pe_data. In this case we should return NF_DROP without
438	* any attempts to send ICMP with ip_vs_leave.
439	*/
440	struct ip_vs_conn *
441	ip_vs_schedule(struct ip_vs_service svc, struct* sk_buff *skb,
442	struct ip_vs_proto_data pd, int* *ignored,
443	struct ip_vs_iphdr *iph)
444	{
445	struct ip_vs_protocol *pp = pd->pp;
446	struct ip_vs_conn *cp = NULL;
447	struct ip_vs_scheduler *sched;
448	struct ip_vs_dest *dest;
449	__be16 _ports[`2`], *pptr, cport, vport;
450	const void caddr, vaddr;
451	unsigned int flags;
452
453	*ignored = `1`;
454	/*
455	* IPv6 frags, only the first hit here.
456	*/
457	pptr = frag_safe_skb_hp(skb, offset: iph->len, len: sizeof(_ports), buffer: _ports);
458	if (pptr == NULL)
459	return NULL;
460
461	if (likely(!ip_vs_iph_inverse(iph))) {
462	cport = pptr[`0`];
463	caddr = &iph->saddr;
464	vport = pptr[`1`];
465	vaddr = &iph->daddr;
466	} else {
467	cport = pptr[`1`];
468	caddr = &iph->daddr;
469	vport = pptr[`0`];
470	vaddr = &iph->saddr;
471	}
472
473	/*
474	* FTPDATA needs this check when using local real server.
475	* Never schedule Active FTPDATA connections from real server.
476	* For LVS-NAT they must be already created. For other methods
477	* with persistence the connection is created on SYN+ACK.
478	*/
479	if (cport == FTPDATA) {
480	IP_VS_DBG_PKT(`12`, svc->af, pp, skb, iph->off,
481	"Not scheduling FTPDATA");
482	return NULL;
483	}
484
485	/*
486	* Do not schedule replies from local real server.
487	*/
488	if ((!skb->dev \|\| skb->dev->flags & IFF_LOOPBACK)) {
489	iph->hdr_flags ^= IP_VS_HDR_INVERSE;
490	cp = INDIRECT_CALL_1(pp->conn_in_get,
491	ip_vs_conn_in_get_proto, svc->ipvs,
492	svc->af, skb, iph);
493	iph->hdr_flags ^= IP_VS_HDR_INVERSE;
494
495	if (cp) {
496	IP_VS_DBG_PKT(`12`, svc->af, pp, skb, iph->off,
497	"Not scheduling reply for existing"
498	" connection");
499	__ip_vs_conn_put(cp);
500	return NULL;
501	}
502	}
503
504	/*
505	* Persistent service
506	*/
507	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
508	return ip_vs_sched_persist(svc, skb, src_port: cport, dst_port: vport, ignored,
509	iph);
510
511	*ignored = `0`;
512
513	/*
514	* Non-persistent service
515	*/
516	if (!svc->fwmark && vport != svc->port) {
517	if (!svc->port)
518	pr_err("Schedule: port zero only supported "
519	"in persistent services, "
520	"check your ipvs configuration\n");
521	return NULL;
522	}
523
524	sched = rcu_dereference(svc->scheduler);
525	if (sched) {
526	/ read svc->sched_data after svc->scheduler /
527	smp_rmb();
528	dest = sched->schedule(svc, skb, iph);
529	} else {
530	dest = NULL;
531	}
532	if (dest == NULL) {
533	IP_VS_DBG(`1`, "Schedule: no dest found.\n");
534	return NULL;
535	}
536
537	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
538	&& iph->protocol == IPPROTO_UDP) ?
539	IP_VS_CONN_F_ONE_PACKET : `0`;
540
541	/*
542	* Create a connection entry.
543	*/
544	{
545	struct ip_vs_conn_param p;
546
547	ip_vs_conn_fill_param(ipvs: svc->ipvs, af: svc->af, protocol: iph->protocol,
548	caddr, cport, vaddr, vport, p: &p);
549	cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
550	dest->port ? dest->port : vport,
551	flags, dest, skb->mark);
552	if (!cp) {
553	*ignored = -`1`;
554	return NULL;
555	}
556	}
557
558	IP_VS_DBG_BUF(`6`, "Schedule fwd:%c c:%s:%u v:%s:%u "
559	"d:%s:%u conn->flags:%X conn->refcnt:%d\n",
560	ip_vs_fwd_tag(cp),
561	IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
562	IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
563	IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
564	cp->flags, refcount_read(&cp->refcnt));
565
566	ip_vs_conn_stats(cp, svc);
567	return cp;
568	}
569
570	static inline int ip_vs_addr_is_unicast(struct net net, int* af,
571	union nf_inet_addr *addr)
572	{
573	#ifdef CONFIG_IP_VS_IPV6
574	if (af == AF_INET6)
575	return ipv6_addr_type(addr: &addr->in6) & IPV6_ADDR_UNICAST;
576	#endif
577	return (inet_addr_type(net, addr: addr->ip) == RTN_UNICAST);
578	}
579
580	/*
581	* Pass or drop the packet.
582	* Called by ip_vs_in, when the virtual service is available but
583	* no destination is available for a new connection.
584	*/
585	int ip_vs_leave(struct ip_vs_service svc, struct* sk_buff *skb,
586	struct ip_vs_proto_data pd, struct* ip_vs_iphdr *iph)
587	{
588	__be16 _ports[`2`], *pptr, dport;
589	struct netns_ipvs *ipvs = svc->ipvs;
590	struct net *net = ipvs->net;
591
592	pptr = frag_safe_skb_hp(skb, offset: iph->len, len: sizeof(_ports), buffer: _ports);
593	if (!pptr)
594	return NF_DROP;
595	dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[`1`] : pptr[`0`];
596
597	/ if it is fwmark-based service, the cache_bypass sysctl is up*
598	and the destination is a non-local unicast, then create
599	a cache_bypass connection entry /*
600	if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
601	!(iph->hdr_flags & (IP_VS_HDR_INVERSE \| IP_VS_HDR_ICMP)) &&
602	ip_vs_addr_is_unicast(net, af: svc->af, addr: &iph->daddr)) {
603	int ret;
604	struct ip_vs_conn *cp;
605	unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
606	iph->protocol == IPPROTO_UDP) ?
607	IP_VS_CONN_F_ONE_PACKET : `0`;
608	union nf_inet_addr daddr = { .all = { `0`, `0`, `0`, `0` } };
609
610	/ create a new connection entry /
611	IP_VS_DBG(`6`, "%s(): create a cache_bypass entry\n", __func__);
612	{
613	struct ip_vs_conn_param p;
614	ip_vs_conn_fill_param(ipvs: svc->ipvs, af: svc->af, protocol: iph->protocol,
615	caddr: &iph->saddr, cport: pptr[`0`],
616	vaddr: &iph->daddr, vport: pptr[`1`], p: &p);
617	cp = ip_vs_conn_new(&p, svc->af, &daddr, `0`,
618	IP_VS_CONN_F_BYPASS \| flags,
619	NULL, skb->mark);
620	if (!cp)
621	return NF_DROP;
622	}
623
624	/ statistics /
625	ip_vs_in_stats(cp, skb);
626
627	/ set state /
628	ip_vs_set_state(cp, direction: IP_VS_DIR_INPUT, skb, pd);
629
630	/ transmit the first SYN packet /
631	ret = cp->packet_xmit(skb, cp, pd->pp, iph);
632	/ do not touch skb anymore /
633
634	if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
635	atomic_inc(v: &cp->control->in_pkts);
636	else
637	atomic_inc(v: &cp->in_pkts);
638	ip_vs_conn_put(cp);
639	return ret;
640	}
641
642	/*
643	* When the virtual ftp service is presented, packets destined
644	* for other services on the VIP may get here (except services
645	* listed in the ipvs table), pass the packets, because it is
646	* not ipvs job to decide to drop the packets.
647	*/
648	if (svc->port == FTPPORT && dport != FTPPORT)
649	return NF_ACCEPT;
650
651	if (unlikely(ip_vs_iph_icmp(iph)))
652	return NF_DROP;
653
654	/*
655	* Notify the client that the destination is unreachable, and
656	* release the socket buffer.
657	* Since it is in IP layer, the TCP socket is not actually
658	* created, the TCP RST packet cannot be sent, instead that
659	* ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
660	*/
661	#ifdef CONFIG_IP_VS_IPV6
662	if (svc->af == AF_INET6) {
663	if (!skb->dev)
664	skb->dev = net->loopback_dev;
665	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, info: `0`);
666	} else
667	#endif
668	icmp_send(skb_in: skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, info: `0`);
669
670	return NF_DROP;
671	}
672
/*
 * Accessors for the snat_reroute and nat_icmp_send settings.
 * With CONFIG_SYSCTL they read the per-netns values; without it both
 * are hard-wired to 0.
 */
#ifdef CONFIG_SYSCTL

static int sysctl_snat_reroute(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_snat_reroute;
}

static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
{
	return ipvs->sysctl_nat_icmp_send;
}

#else

static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }

#endif
691
692	__sum16 ip_vs_checksum_complete(struct sk_buff skb, int* offset)
693	{
694	return csum_fold(sum: skb_checksum(skb, offset, len: skb->len - offset, csum: `0`));
695	}
696
697	static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
698	{
699	if (NF_INET_LOCAL_IN == hooknum)
700	return IP_DEFRAG_VS_IN;
701	if (NF_INET_FORWARD == hooknum)
702	return IP_DEFRAG_VS_FWD;
703	return IP_DEFRAG_VS_OUT;
704	}
705
706	static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs,
707	struct sk_buff *skb, u_int32_t user)
708	{
709	int err;
710
711	local_bh_disable();
712	err = ip_defrag(net: ipvs->net, skb, user);
713	local_bh_enable();
714	if (!err)
715	ip_send_check(ip: ip_hdr(skb));
716
717	return err;
718	}
719
720	static int ip_vs_route_me_harder(struct netns_ipvs ipvs, int* af,
721	struct sk_buff skb, unsigned* int hooknum)
722	{
723	if (!sysctl_snat_reroute(ipvs))
724	return `0`;
725	/ Reroute replies only to remote clients (FORWARD and LOCAL_OUT) /
726	if (NF_INET_LOCAL_IN == hooknum)
727	return `0`;
728	#ifdef CONFIG_IP_VS_IPV6
729	if (af == AF_INET6) {
730	struct dst_entry *dst = skb_dst(skb);
731
732	if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
733	ip6_route_me_harder(net: ipvs->net, sk: skb->sk, skb) != `0`)
734	return `1`;
735	} else
736	#endif
737	if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
738	ip_route_me_harder(net: ipvs->net, sk: skb->sk, skb, addr_type: RTN_LOCAL) != `0`)
739	return `1`;
740
741	return `0`;
742	}
743
744	/*
745	* Packet has been made sufficiently writable in caller
746	* - inout: 1=in->out, 0=out->in
747	*/
748	void ip_vs_nat_icmp(struct sk_buff skb, struct* ip_vs_protocol *pp,
749	struct ip_vs_conn cp, int* inout)
750	{
751	struct iphdr *iph = ip_hdr(skb);
752	unsigned int icmp_offset = iph->ihl*`4`;
753	struct icmphdr icmph = (struct* icmphdr *)(skb_network_header(skb) +
754	icmp_offset);
755	struct iphdr ciph = (struct* iphdr *)(icmph + `1`);
756
757	if (inout) {
758	iph->saddr = cp->vaddr.ip;
759	ip_send_check(ip: iph);
760	ciph->daddr = cp->vaddr.ip;
761	ip_send_check(ip: ciph);
762	} else {
763	iph->daddr = cp->daddr.ip;
764	ip_send_check(ip: iph);
765	ciph->saddr = cp->daddr.ip;
766	ip_send_check(ip: ciph);
767	}
768
769	/ the TCP/UDP/SCTP port /
770	if (IPPROTO_TCP == ciph->protocol \|\| IPPROTO_UDP == ciph->protocol \|\|
771	IPPROTO_SCTP == ciph->protocol) {
772	__be16 ports = (void* )ciph + ciph->ihl`4`;
773
774	if (inout)
775	ports[`1`] = cp->vport;
776	else
777	ports[`0`] = cp->dport;
778	}
779
780	/ And finally the ICMP checksum /
781	icmph->checksum = `0`;
782	icmph->checksum = ip_vs_checksum_complete(skb, offset: icmp_offset);
783	skb->ip_summed = CHECKSUM_UNNECESSARY;
784
785	if (inout)
786	IP_VS_DBG_PKT(`11`, AF_INET, pp, skb, (void )ciph - (void* *)iph,
787	"Forwarding altered outgoing ICMP");
788	else
789	IP_VS_DBG_PKT(`11`, AF_INET, pp, skb, (void )ciph - (void* *)iph,
790	"Forwarding altered incoming ICMP");
791	}
792
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_nat_icmp(): rewrite the addresses (and, for
 * an unfragmented TCP/UDP/SCTP payload, one port) of an ICMPv6 error
 * and of the embedded IPv6 header, then set the skb up for a partial
 * checksum over the ICMPv6 message.
 * - inout: 1=in->out, 0=out->in
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
		    struct ip_vs_conn *cp, int inout)
{
	struct ipv6hdr *iph	 = ipv6_hdr(skb);
	unsigned int icmp_offset = 0;
	unsigned int offs	 = 0; /* header offset*/
	int protocol;
	struct icmp6hdr *icmph;
	struct ipv6hdr *ciph;
	unsigned short fragoffs;

	/* locate the ICMPv6 header, then the embedded IPv6 header after it */
	ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
	icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
	offs = icmp_offset + sizeof(struct icmp6hdr);
	ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);

	/* advance offs to the embedded packet's upper-layer header */
	protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);

	if (inout) {
		iph->saddr = cp->vaddr.in6;
		ciph->daddr = cp->vaddr.in6;
	} else {
		iph->daddr = cp->daddr.in6;
		ciph->saddr = cp->daddr.in6;
	}

	/* the TCP/UDP/SCTP port */
	if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
			  IPPROTO_SCTP == protocol)) {
		__be16 *ports = (void *)(skb_network_header(skb) + offs);

		IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
			      ntohs(inout ? ports[1] : ports[0]),
			      ntohs(inout ? cp->vport : cp->dport));
		if (inout)
			ports[1] = cp->vport;
		else
			ports[0] = cp->dport;
	}

	/* And finally the ICMP checksum */
	icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
					      skb->len - icmp_offset,
					      IPPROTO_ICMPV6, 0);
	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (inout)
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)ciph - (void *)iph,
			      "Forwarding altered outgoing ICMPv6");
	else
		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
			      (void *)ciph - (void *)iph,
			      "Forwarding altered incoming ICMPv6");
}
#endif
852
853	/ Handle relevant response ICMP messages - forward to the right*
854	* destination host.
855	*/
856	static int handle_response_icmp(int af, struct sk_buff *skb,
857	union nf_inet_addr *snet,
858	__u8 protocol, struct ip_vs_conn *cp,
859	struct ip_vs_protocol *pp,
860	unsigned int offset, unsigned int ihl,
861	unsigned int hooknum)
862	{
863	unsigned int verdict = NF_DROP;
864
865	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
866	goto after_nat;
867
868	/ Ensure the checksum is correct /
869	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, offset: ihl)) {
870	/ Failed checksum! /
871	IP_VS_DBG_BUF(`1`, "Forward ICMP: failed checksum from %s!\n",
872	IP_VS_DBG_ADDR(af, snet));
873	goto out;
874	}
875
876	if (IPPROTO_TCP == protocol \|\| IPPROTO_UDP == protocol \|\|
877	IPPROTO_SCTP == protocol)
878	offset += `2` * sizeof(__u16);
879	if (skb_ensure_writable(skb, write_len: offset))
880	goto out;
881
882	#ifdef CONFIG_IP_VS_IPV6
883	if (af == AF_INET6)
884	ip_vs_nat_icmp_v6(skb, pp, cp, inout: `1`);
885	else
886	#endif
887	ip_vs_nat_icmp(skb, pp, cp, inout: `1`);
888
889	if (ip_vs_route_me_harder(ipvs: cp->ipvs, af, skb, hooknum))
890	goto out;
891
892	after_nat:
893	/ do the statistics and put it back /
894	ip_vs_out_stats(cp, skb);
895
896	skb->ipvs_property = `1`;
897	if (!(cp->flags & IP_VS_CONN_F_NFCT))
898	ip_vs_notrack(skb);
899	else
900	ip_vs_update_conntrack(skb, cp, outin: `0`);
901	verdict = NF_ACCEPT;
902
903	out:
904	__ip_vs_conn_put(cp);
905
906	return verdict;
907	}
908
909	/*
910	* Handle ICMP messages in the inside-to-outside direction (outgoing).
911	* Find any that might be relevant, check against existing connections.
912	* Currently handles error types - unreachable, quench, ttl exceeded.
913	*/
914	static int ip_vs_out_icmp(struct netns_ipvs ipvs, struct* sk_buff *skb,
915	int related, unsigned* int hooknum)
916	{
917	struct iphdr *iph;
918	struct icmphdr _icmph, *ic;
919	struct iphdr _ciph, cih; /* The ip header contained within the ICMP /
920	struct ip_vs_iphdr ciph;
921	struct ip_vs_conn *cp;
922	struct ip_vs_protocol *pp;
923	unsigned int offset, ihl;
924	union nf_inet_addr snet;
925
926	*related = `1`;
927
928	/ reassemble IP fragments /
929	if (ip_is_fragment(iph: ip_hdr(skb))) {
930	if (ip_vs_gather_frags(ipvs, skb, user: ip_vs_defrag_user(hooknum)))
931	return NF_STOLEN;
932	}
933
934	iph = ip_hdr(skb);
935	offset = ihl = iph->ihl * `4`;
936	ic = skb_header_pointer(skb, offset, len: sizeof(_icmph), buffer: &_icmph);
937	if (ic == NULL)
938	return NF_DROP;
939
940	IP_VS_DBG(`12`, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
941	ic->type, ntohs(icmp_id(ic)),
942	&iph->saddr, &iph->daddr);
943
944	/*
945	* Work through seeing if this is for us.
946	* These checks are supposed to be in an order that means easy
947	* things are checked first to speed up processing.... however
948	* this means that some packets will manage to get a long way
949	* down this stack and then be rejected, but that's life.
950	*/
951	if ((ic->type != ICMP_DEST_UNREACH) &&
952	(ic->type != ICMP_SOURCE_QUENCH) &&
953	(ic->type != ICMP_TIME_EXCEEDED)) {
954	*related = `0`;
955	return NF_ACCEPT;
956	}
957
958	/ Now find the contained IP header /
959	offset += sizeof(_icmph);
960	cih = skb_header_pointer(skb, offset, len: sizeof(_ciph), buffer: &_ciph);
961	if (cih == NULL)
962	return NF_ACCEPT; / The packet looks wrong, ignore /
963
964	pp = ip_vs_proto_get(proto: cih->protocol);
965	if (!pp)
966	return NF_ACCEPT;
967
968	/ Is the embedded protocol header present? /
969	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
970	pp->dont_defrag))
971	return NF_ACCEPT;
972
973	IP_VS_DBG_PKT(`11`, AF_INET, pp, skb, offset,
974	"Checking outgoing ICMP for");
975
976	ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, inverse: true, iphdr: &ciph);
977
978	/ The embedded headers contain source and dest in reverse order /
979	cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
980	ipvs, AF_INET, skb, &ciph);
981	if (!cp)
982	return NF_ACCEPT;
983
984	snet.ip = iph->saddr;
985	return handle_response_icmp(AF_INET, skb, snet: &snet, protocol: cih->protocol, cp,
986	pp, offset: ciph.len, ihl, hooknum);
987	}
988
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 counterpart of ip_vs_out_icmp(): match an outgoing ICMPv6
 *	error against existing connections and hand it to
 *	handle_response_icmp().
 *
 *	*related is set to 1 on entry and cleared to 0 only for ICMPv6
 *	informational messages. Returns a netfilter verdict.
 */
static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
			     int *related,  unsigned int hooknum,
			     struct ip_vs_iphdr *ipvsh)
{
	struct icmp6hdr	_icmph, *ic;
	struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
	struct ip_vs_conn *cp;
	struct ip_vs_protocol *pp;
	union nf_inet_addr snet;
	unsigned int offset;

	*related = 1;
	ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph);
	if (ic == NULL)
		return NF_DROP;

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
		*related = 0;
		return NF_ACCEPT;
	}
	/* Fragment header that is before ICMP header tells us that:
	 * it's not an error message since they can't be fragmented.
	 */
	if (ipvsh->flags & IP6_FH_F_FRAG)
		return NF_DROP;

	IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
		  &ipvsh->saddr, &ipvsh->daddr);

	if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, ipvsh->len + sizeof(_icmph),
				     true, &ciph))
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	pp = ip_vs_proto_get(ciph.protocol);
	if (!pp)
		return NF_ACCEPT;

	/* The embedded headers contain source and dest in reverse order */
	cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
			     ipvs, AF_INET6, skb, &ciph);
	if (!cp)
		return NF_ACCEPT;

	snet.in6 = ciph.saddr.in6;
	offset = ciph.len;
	return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
				    pp, offset, sizeof(struct ipv6hdr),
				    hooknum);
}
#endif
1048
1049	/*
1050	* Check if sctp chunc is ABORT chunk
1051	*/
1052	static inline int is_sctp_abort(const struct sk_buff skb, int* nh_len)
1053	{
1054	struct sctp_chunkhdr *sch, schunk;
1055	sch = skb_header_pointer(skb, offset: nh_len + sizeof(struct sctphdr),
1056	len: sizeof(schunk), buffer: &schunk);
1057	if (sch == NULL)
1058	return `0`;
1059	if (sch->type == SCTP_CID_ABORT)
1060	return `1`;
1061	return `0`;
1062	}
1063
1064	static inline int is_tcp_reset(const struct sk_buff skb, int* nh_len)
1065	{
1066	struct tcphdr _tcph, *th;
1067
1068	th = skb_header_pointer(skb, offset: nh_len, len: sizeof(_tcph), buffer: &_tcph);
1069	if (th == NULL)
1070	return `0`;
1071	return th->rst;
1072	}
1073
1074	static inline bool is_new_conn(const struct sk_buff *skb,
1075	struct ip_vs_iphdr *iph)
1076	{
1077	switch (iph->protocol) {
1078	case IPPROTO_TCP: {
1079	struct tcphdr _tcph, *th;
1080
1081	th = skb_header_pointer(skb, offset: iph->len, len: sizeof(_tcph), buffer: &_tcph);
1082	if (th == NULL)
1083	return false;
1084	return th->syn;
1085	}
1086	case IPPROTO_SCTP: {
1087	struct sctp_chunkhdr *sch, schunk;
1088
1089	sch = skb_header_pointer(skb, offset: iph->len + sizeof(struct sctphdr),
1090	len: sizeof(schunk), buffer: &schunk);
1091	if (sch == NULL)
1092	return false;
1093	return sch->type == SCTP_CID_INIT;
1094	}
1095	default:
1096	return false;
1097	}
1098	}
1099
1100	static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
1101	int conn_reuse_mode)
1102	{
1103	/ Controlled (FTP DATA or persistence)? /
1104	if (cp->control)
1105	return false;
1106
1107	switch (cp->protocol) {
1108	case IPPROTO_TCP:
1109	return (cp->state == IP_VS_TCP_S_TIME_WAIT) \|\|
1110	(cp->state == IP_VS_TCP_S_CLOSE) \|\|
1111	((conn_reuse_mode & `2`) &&
1112	(cp->state == IP_VS_TCP_S_FIN_WAIT) &&
1113	(cp->flags & IP_VS_CONN_F_NOOUTPUT));
1114	case IPPROTO_SCTP:
1115	return cp->state == IP_VS_SCTP_S_CLOSED;
1116	default:
1117	return false;
1118	}
1119	}
1120
1121	/ Generic function to create new connections for outgoing RS packets*
1122	*
1123	* Pre-requisites for successful connection creation:
1124	* 1) Virtual Service is NOT fwmark based:
1125	* In fwmark-VS actual vaddr and vport are unknown to IPVS
1126	* 2) Real Server and Virtual Service were NOT configured without port:
1127	* This is to allow match of different VS to the same RS ip-addr
1128	*/
1129	struct ip_vs_conn ip_vs_new_conn_out(struct* ip_vs_service *svc,
1130	struct ip_vs_dest *dest,
1131	struct sk_buff *skb,
1132	const struct ip_vs_iphdr *iph,
1133	__be16 dport,
1134	__be16 cport)
1135	{
1136	struct ip_vs_conn_param param;
1137	struct ip_vs_conn ct = NULL, cp = NULL;
1138	const union nf_inet_addr vaddr, daddr, *caddr;
1139	union nf_inet_addr snet;
1140	__be16 vport;
1141	unsigned int flags;
1142
1143	vaddr = &svc->addr;
1144	vport = svc->port;
1145	daddr = &iph->saddr;
1146	caddr = &iph->daddr;
1147
1148	/ check pre-requisites are satisfied /
1149	if (svc->fwmark)
1150	return NULL;
1151	if (!vport \|\| !dport)
1152	return NULL;
1153
1154	/ for persistent service first create connection template /
1155	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
1156	/ apply netmask the same way ingress-side does /
1157	#ifdef CONFIG_IP_VS_IPV6
1158	if (svc->af == AF_INET6)
1159	ipv6_addr_prefix(pfx: &snet.in6, addr: &caddr->in6,
1160	plen: (__force __u32)svc->netmask);
1161	else
1162	#endif
1163	snet.ip = caddr->ip & svc->netmask;
1164	/ fill params and create template if not existent /
1165	if (ip_vs_conn_fill_param_persist(svc, skb, protocol: iph->protocol,
1166	caddr: &snet, cport: `0`, vaddr,
1167	vport, p: &param) < `0`)
1168	return NULL;
1169	ct = ip_vs_ct_in_get(p: &param);
1170	/ check if template exists and points to the same dest /
1171	if (!ct \|\| !ip_vs_check_template(ct, cdest: dest)) {
1172	ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
1173	IP_VS_CONN_F_TEMPLATE, dest, `0`);
1174	if (!ct) {
1175	kfree(objp: param.pe_data);
1176	return NULL;
1177	}
1178	ct->timeout = svc->timeout;
1179	} else {
1180	kfree(objp: param.pe_data);
1181	}
1182	}
1183
1184	/ connection flags /
1185	flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
1186	iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : `0`;
1187	/ create connection /
1188	ip_vs_conn_fill_param(ipvs: svc->ipvs, af: svc->af, protocol: iph->protocol,
1189	caddr, cport, vaddr, vport, p: &param);
1190	cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, `0`);
1191	if (!cp) {
1192	if (ct)
1193	ip_vs_conn_put(ct);
1194	return NULL;
1195	}
1196	if (ct) {
1197	ip_vs_control_add(cp, ctl_cp: ct);
1198	ip_vs_conn_put(ct);
1199	}
1200	ip_vs_conn_stats(cp, svc);
1201
1202	/ return connection (will be used to handle outgoing packet) /
1203	IP_VS_DBG_BUF(`6`, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
1204	"d:%s:%u conn->flags:%X conn->refcnt:%d\n",
1205	ip_vs_fwd_tag(cp),
1206	IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
1207	IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
1208	IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
1209	cp->flags, refcount_read(&cp->refcnt));
1210	return cp;
1211	}
1212
1213	/ Handle outgoing packets which are considered requests initiated by*
1214	* real servers, so that subsequent responses from external client can be
1215	* routed to the right real server.
1216	* Used also for outgoing responses in OPS mode.
1217	*
1218	* Connection management is handled by persistent-engine specific callback.
1219	*/
1220	static struct ip_vs_conn __ip_vs_rs_conn_out(unsigned* int hooknum,
1221	struct netns_ipvs *ipvs,
1222	int af, struct sk_buff *skb,
1223	const struct ip_vs_iphdr *iph)
1224	{
1225	struct ip_vs_dest *dest;
1226	struct ip_vs_conn *cp = NULL;
1227	__be16 _ports[`2`], *pptr;
1228
1229	if (hooknum == NF_INET_LOCAL_IN)
1230	return NULL;
1231
1232	pptr = frag_safe_skb_hp(skb, offset: iph->len,
1233	len: sizeof(_ports), buffer: _ports);
1234	if (!pptr)
1235	return NULL;
1236
1237	dest = ip_vs_find_real_service(ipvs, af, protocol: iph->protocol,
1238	daddr: &iph->saddr, dport: pptr[`0`]);
1239	if (dest) {
1240	struct ip_vs_service *svc;
1241	struct ip_vs_pe *pe;
1242
1243	svc = rcu_dereference(dest->svc);
1244	if (svc) {
1245	pe = rcu_dereference(svc->pe);
1246	if (pe && pe->conn_out)
1247	cp = pe->conn_out(svc, dest, skb, iph,
1248	pptr[`0`], pptr[`1`]);
1249	}
1250	}
1251
1252	return cp;
1253	}
1254
1255	/ Handle response packets: rewrite addresses and send away...*
1256	*/
1257	static unsigned int
1258	handle_response(int af, struct sk_buff skb, struct* ip_vs_proto_data *pd,
1259	struct ip_vs_conn cp, struct* ip_vs_iphdr *iph,
1260	unsigned int hooknum)
1261	{
1262	struct ip_vs_protocol *pp = pd->pp;
1263
1264	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
1265	goto after_nat;
1266
1267	IP_VS_DBG_PKT(`11`, af, pp, skb, iph->off, "Outgoing packet");
1268
1269	if (skb_ensure_writable(skb, write_len: iph->len))
1270	goto drop;
1271
1272	/ mangle the packet /
1273	if (pp->snat_handler &&
1274	!SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
1275	goto drop;
1276
1277	#ifdef CONFIG_IP_VS_IPV6
1278	if (af == AF_INET6)
1279	ipv6_hdr(skb)->saddr = cp->vaddr.in6;
1280	else
1281	#endif
1282	{
1283	ip_hdr(skb)->saddr = cp->vaddr.ip;
1284	ip_send_check(ip: ip_hdr(skb));
1285	}
1286
1287	/*
1288	* nf_iterate does not expect change in the skb->dst->dev.
1289	* It looks like it is not fatal to enable this code for hooks
1290	* where our handlers are at the end of the chain list and
1291	* when all next handlers use skb->dst->dev and not outdev.
1292	* It will definitely route properly the inout NAT traffic
1293	* when multiple paths are used.
1294	*/
1295
1296	/ For policy routing, packets originating from this*
1297	* machine itself may be routed differently to packets
1298	* passing through. We want this packet to be routed as
1299	* if it came from this machine itself. So re-compute
1300	* the routing information.
1301	*/
1302	if (ip_vs_route_me_harder(ipvs: cp->ipvs, af, skb, hooknum))
1303	goto drop;
1304
1305	IP_VS_DBG_PKT(`10`, af, pp, skb, iph->off, "After SNAT");
1306
1307	after_nat:
1308	ip_vs_out_stats(cp, skb);
1309	ip_vs_set_state(cp, direction: IP_VS_DIR_OUTPUT, skb, pd);
1310	skb->ipvs_property = `1`;
1311	if (!(cp->flags & IP_VS_CONN_F_NFCT))
1312	ip_vs_notrack(skb);
1313	else
1314	ip_vs_update_conntrack(skb, cp, outin: `0`);
1315	ip_vs_conn_put(cp);
1316
1317	return NF_ACCEPT;
1318
1319	drop:
1320	ip_vs_conn_put(cp);
1321	kfree_skb(skb);
1322	return NF_STOLEN;
1323	}
1324
1325	/*
1326	* Check if outgoing packet belongs to the established ip_vs_conn.
1327	*/
1328	static unsigned int
1329	ip_vs_out_hook(void priv, struct* sk_buff skb, const* struct nf_hook_state *state)
1330	{
1331	struct netns_ipvs *ipvs = net_ipvs(net: state->net);
1332	unsigned int hooknum = state->hook;
1333	struct ip_vs_iphdr iph;
1334	struct ip_vs_protocol *pp;
1335	struct ip_vs_proto_data *pd;
1336	struct ip_vs_conn *cp;
1337	int af = state->pf;
1338	struct sock *sk;
1339
1340	/ Already marked as IPVS request or reply? /
1341	if (skb->ipvs_property)
1342	return NF_ACCEPT;
1343
1344	sk = skb_to_full_sk(skb);
1345	/ Bad... Do not break raw sockets /
1346	if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
1347	af == AF_INET)) {
1348
1349	if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
1350	return NF_ACCEPT;
1351	}
1352
1353	if (unlikely(!skb_dst(skb)))
1354	return NF_ACCEPT;
1355
1356	if (!ipvs->enable)
1357	return NF_ACCEPT;
1358
1359	ip_vs_fill_iph_skb(af, skb, inverse: false, iphdr: &iph);
1360	#ifdef CONFIG_IP_VS_IPV6
1361	if (af == AF_INET6) {
1362	if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1363	int related;
1364	int verdict = ip_vs_out_icmp_v6(ipvs, skb, related: &related,
1365	hooknum, ipvsh: &iph);
1366
1367	if (related)
1368	return verdict;
1369	}
1370	} else
1371	#endif
1372	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1373	int related;
1374	int verdict = ip_vs_out_icmp(ipvs, skb, related: &related, hooknum);
1375
1376	if (related)
1377	return verdict;
1378	}
1379
1380	pd = ip_vs_proto_data_get(ipvs, proto: iph.protocol);
1381	if (unlikely(!pd))
1382	return NF_ACCEPT;
1383	pp = pd->pp;
1384
1385	/ reassemble IP fragments /
1386	#ifdef CONFIG_IP_VS_IPV6
1387	if (af == AF_INET)
1388	#endif
1389	if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
1390	if (ip_vs_gather_frags(ipvs, skb,
1391	user: ip_vs_defrag_user(hooknum)))
1392	return NF_STOLEN;
1393
1394	ip_vs_fill_iph_skb(AF_INET, skb, inverse: false, iphdr: &iph);
1395	}
1396
1397	/*
1398	* Check if the packet belongs to an existing entry
1399	*/
1400	cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
1401	ipvs, af, skb, &iph);
1402
1403	if (likely(cp))
1404	return handle_response(af, skb, pd, cp, iph: &iph, hooknum);
1405
1406	/ Check for real-server-started requests /
1407	if (atomic_read(v: &ipvs->conn_out_counter)) {
1408	/ Currently only for UDP:*
1409	* connection oriented protocols typically use
1410	* ephemeral ports for outgoing connections, so
1411	* related incoming responses would not match any VS
1412	*/
1413	if (pp->protocol == IPPROTO_UDP) {
1414	cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, iph: &iph);
1415	if (likely(cp))
1416	return handle_response(af, skb, pd, cp, iph: &iph,
1417	hooknum);
1418	}
1419	}
1420
1421	if (sysctl_nat_icmp_send(ipvs) &&
1422	(pp->protocol == IPPROTO_TCP \|\|
1423	pp->protocol == IPPROTO_UDP \|\|
1424	pp->protocol == IPPROTO_SCTP)) {
1425	__be16 _ports[`2`], *pptr;
1426
1427	pptr = frag_safe_skb_hp(skb, offset: iph.len,
1428	len: sizeof(_ports), buffer: _ports);
1429	if (pptr == NULL)
1430	return NF_ACCEPT; / Not for me /
1431	if (ip_vs_has_real_service(ipvs, af, protocol: iph.protocol, daddr: &iph.saddr,
1432	dport: pptr[`0`])) {
1433	/*
1434	* Notify the real server: there is no
1435	* existing entry if it is not RST
1436	* packet or not TCP packet.
1437	*/
1438	if ((iph.protocol != IPPROTO_TCP &&
1439	iph.protocol != IPPROTO_SCTP)
1440	\|\| ((iph.protocol == IPPROTO_TCP
1441	&& !is_tcp_reset(skb, nh_len: iph.len))
1442	\|\| (iph.protocol == IPPROTO_SCTP
1443	&& !is_sctp_abort(skb,
1444	nh_len: iph.len)))) {
1445	#ifdef CONFIG_IP_VS_IPV6
1446	if (af == AF_INET6) {
1447	if (!skb->dev)
1448	skb->dev = ipvs->net->loopback_dev;
1449	icmpv6_send(skb,
1450	ICMPV6_DEST_UNREACH,
1451	ICMPV6_PORT_UNREACH,
1452	info: `0`);
1453	} else
1454	#endif
1455	icmp_send(skb_in: skb,
1456	ICMP_DEST_UNREACH,
1457	ICMP_PORT_UNREACH, info: `0`);
1458	return NF_DROP;
1459	}
1460	}
1461	}
1462
1463	IP_VS_DBG_PKT(`12`, af, pp, skb, iph.off,
1464	"ip_vs_out: packet continues traversal as normal");
1465	return NF_ACCEPT;
1466	}
1467
1468	static unsigned int
1469	ip_vs_try_to_schedule(struct netns_ipvs ipvs, int* af, struct sk_buff *skb,
1470	struct ip_vs_proto_data *pd,
1471	int verdict, struct* ip_vs_conn **cpp,
1472	struct ip_vs_iphdr *iph)
1473	{
1474	struct ip_vs_protocol *pp = pd->pp;
1475
1476	if (!iph->fragoffs) {
1477	/ No (second) fragments need to enter here, as nf_defrag_ipv6*
1478	* replayed fragment zero will already have created the cp
1479	*/
1480
1481	/ Schedule and create new connection entry into cpp /
1482	if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph))
1483	return `0`;
1484	}
1485
1486	if (unlikely(!*cpp)) {
1487	/ sorry, all this trouble for a no-hit :) /
1488	IP_VS_DBG_PKT(`12`, af, pp, skb, iph->off,
1489	"ip_vs_in: packet continues traversal as normal");
1490
1491	/ Fragment couldn't be mapped to a conn entry /
1492	if (iph->fragoffs)
1493	IP_VS_DBG_PKT(`7`, af, pp, skb, iph->off,
1494	"unhandled fragment");
1495
1496	*verdict = NF_ACCEPT;
1497	return `0`;
1498	}
1499
1500	return `1`;
1501	}
1502
1503	/ Check the UDP tunnel and return its header length /
1504	static int ipvs_udp_decap(struct netns_ipvs ipvs, struct* sk_buff *skb,
1505	unsigned int offset, __u16 af,
1506	const union nf_inet_addr daddr, __u8 proto)
1507	{
1508	struct udphdr _udph, *udph;
1509	struct ip_vs_dest *dest;
1510
1511	udph = skb_header_pointer(skb, offset, len: sizeof(_udph), buffer: &_udph);
1512	if (!udph)
1513	goto unk;
1514	offset += sizeof(struct udphdr);
1515	dest = ip_vs_find_tunnel(ipvs, af, daddr, tun_port: udph->dest);
1516	if (!dest)
1517	goto unk;
1518	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1519	struct guehdr _gueh, *gueh;
1520
1521	gueh = skb_header_pointer(skb, offset, len: sizeof(_gueh), buffer: &_gueh);
1522	if (!gueh)
1523	goto unk;
1524	if (gueh->control != `0` \|\| gueh->version != `0`)
1525	goto unk;
1526	/ Later we can support also IPPROTO_IPV6 /
1527	if (gueh->proto_ctype != IPPROTO_IPIP)
1528	goto unk;
1529	*proto = gueh->proto_ctype;
1530	return sizeof(struct udphdr) + sizeof(struct guehdr) +
1531	(gueh->hlen << `2`);
1532	}
1533
1534	unk:
1535	return `0`;
1536	}
1537
1538	/ Check the GRE tunnel and return its header length /
1539	static int ipvs_gre_decap(struct netns_ipvs ipvs, struct* sk_buff *skb,
1540	unsigned int offset, __u16 af,
1541	const union nf_inet_addr daddr, __u8 proto)
1542	{
1543	struct gre_base_hdr _greh, *greh;
1544	struct ip_vs_dest *dest;
1545
1546	greh = skb_header_pointer(skb, offset, len: sizeof(_greh), buffer: &_greh);
1547	if (!greh)
1548	goto unk;
1549	dest = ip_vs_find_tunnel(ipvs, af, daddr, tun_port: `0`);
1550	if (!dest)
1551	goto unk;
1552	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1553	__be16 type;
1554
1555	/ Only support version 0 and C (csum) /
1556	if ((greh->flags & ~GRE_CSUM) != `0`)
1557	goto unk;
1558	type = greh->protocol;
1559	/ Later we can support also IPPROTO_IPV6 /
1560	if (type != htons(ETH_P_IP))
1561	goto unk;
1562	*proto = IPPROTO_IPIP;
1563	return gre_calc_hlen(o_flags: gre_flags_to_tnl_flags(flags: greh->flags));
1564	}
1565
1566	unk:
1567	return `0`;
1568	}
1569
1570	/*
1571	* Handle ICMP messages in the outside-to-inside direction (incoming).
1572	* Find any that might be relevant, check against existing connections,
1573	* forward to the right destination host if relevant.
1574	* Currently handles error types - unreachable, quench, ttl exceeded.
1575	*/
1576	static int
1577	ip_vs_in_icmp(struct netns_ipvs ipvs, struct* sk_buff skb, int* *related,
1578	unsigned int hooknum)
1579	{
1580	struct iphdr *iph;
1581	struct icmphdr _icmph, *ic;
1582	struct iphdr _ciph, cih; /* The ip header contained within the ICMP /
1583	struct ip_vs_iphdr ciph;
1584	struct ip_vs_conn *cp;
1585	struct ip_vs_protocol *pp;
1586	struct ip_vs_proto_data *pd;
1587	unsigned int offset, offset2, ihl, verdict;
1588	bool tunnel, new_cp = false;
1589	union nf_inet_addr *raddr;
1590	char *outer_proto = "IPIP";
1591
1592	*related = `1`;
1593
1594	/ reassemble IP fragments /
1595	if (ip_is_fragment(iph: ip_hdr(skb))) {
1596	if (ip_vs_gather_frags(ipvs, skb, user: ip_vs_defrag_user(hooknum)))
1597	return NF_STOLEN;
1598	}
1599
1600	iph = ip_hdr(skb);
1601	offset = ihl = iph->ihl * `4`;
1602	ic = skb_header_pointer(skb, offset, len: sizeof(_icmph), buffer: &_icmph);
1603	if (ic == NULL)
1604	return NF_DROP;
1605
1606	IP_VS_DBG(`12`, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1607	ic->type, ntohs(icmp_id(ic)),
1608	&iph->saddr, &iph->daddr);
1609
1610	/*
1611	* Work through seeing if this is for us.
1612	* These checks are supposed to be in an order that means easy
1613	* things are checked first to speed up processing.... however
1614	* this means that some packets will manage to get a long way
1615	* down this stack and then be rejected, but that's life.
1616	*/
1617	if ((ic->type != ICMP_DEST_UNREACH) &&
1618	(ic->type != ICMP_SOURCE_QUENCH) &&
1619	(ic->type != ICMP_TIME_EXCEEDED)) {
1620	*related = `0`;
1621	return NF_ACCEPT;
1622	}
1623
1624	/ Now find the contained IP header /
1625	offset += sizeof(_icmph);
1626	cih = skb_header_pointer(skb, offset, len: sizeof(_ciph), buffer: &_ciph);
1627	if (cih == NULL)
1628	return NF_ACCEPT; / The packet looks wrong, ignore /
1629	raddr = (union nf_inet_addr *)&cih->daddr;
1630
1631	/ Special case for errors for IPIP/UDP/GRE tunnel packets /
1632	tunnel = false;
1633	if (cih->protocol == IPPROTO_IPIP) {
1634	struct ip_vs_dest *dest;
1635
1636	if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1637	return NF_ACCEPT;
1638	/ Error for our IPIP must arrive at LOCAL_IN /
1639	if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
1640	return NF_ACCEPT;
1641	dest = ip_vs_find_tunnel(ipvs, AF_INET, daddr: raddr, tun_port: `0`);
1642	/ Only for known tunnel /
1643	if (!dest \|\| dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
1644	return NF_ACCEPT;
1645	offset += cih->ihl * `4`;
1646	cih = skb_header_pointer(skb, offset, len: sizeof(_ciph), buffer: &_ciph);
1647	if (cih == NULL)
1648	return NF_ACCEPT; / The packet looks wrong, ignore /
1649	tunnel = true;
1650	} else if ((cih->protocol == IPPROTO_UDP \|\| / Can be UDP encap /
1651	cih->protocol == IPPROTO_GRE) && / Can be GRE encap /
1652	/ Error for our tunnel must arrive at LOCAL_IN /
1653	(skb_rtable(skb)->rt_flags & RTCF_LOCAL)) {
1654	__u8 iproto;
1655	int ulen;
1656
1657	/ Non-first fragment has no UDP/GRE header /
1658	if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1659	return NF_ACCEPT;
1660	offset2 = offset + cih->ihl * `4`;
1661	if (cih->protocol == IPPROTO_UDP) {
1662	ulen = ipvs_udp_decap(ipvs, skb, offset: offset2, AF_INET,
1663	daddr: raddr, proto: &iproto);
1664	outer_proto = "UDP";
1665	} else {
1666	ulen = ipvs_gre_decap(ipvs, skb, offset: offset2, AF_INET,
1667	daddr: raddr, proto: &iproto);
1668	outer_proto = "GRE";
1669	}
1670	if (ulen > `0`) {
1671	/ Skip IP and UDP/GRE tunnel headers /
1672	offset = offset2 + ulen;
1673	/ Now we should be at the original IP header /
1674	cih = skb_header_pointer(skb, offset, len: sizeof(_ciph),
1675	buffer: &_ciph);
1676	if (cih && cih->version == `4` && cih->ihl >= `5` &&
1677	iproto == IPPROTO_IPIP)
1678	tunnel = true;
1679	else
1680	return NF_ACCEPT;
1681	}
1682	}
1683
1684	pd = ip_vs_proto_data_get(ipvs, proto: cih->protocol);
1685	if (!pd)
1686	return NF_ACCEPT;
1687	pp = pd->pp;
1688
1689	/ Is the embedded protocol header present? /
1690	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1691	pp->dont_defrag))
1692	return NF_ACCEPT;
1693
1694	IP_VS_DBG_PKT(`11`, AF_INET, pp, skb, offset,
1695	"Checking incoming ICMP for");
1696
1697	offset2 = offset;
1698	ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, inverse: !tunnel, iphdr: &ciph);
1699	offset = ciph.len;
1700
1701	/ The embedded headers contain source and dest in reverse order.*
1702	* For IPIP/UDP/GRE tunnel this is error for request, not for reply.
1703	*/
1704	cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
1705	ipvs, AF_INET, skb, &ciph);
1706
1707	if (!cp) {
1708	int v;
1709
1710	if (tunnel \|\| !sysctl_schedule_icmp(ipvs))
1711	return NF_ACCEPT;
1712
1713	if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, verdict: &v, cpp: &cp, iph: &ciph))
1714	return v;
1715	new_cp = true;
1716	}
1717
1718	verdict = NF_DROP;
1719
1720	/ Ensure the checksum is correct /
1721	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, offset: ihl)) {
1722	/ Failed checksum! /
1723	IP_VS_DBG(`1`, "Incoming ICMP: failed checksum from %pI4!\n",
1724	&iph->saddr);
1725	goto out;
1726	}
1727
1728	if (tunnel) {
1729	__be32 info = ic->un.gateway;
1730	__u8 type = ic->type;
1731	__u8 code = ic->code;
1732
1733	/ Update the MTU /
1734	if (ic->type == ICMP_DEST_UNREACH &&
1735	ic->code == ICMP_FRAG_NEEDED) {
1736	struct ip_vs_dest *dest = cp->dest;
1737	u32 mtu = ntohs(ic->un.frag.mtu);
1738	__be16 frag_off = cih->frag_off;
1739
1740	/ Strip outer IP and ICMP, go to IPIP/UDP/GRE header /
1741	if (pskb_pull(skb, len: ihl + sizeof(_icmph)) == NULL)
1742	goto ignore_tunnel;
1743	offset2 -= ihl + sizeof(_icmph);
1744	skb_reset_network_header(skb);
1745	IP_VS_DBG(`12`, "ICMP for %s %pI4->%pI4: mtu=%u\n",
1746	outer_proto, &ip_hdr(skb)->saddr,
1747	&ip_hdr(skb)->daddr, mtu);
1748	ipv4_update_pmtu(skb, net: ipvs->net, mtu, oif: `0`, protocol: `0`);
1749	/ Client uses PMTUD? /
1750	if (!(frag_off & htons(IP_DF)))
1751	goto ignore_tunnel;
1752	/ Prefer the resulting PMTU /
1753	if (dest) {
1754	struct ip_vs_dest_dst *dest_dst;
1755
1756	dest_dst = rcu_dereference(dest->dest_dst);
1757	if (dest_dst)
1758	mtu = dst_mtu(dst: dest_dst->dst_cache);
1759	}
1760	if (mtu > `68` + sizeof(struct iphdr))
1761	mtu -= sizeof(struct iphdr);
1762	info = htonl(mtu);
1763	}
1764	/ Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of*
1765	* original request.
1766	*/
1767	if (pskb_pull(skb, len: offset2) == NULL)
1768	goto ignore_tunnel;
1769	skb_reset_network_header(skb);
1770	IP_VS_DBG(`12`, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
1771	&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1772	type, code, ntohl(info));
1773	icmp_send(skb_in: skb, type, code, info);
1774	/ ICMP can be shorter but anyways, account it /
1775	ip_vs_out_stats(cp, skb);
1776
1777	ignore_tunnel:
1778	consume_skb(skb);
1779	verdict = NF_STOLEN;
1780	goto out;
1781	}
1782
1783	/ do the statistics and put it back /
1784	ip_vs_in_stats(cp, skb);
1785	if (IPPROTO_TCP == cih->protocol \|\| IPPROTO_UDP == cih->protocol \|\|
1786	IPPROTO_SCTP == cih->protocol)
1787	offset += `2` * sizeof(__u16);
1788	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, iph: &ciph);
1789
1790	out:
1791	if (likely(!new_cp))
1792	__ip_vs_conn_put(cp);
1793	else
1794	ip_vs_conn_put(cp);
1795
1796	return verdict;
1797	}
1798
1799	#ifdef CONFIG_IP_VS_IPV6
1800	static int ip_vs_in_icmp_v6(struct netns_ipvs ipvs, struct* sk_buff *skb,
1801	int related, unsigned* int hooknum,
1802	struct ip_vs_iphdr *iph)
1803	{
1804	struct icmp6hdr _icmph, *ic;
1805	struct ip_vs_iphdr ciph = {.flags = `0`, .fragoffs = `0`};/Contained IP /
1806	struct ip_vs_conn *cp;
1807	struct ip_vs_protocol *pp;
1808	struct ip_vs_proto_data *pd;
1809	unsigned int offset, verdict;
1810	bool new_cp = false;
1811
1812	*related = `1`;
1813
1814	ic = frag_safe_skb_hp(skb, offset: iph->len, len: sizeof(_icmph), buffer: &_icmph);
1815	if (ic == NULL)
1816	return NF_DROP;
1817
1818	/*
1819	* Work through seeing if this is for us.
1820	* These checks are supposed to be in an order that means easy
1821	* things are checked first to speed up processing.... however
1822	* this means that some packets will manage to get a long way
1823	* down this stack and then be rejected, but that's life.
1824	*/
1825	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
1826	*related = `0`;
1827	return NF_ACCEPT;
1828	}
1829	/ Fragment header that is before ICMP header tells us that:*
1830	* it's not an error message since they can't be fragmented.
1831	*/
1832	if (iph->flags & IP6_FH_F_FRAG)
1833	return NF_DROP;
1834
1835	IP_VS_DBG(`8`, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
1836	ic->icmp6_type, ntohs(icmpv6_id(ic)),
1837	&iph->saddr, &iph->daddr);
1838
1839	offset = iph->len + sizeof(_icmph);
1840	if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb, offset, inverse: true, iphdr: &ciph))
1841	return NF_ACCEPT;
1842
1843	pd = ip_vs_proto_data_get(ipvs, proto: ciph.protocol);
1844	if (!pd)
1845	return NF_ACCEPT;
1846	pp = pd->pp;
1847
1848	/ Cannot handle fragmented embedded protocol /
1849	if (ciph.fragoffs)
1850	return NF_ACCEPT;
1851
1852	IP_VS_DBG_PKT(`11`, AF_INET6, pp, skb, offset,
1853	"Checking incoming ICMPv6 for");
1854
1855	/ The embedded headers contain source and dest in reverse order*
1856	* if not from localhost
1857	*/
1858	cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
1859	ipvs, AF_INET6, skb, &ciph);
1860
1861	if (!cp) {
1862	int v;
1863
1864	if (!sysctl_schedule_icmp(ipvs))
1865	return NF_ACCEPT;
1866
1867	if (!ip_vs_try_to_schedule(ipvs, AF_INET6, skb, pd, verdict: &v, cpp: &cp, iph: &ciph))
1868	return v;
1869
1870	new_cp = true;
1871	}
1872
1873	/ VS/TUN, VS/DR and LOCALNODE just let it go /
1874	if ((hooknum == NF_INET_LOCAL_OUT) &&
1875	(IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
1876	verdict = NF_ACCEPT;
1877	goto out;
1878	}
1879
1880	/ do the statistics and put it back /
1881	ip_vs_in_stats(cp, skb);
1882
1883	/ Need to mangle contained IPv6 header in ICMPv6 packet /
1884	offset = ciph.len;
1885	if (IPPROTO_TCP == ciph.protocol \|\| IPPROTO_UDP == ciph.protocol \|\|
1886	IPPROTO_SCTP == ciph.protocol)
1887	offset += `2` * sizeof(__u16); / Also mangle ports /
1888
1889	verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset, hooknum, iph: &ciph);
1890
1891	out:
1892	if (likely(!new_cp))
1893	__ip_vs_conn_put(cp);
1894	else
1895	ip_vs_conn_put(cp);
1896
1897	return verdict;
1898	}
1899	#endif
1900
1901
1902	/*
1903	* Check if it's for virtual services, look it up,
1904	* and send it on its way...
1905	*/
1906	static unsigned int
1907	ip_vs_in_hook(void priv, struct* sk_buff skb, const* struct nf_hook_state *state)
1908	{
1909	struct netns_ipvs *ipvs = net_ipvs(net: state->net);
1910	unsigned int hooknum = state->hook;
1911	struct ip_vs_iphdr iph;
1912	struct ip_vs_protocol *pp;
1913	struct ip_vs_proto_data *pd;
1914	struct ip_vs_conn *cp;
1915	int ret, pkts;
1916	struct sock *sk;
1917	int af = state->pf;
1918
1919	/ Already marked as IPVS request or reply? /
1920	if (skb->ipvs_property)
1921	return NF_ACCEPT;
1922
1923	/*
1924	* Big tappo:
1925	* - remote client: only PACKET_HOST
1926	* - route: used for struct net when skb->dev is unset
1927	*/
1928	if (unlikely((skb->pkt_type != PACKET_HOST &&
1929	hooknum != NF_INET_LOCAL_OUT) \|\|
1930	!skb_dst(skb))) {
1931	ip_vs_fill_iph_skb(af, skb, inverse: false, iphdr: &iph);
1932	IP_VS_DBG_BUF(`12`, "packet type=%d proto=%d daddr=%s"
1933	" ignored in hook %u\n",
1934	skb->pkt_type, iph.protocol,
1935	IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
1936	return NF_ACCEPT;
1937	}
1938	/ ipvs enabled in this netns ? /
1939	if (unlikely(sysctl_backup_only(ipvs) \|\| !ipvs->enable))
1940	return NF_ACCEPT;
1941
1942	ip_vs_fill_iph_skb(af, skb, inverse: false, iphdr: &iph);
1943
1944	/ Bad... Do not break raw sockets /
1945	sk = skb_to_full_sk(skb);
1946	if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
1947	af == AF_INET)) {
1948
1949	if (sk->sk_family == PF_INET && inet_test_bit(NODEFRAG, sk))
1950	return NF_ACCEPT;
1951	}
1952
1953	#ifdef CONFIG_IP_VS_IPV6
1954	if (af == AF_INET6) {
1955	if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1956	int related;
1957	int verdict = ip_vs_in_icmp_v6(ipvs, skb, related: &related,
1958	hooknum, iph: &iph);
1959
1960	if (related)
1961	return verdict;
1962	}
1963	} else
1964	#endif
1965	if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1966	int related;
1967	int verdict = ip_vs_in_icmp(ipvs, skb, related: &related,
1968	hooknum);
1969
1970	if (related)
1971	return verdict;
1972	}
1973
1974	/ Protocol supported? /
1975	pd = ip_vs_proto_data_get(ipvs, proto: iph.protocol);
1976	if (unlikely(!pd)) {
1977	/ The only way we'll see this packet again is if it's*
1978	* encapsulated, so mark it with ipvs_property=1 so we
1979	* skip it if we're ignoring tunneled packets
1980	*/
1981	if (sysctl_ignore_tunneled(ipvs))
1982	skb->ipvs_property = `1`;
1983
1984	return NF_ACCEPT;
1985	}
1986	pp = pd->pp;
1987	/*
1988	* Check if the packet belongs to an existing connection entry
1989	*/
1990	cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
1991	ipvs, af, skb, &iph);
1992
1993	if (!iph.fragoffs && is_new_conn(skb, iph: &iph) && cp) {
1994	int conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
1995	bool old_ct = false, resched = false;
1996
1997	if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
1998	unlikely(!atomic_read(&cp->dest->weight))) {
1999	resched = true;
2000	old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
2001	} else if (conn_reuse_mode &&
2002	is_new_conn_expected(cp, conn_reuse_mode)) {
2003	old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
2004	if (!atomic_read(v: &cp->n_control)) {
2005	resched = true;
2006	} else {
2007	/ Do not reschedule controlling connection*
2008	* that uses conntrack while it is still
2009	* referenced by controlled connection(s).
2010	*/
2011	resched = !old_ct;
2012	}
2013	}
2014
2015	if (resched) {
2016	if (!old_ct)
2017	cp->flags &= ~IP_VS_CONN_F_NFCT;
2018	if (!atomic_read(v: &cp->n_control))
2019	ip_vs_conn_expire_now(cp);
2020	__ip_vs_conn_put(cp);
2021	if (old_ct)
2022	return NF_DROP;
2023	cp = NULL;
2024	}
2025	}
2026
2027	/ Check the server status /
2028	if (cp && cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
2029	/ the destination server is not available /
2030	if (sysctl_expire_nodest_conn(ipvs)) {
2031	bool old_ct = ip_vs_conn_uses_old_conntrack(cp, skb);
2032
2033	if (!old_ct)
2034	cp->flags &= ~IP_VS_CONN_F_NFCT;
2035
2036	ip_vs_conn_expire_now(cp);
2037	__ip_vs_conn_put(cp);
2038	if (old_ct)
2039	return NF_DROP;
2040	cp = NULL;
2041	} else {
2042	__ip_vs_conn_put(cp);
2043	return NF_DROP;
2044	}
2045	}
2046
2047	if (unlikely(!cp)) {
2048	int v;
2049
2050	if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, verdict: &v, cpp: &cp, iph: &iph))
2051	return v;
2052	}
2053
2054	IP_VS_DBG_PKT(`11`, af, pp, skb, iph.off, "Incoming packet");
2055
2056	ip_vs_in_stats(cp, skb);
2057	ip_vs_set_state(cp, direction: IP_VS_DIR_INPUT, skb, pd);
2058	if (cp->packet_xmit)
2059	ret = cp->packet_xmit(skb, cp, pp, &iph);
2060	/ do not touch skb anymore /
2061	else {
2062	IP_VS_DBG_RL("warning: packet_xmit is null");
2063	ret = NF_ACCEPT;
2064	}
2065
2066	/ Increase its packet counter and check if it is needed*
2067	* to be synchronized
2068	*
2069	* Sync connection if it is about to close to
2070	* encorage the standby servers to update the connections timeout
2071	*
2072	* For ONE_PKT let ip_vs_sync_conn() do the filter work.
2073	*/
2074
2075	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
2076	pkts = sysctl_sync_threshold(ipvs);
2077	else
2078	pkts = atomic_inc_return(v: &cp->in_pkts);
2079
2080	if (ipvs->sync_state & IP_VS_STATE_MASTER)
2081	ip_vs_sync_conn(ipvs, cp, pkts);
2082	else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
2083	/ increment is done inside ip_vs_sync_conn too /
2084	atomic_inc(v: &cp->control->in_pkts);
2085
2086	ip_vs_conn_put(cp);
2087	return ret;
2088	}
2089
2090	/*
2091	* It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
2092	* related packets destined for 0.0.0.0/0.
2093	* When fwmark-based virtual service is used, such as transparent
2094	* cache cluster, TCP packets can be marked and routed to ip_vs_in,
2095	* but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
2096	* sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
2097	* and send them to ip_vs_in_icmp.
2098	*/
2099	static unsigned int
2100	ip_vs_forward_icmp(void priv, struct* sk_buff *skb,
2101	const struct nf_hook_state *state)
2102	{
2103	struct netns_ipvs *ipvs = net_ipvs(net: state->net);
2104	int r;
2105
2106	/ ipvs enabled in this netns ? /
2107	if (unlikely(sysctl_backup_only(ipvs) \|\| !ipvs->enable))
2108	return NF_ACCEPT;
2109
2110	if (state->pf == NFPROTO_IPV4) {
2111	if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
2112	return NF_ACCEPT;
2113	#ifdef CONFIG_IP_VS_IPV6
2114	} else {
2115	struct ip_vs_iphdr iphdr;
2116
2117	ip_vs_fill_iph_skb(AF_INET6, skb, inverse: false, iphdr: &iphdr);
2118
2119	if (iphdr.protocol != IPPROTO_ICMPV6)
2120	return NF_ACCEPT;
2121
2122	return ip_vs_in_icmp_v6(ipvs, skb, related: &r, hooknum: state->hook, iph: &iphdr);
2123	#endif
2124	}
2125
2126	return ip_vs_in_icmp(ipvs, skb, related: &r, hooknum: state->hook);
2127	}
2128
2129	static const struct nf_hook_ops ip_vs_ops4[] = {
2130	/ After packet filtering, change source only for VS/NAT /
2131	{
2132	.hook = ip_vs_out_hook,
2133	.pf = NFPROTO_IPV4,
2134	.hooknum = NF_INET_LOCAL_IN,
2135	.priority = NF_IP_PRI_NAT_SRC - `2`,
2136	},
2137	/ After packet filtering, forward packet through VS/DR, VS/TUN,*
2138	* or VS/NAT(change destination), so that filtering rules can be
2139	* applied to IPVS. */
2140	{
2141	.hook = ip_vs_in_hook,
2142	.pf = NFPROTO_IPV4,
2143	.hooknum = NF_INET_LOCAL_IN,
2144	.priority = NF_IP_PRI_NAT_SRC - `1`,
2145	},
2146	/ Before ip_vs_in, change source only for VS/NAT /
2147	{
2148	.hook = ip_vs_out_hook,
2149	.pf = NFPROTO_IPV4,
2150	.hooknum = NF_INET_LOCAL_OUT,
2151	.priority = NF_IP_PRI_NAT_DST + `1`,
2152	},
2153	/ After mangle, schedule and forward local requests /
2154	{
2155	.hook = ip_vs_in_hook,
2156	.pf = NFPROTO_IPV4,
2157	.hooknum = NF_INET_LOCAL_OUT,
2158	.priority = NF_IP_PRI_NAT_DST + `2`,
2159	},
2160	/ After packet filtering (but before ip_vs_out_icmp), catch icmp*
2161	* destined for 0.0.0.0/0, which is for incoming IPVS connections */
2162	{
2163	.hook = ip_vs_forward_icmp,
2164	.pf = NFPROTO_IPV4,
2165	.hooknum = NF_INET_FORWARD,
2166	.priority = `99`,
2167	},
2168	/ After packet filtering, change source only for VS/NAT /
2169	{
2170	.hook = ip_vs_out_hook,
2171	.pf = NFPROTO_IPV4,
2172	.hooknum = NF_INET_FORWARD,
2173	.priority = `100`,
2174	},
2175	};
2176
#ifdef CONFIG_IP_VS_IPV6
/*
 *	IPv6 netfilter hooks used by IPVS; same layout as the IPv4 table,
 *	using the NF_IP6_PRI_* priorities. Only built when
 *	CONFIG_IP_VS_IPV6 is set.
 */
static const struct nf_hook_ops ip_vs_ops6[] = {
	/* LOCAL_IN: after packet filtering, change source only for VS/NAT */
	{
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_NAT_SRC - 2,
		.hook		= ip_vs_out_hook,
	},
	/* LOCAL_IN: after packet filtering, forward packet through VS/DR,
	 * VS/TUN, or VS/NAT (change destination), so that filtering rules
	 * can be applied to IPVS.
	 */
	{
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP6_PRI_NAT_SRC - 1,
		.hook		= ip_vs_in_hook,
	},
	/* LOCAL_OUT: before ip_vs_in, change source only for VS/NAT */
	{
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP6_PRI_NAT_DST + 1,
		.hook		= ip_vs_out_hook,
	},
	/* LOCAL_OUT: after mangle, schedule and forward local requests */
	{
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP6_PRI_NAT_DST + 2,
		.hook		= ip_vs_in_hook,
	},
	/* FORWARD: after packet filtering (but before ip_vs_out_icmp),
	 * catch icmp destined for 0.0.0.0/0, which is for incoming IPVS
	 * connections.
	 */
	{
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 99,
		.hook		= ip_vs_forward_icmp,
	},
	/* FORWARD: after packet filtering, change source only for VS/NAT */
	{
		.pf		= NFPROTO_IPV6,
		.hooknum	= NF_INET_FORWARD,
		.priority	= 100,
		.hook		= ip_vs_out_hook,
	},
};
#endif
2226
2227	int ip_vs_register_hooks(struct netns_ipvs ipvs, unsigned* int af)
2228	{
2229	const struct nf_hook_ops *ops;
2230	unsigned int count;
2231	unsigned int afmask;
2232	int ret = `0`;
2233
2234	if (af == AF_INET6) {
2235	#ifdef CONFIG_IP_VS_IPV6
2236	ops = ip_vs_ops6;
2237	count = ARRAY_SIZE(ip_vs_ops6);
2238	afmask = `2`;
2239	#else
2240	return -EINVAL;
2241	#endif
2242	} else {
2243	ops = ip_vs_ops4;
2244	count = ARRAY_SIZE(ip_vs_ops4);
2245	afmask = `1`;
2246	}
2247
2248	if (!(ipvs->hooks_afmask & afmask)) {
2249	ret = nf_register_net_hooks(net: ipvs->net, reg: ops, n: count);
2250	if (ret >= `0`)
2251	ipvs->hooks_afmask \|= afmask;
2252	}
2253	return ret;
2254	}
2255
2256	void ip_vs_unregister_hooks(struct netns_ipvs ipvs, unsigned* int af)
2257	{
2258	const struct nf_hook_ops *ops;
2259	unsigned int count;
2260	unsigned int afmask;
2261
2262	if (af == AF_INET6) {
2263	#ifdef CONFIG_IP_VS_IPV6
2264	ops = ip_vs_ops6;
2265	count = ARRAY_SIZE(ip_vs_ops6);
2266	afmask = `2`;
2267	#else
2268	return;
2269	#endif
2270	} else {
2271	ops = ip_vs_ops4;
2272	count = ARRAY_SIZE(ip_vs_ops4);
2273	afmask = `1`;
2274	}
2275
2276	if (ipvs->hooks_afmask & afmask) {
2277	nf_unregister_net_hooks(net: ipvs->net, reg: ops, n: count);
2278	ipvs->hooks_afmask &= ~afmask;
2279	}
2280	}
2281
2282	/*
2283	* Initialize IP Virtual Server netns mem.
2284	*/
2285	static int __net_init __ip_vs_init(struct net *net)
2286	{
2287	struct netns_ipvs *ipvs;
2288
2289	ipvs = net_generic(net, id: ip_vs_net_id);
2290	if (ipvs == NULL)
2291	return -ENOMEM;
2292
2293	/ Hold the beast until a service is registered /
2294	ipvs->enable = `0`;
2295	ipvs->net = net;
2296	/ Counters used for creating unique names /
2297	ipvs->gen = atomic_read(v: &ipvs_netns_cnt);
2298	atomic_inc(v: &ipvs_netns_cnt);
2299	net->ipvs = ipvs;
2300
2301	if (ip_vs_estimator_net_init(ipvs) < `0`)
2302	goto estimator_fail;
2303
2304	if (ip_vs_control_net_init(ipvs) < `0`)
2305	goto control_fail;
2306
2307	if (ip_vs_protocol_net_init(ipvs) < `0`)
2308	goto protocol_fail;
2309
2310	if (ip_vs_app_net_init(ipvs) < `0`)
2311	goto app_fail;
2312
2313	if (ip_vs_conn_net_init(ipvs) < `0`)
2314	goto conn_fail;
2315
2316	if (ip_vs_sync_net_init(ipvs) < `0`)
2317	goto sync_fail;
2318
2319	return `0`;
2320	/*
2321	* Error handling
2322	*/
2323
2324	sync_fail:
2325	ip_vs_conn_net_cleanup(ipvs);
2326	conn_fail:
2327	ip_vs_app_net_cleanup(ipvs);
2328	app_fail:
2329	ip_vs_protocol_net_cleanup(ipvs);
2330	protocol_fail:
2331	ip_vs_control_net_cleanup(ipvs);
2332	control_fail:
2333	ip_vs_estimator_net_cleanup(ipvs);
2334	estimator_fail:
2335	net->ipvs = NULL;
2336	return -ENOMEM;
2337	}
2338
2339	static void __net_exit __ip_vs_cleanup_batch(struct list_head *net_list)
2340	{
2341	struct netns_ipvs *ipvs;
2342	struct net *net;
2343
2344	ip_vs_service_nets_cleanup(net_list); / ip_vs_flush() with locks /
2345	list_for_each_entry(net, net_list, exit_list) {
2346	ipvs = net_ipvs(net);
2347	ip_vs_conn_net_cleanup(ipvs);
2348	ip_vs_app_net_cleanup(ipvs);
2349	ip_vs_protocol_net_cleanup(ipvs);
2350	ip_vs_control_net_cleanup(ipvs);
2351	ip_vs_estimator_net_cleanup(ipvs);
2352	IP_VS_DBG(`2`, "ipvs netns %d released\n", ipvs->gen);
2353	net->ipvs = NULL;
2354	}
2355	}
2356
2357	static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
2358	{
2359	struct netns_ipvs *ipvs;
2360	struct net *net;
2361
2362	list_for_each_entry(net, net_list, exit_list) {
2363	ipvs = net_ipvs(net);
2364	ip_vs_unregister_hooks(ipvs, AF_INET);
2365	ip_vs_unregister_hooks(ipvs, AF_INET6);
2366	ipvs->enable = `0`; / Disable packet reception /
2367	smp_wmb();
2368	ip_vs_sync_net_cleanup(ipvs);
2369	}
2370	}
2371
2372	static struct pernet_operations ipvs_core_ops = {
2373	.init = __ip_vs_init,
2374	.exit_batch = __ip_vs_cleanup_batch,
2375	.id = &ip_vs_net_id,
2376	.size = sizeof(struct netns_ipvs),
2377	};
2378
2379	static struct pernet_operations ipvs_core_dev_ops = {
2380	.exit_batch = __ip_vs_dev_cleanup_batch,
2381	};
2382
2383	/*
2384	* Initialize IP Virtual Server
2385	*/
2386	static int __init ip_vs_init(void)
2387	{
2388	int ret;
2389
2390	ret = ip_vs_control_init();
2391	if (ret < `0`) {
2392	pr_err("can't setup control.\n");
2393	goto exit;
2394	}
2395
2396	ip_vs_protocol_init();
2397
2398	ret = ip_vs_conn_init();
2399	if (ret < `0`) {
2400	pr_err("can't setup connection table.\n");
2401	goto cleanup_protocol;
2402	}
2403
2404	ret = register_pernet_subsys(&ipvs_core_ops); / Alloc ip_vs struct /
2405	if (ret < `0`)
2406	goto cleanup_conn;
2407
2408	ret = register_pernet_device(&ipvs_core_dev_ops);
2409	if (ret < `0`)
2410	goto cleanup_sub;
2411
2412	ret = ip_vs_register_nl_ioctl();
2413	if (ret < `0`) {
2414	pr_err("can't register netlink/ioctl.\n");
2415	goto cleanup_dev;
2416	}
2417
2418	pr_info("ipvs loaded.\n");
2419
2420	return ret;
2421
2422	cleanup_dev:
2423	unregister_pernet_device(&ipvs_core_dev_ops);
2424	cleanup_sub:
2425	unregister_pernet_subsys(&ipvs_core_ops);
2426	cleanup_conn:
2427	ip_vs_conn_cleanup();
2428	cleanup_protocol:
2429	ip_vs_protocol_cleanup();
2430	ip_vs_control_cleanup();
2431	exit:
2432	return ret;
2433	}
2434
2435	static void __exit ip_vs_cleanup(void)
2436	{
2437	ip_vs_unregister_nl_ioctl();
2438	unregister_pernet_device(&ipvs_core_dev_ops);
2439	unregister_pernet_subsys(&ipvs_core_ops); / free ip_vs struct /
2440	ip_vs_conn_cleanup();
2441	ip_vs_protocol_cleanup();
2442	ip_vs_control_cleanup();
2443	/ common rcu_barrier() used by:*
2444	* - ip_vs_control_cleanup()
2445	*/
2446	rcu_barrier();
2447	pr_info("ipvs unloaded.\n");
2448	}
2449
2450	module_init(ip_vs_init);
2451	module_exit(ip_vs_cleanup);
2452	MODULE_LICENSE("GPL");
2453	MODULE_DESCRIPTION("IP Virtual Server");
2454

source code of linux/net/netfilter/ipvs/ip_vs_core.c