1// SPDX-License-Identifier: GPL-2.0-only
2/* Connection state tracking for netfilter. This is separated from,
3 but required by, the NAT layer; it can also be used by an iptables
4 extension. */
5
6/* (C) 1999-2001 Paul `Rusty' Russell
7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
9 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
10 */
11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14#include <linux/types.h>
15#include <linux/netfilter.h>
16#include <linux/module.h>
17#include <linux/sched.h>
18#include <linux/skbuff.h>
19#include <linux/proc_fs.h>
20#include <linux/vmalloc.h>
21#include <linux/stddef.h>
22#include <linux/slab.h>
23#include <linux/random.h>
24#include <linux/siphash.h>
25#include <linux/err.h>
26#include <linux/percpu.h>
27#include <linux/moduleparam.h>
28#include <linux/notifier.h>
29#include <linux/kernel.h>
30#include <linux/netdevice.h>
31#include <linux/socket.h>
32#include <linux/mm.h>
33#include <linux/nsproxy.h>
34#include <linux/rculist_nulls.h>
35
36#include <net/netfilter/nf_conntrack.h>
37#include <net/netfilter/nf_conntrack_bpf.h>
38#include <net/netfilter/nf_conntrack_l4proto.h>
39#include <net/netfilter/nf_conntrack_expect.h>
40#include <net/netfilter/nf_conntrack_helper.h>
41#include <net/netfilter/nf_conntrack_core.h>
42#include <net/netfilter/nf_conntrack_extend.h>
43#include <net/netfilter/nf_conntrack_acct.h>
44#include <net/netfilter/nf_conntrack_ecache.h>
45#include <net/netfilter/nf_conntrack_zones.h>
46#include <net/netfilter/nf_conntrack_timestamp.h>
47#include <net/netfilter/nf_conntrack_timeout.h>
48#include <net/netfilter/nf_conntrack_labels.h>
49#include <net/netfilter/nf_conntrack_synproxy.h>
50#include <net/netfilter/nf_nat.h>
51#include <net/netfilter/nf_nat_helper.h>
52#include <net/netns/hash.h>
53#include <net/ip.h>
54
55#include "nf_internals.h"
56
57__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
58EXPORT_SYMBOL_GPL(nf_conntrack_locks);
59
60__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
61EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
62
63struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
64EXPORT_SYMBOL_GPL(nf_conntrack_hash);
65
66struct conntrack_gc_work {
67 struct delayed_work dwork;
68 u32 next_bucket;
69 u32 avg_timeout;
70 u32 count;
71 u32 start_time;
72 bool exiting;
73 bool early_drop;
74};
75
76static __read_mostly struct kmem_cache *nf_conntrack_cachep;
77static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
78static __read_mostly bool nf_conntrack_locks_all;
79
80/* serialize hash resizes and nf_ct_iterate_cleanup */
81static DEFINE_MUTEX(nf_conntrack_mutex);
82
83#define GC_SCAN_INTERVAL_MAX (60ul * HZ)
84#define GC_SCAN_INTERVAL_MIN (1ul * HZ)
85
86/* clamp timeouts to this value (TCP unacked) */
87#define GC_SCAN_INTERVAL_CLAMP (300ul * HZ)
88
/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wake up often just because we have three entries with a 1s timeout, while still
 * allowing non-idle machines to wake up more often when needed.
 */
93#define GC_SCAN_INITIAL_COUNT 100
94#define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX
95
96#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
97#define GC_SCAN_EXPIRED_MAX (64000u / HZ)
98
99#define MIN_CHAINLEN 50u
100#define MAX_CHAINLEN (80u - MIN_CHAINLEN)
101
102static struct conntrack_gc_work conntrack_gc_work;
103
104void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
105{
106 /* 1) Acquire the lock */
107 spin_lock(lock);
108
109 /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
110 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
111 */
112 if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
113 return;
114
115 /* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
126}
127EXPORT_SYMBOL_GPL(nf_conntrack_lock);
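
/* Illustrative sketch (not part of the build): a caller that only needs one
 * hash bucket takes the per-bucket lock through nf_conntrack_lock(); the slow
 * path above makes it serialize against a concurrent
 * nf_conntrack_all_lock()/nf_conntrack_all_unlock() pair, e.g.:
 *
 *	unsigned int b = some_bucket % CONNTRACK_LOCKS;	// 'some_bucket' is hypothetical
 *
 *	nf_conntrack_lock(&nf_conntrack_locks[b]);
 *	...	// walk or modify the chains guarded by bucket 'b'
 *	spin_unlock(&nf_conntrack_locks[b]);
 */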
128
static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}
137
138/* return true if we need to recompute hashes (in case hash table was resized) */
139static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
140 unsigned int h2, unsigned int sequence)
141{
142 h1 %= CONNTRACK_LOCKS;
143 h2 %= CONNTRACK_LOCKS;
144 if (h1 <= h2) {
145 nf_conntrack_lock(&nf_conntrack_locks[h1]);
146 if (h1 != h2)
147 spin_lock_nested(&nf_conntrack_locks[h2],
148 SINGLE_DEPTH_NESTING);
149 } else {
150 nf_conntrack_lock(&nf_conntrack_locks[h2]);
151 spin_lock_nested(&nf_conntrack_locks[h1],
152 SINGLE_DEPTH_NESTING);
153 }
154 if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
155 nf_conntrack_double_unlock(h1, h2);
156 return true;
157 }
158 return false;
159}
160
static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}
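
/* Illustrative sketch (not part of the build): the "all locks" pair is meant
 * for rare, global operations such as a hash resize, along the lines of:
 *
 *	nf_conntrack_all_lock();
 *	...	// move entries to the new table; no per-bucket holder can run
 *	nf_conntrack_all_unlock();
 *
 * Per-packet paths never take the global lock; they only observe it through
 * the nf_conntrack_locks_all flag checked in nf_conntrack_lock().
 */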
201
202unsigned int nf_conntrack_htable_size __read_mostly;
203EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
204
205unsigned int nf_conntrack_max __read_mostly;
206EXPORT_SYMBOL_GPL(nf_conntrack_max);
207seqcount_spinlock_t nf_conntrack_generation __read_mostly;
208static siphash_aligned_key_t nf_conntrack_hash_rnd;
209
210static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
211 unsigned int zoneid,
212 const struct net *net)
213{
214 siphash_key_t key;
215
216 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
217
218 key = nf_conntrack_hash_rnd;
219
220 key.key[0] ^= zoneid;
221 key.key[1] ^= net_hash_mix(net);
222
	return siphash((void *)tuple,
		       offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
		       &key);
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}
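
/* Illustrative sketch (not part of the build): reciprocal_scale() maps the
 * 32-bit siphash value h to a bucket without a modulo, roughly:
 *
 *	bucket = ((u64)h * (u64)nf_conntrack_htable_size) >> 32;
 *
 * so re-bucketing after a table resize only needs scale_hash() to be re-run
 * with the new size on the same raw hash value.
 */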
247
248static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
249 unsigned int dataoff,
250 struct nf_conntrack_tuple *tuple)
{
	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
258 if (!inet_hdr)
259 return false;
260
261 tuple->src.u.udp.port = inet_hdr->sport;
262 tuple->dst.u.udp.port = inet_hdr->dport;
263 return true;
264}
265
266static bool
267nf_ct_get_tuple(const struct sk_buff *skb,
268 unsigned int nhoff,
269 unsigned int dataoff,
270 u_int16_t l3num,
271 u_int8_t protonum,
272 struct net *net,
273 struct nf_conntrack_tuple *tuple)
274{
275 unsigned int size;
276 const __be32 *ap;
277 __be32 _addrs[8];
278
279 memset(tuple, 0, sizeof(*tuple));
280
281 tuple->src.l3num = l3num;
282 switch (l3num) {
283 case NFPROTO_IPV4:
284 nhoff += offsetof(struct iphdr, saddr);
285 size = 2 * sizeof(__be32);
286 break;
287 case NFPROTO_IPV6:
288 nhoff += offsetof(struct ipv6hdr, saddr);
289 size = sizeof(_addrs);
290 break;
291 default:
292 return true;
293 }
294
	ap = skb_header_pointer(skb, nhoff, size, _addrs);
296 if (!ap)
297 return false;
298
299 switch (l3num) {
300 case NFPROTO_IPV4:
301 tuple->src.u3.ip = ap[0];
302 tuple->dst.u3.ip = ap[1];
303 break;
304 case NFPROTO_IPV6:
305 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
306 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
307 break;
308 }
309
310 tuple->dst.protonum = protonum;
311 tuple->dst.dir = IP_CT_DIR_ORIGINAL;
312
313 switch (protonum) {
314#if IS_ENABLED(CONFIG_IPV6)
315 case IPPROTO_ICMPV6:
316 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
317#endif
318 case IPPROTO_ICMP:
319 return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
320#ifdef CONFIG_NF_CT_PROTO_GRE
321 case IPPROTO_GRE:
322 return gre_pkt_to_tuple(skb, dataoff, net, tuple);
323#endif
324 case IPPROTO_TCP:
325 case IPPROTO_UDP:
326#ifdef CONFIG_NF_CT_PROTO_UDPLITE
327 case IPPROTO_UDPLITE:
328#endif
329#ifdef CONFIG_NF_CT_PROTO_SCTP
330 case IPPROTO_SCTP:
331#endif
332#ifdef CONFIG_NF_CT_PROTO_DCCP
333 case IPPROTO_DCCP:
334#endif
335 /* fallthrough */
336 return nf_ct_get_tuple_ports(skb, dataoff, tuple);
337 default:
338 break;
339 }
340
341 return true;
342}
343
344static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
345 u_int8_t *protonum)
346{
347 int dataoff = -1;
348 const struct iphdr *iph;
349 struct iphdr _iph;
350
	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
352 if (!iph)
353 return -1;
354
355 /* Conntrack defragments packets, we might still see fragments
356 * inside ICMP packets though.
357 */
358 if (iph->frag_off & htons(IP_OFFSET))
359 return -1;
360
361 dataoff = nhoff + (iph->ihl << 2);
362 *protonum = iph->protocol;
363
364 /* Check bogus IP headers */
365 if (dataoff > skb->len) {
366 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
367 nhoff, iph->ihl << 2, skb->len);
368 return -1;
369 }
370 return dataoff;
371}
372
373#if IS_ENABLED(CONFIG_IPV6)
374static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
375 u8 *protonum)
376{
377 int protoff = -1;
378 unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
379 __be16 frag_off;
380 u8 nexthdr;
381
	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}
396
397 *protonum = nexthdr;
398 return protoff;
399}
400#endif
401
402static int get_l4proto(const struct sk_buff *skb,
403 unsigned int nhoff, u8 pf, u8 *l4num)
404{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
412 default:
413 *l4num = 0;
414 break;
415 }
416 return -1;
417}
418
419bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
420 u_int16_t l3num,
421 struct net *net, struct nf_conntrack_tuple *tuple)
422{
423 u8 protonum;
424 int protoff;
425
	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
431}
432EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
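
/* Illustrative sketch (not part of the build): a hypothetical caller that
 * extracts the original-direction tuple of an IPv4 skb:
 *
 *	struct nf_conntrack_tuple t;
 *
 *	if (nf_ct_get_tuplepr(skb, skb_network_offset(skb),
 *			      NFPROTO_IPV4, dev_net(skb->dev), &t))
 *		pr_debug("l4 proto %u\n", t.dst.protonum);
 */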
433
434bool
435nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
436 const struct nf_conntrack_tuple *orig)
437{
438 memset(inverse, 0, sizeof(*inverse));
439
440 inverse->src.l3num = orig->src.l3num;
441
442 switch (orig->src.l3num) {
443 case NFPROTO_IPV4:
444 inverse->src.u3.ip = orig->dst.u3.ip;
445 inverse->dst.u3.ip = orig->src.u3.ip;
446 break;
447 case NFPROTO_IPV6:
448 inverse->src.u3.in6 = orig->dst.u3.in6;
449 inverse->dst.u3.in6 = orig->src.u3.in6;
450 break;
451 default:
452 break;
453 }
454
455 inverse->dst.dir = !orig->dst.dir;
456
457 inverse->dst.protonum = orig->dst.protonum;
458
	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}
467
468 inverse->src.u.all = orig->dst.u.all;
469 inverse->dst.u.all = orig->src.u.all;
470 return true;
471}
472EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
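
/* Worked example (illustrative only): for a TCP flow a.b.c.d:1024 -> w.x.y.z:80,
 * nf_ct_invert_tuple() yields the reply tuple w.x.y.z:80 -> a.b.c.d:1024.
 * Both tuples end up in ct->tuplehash[], so packets from either direction of
 * the flow hash to a tuplehash of the same conntrack entry.
 */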
473
/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location; we assume the id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
487u32 nf_ct_get_id(const struct nf_conn *ct)
488{
489 static siphash_aligned_key_t ct_id_seed;
490 unsigned long a, b, c, d;
491
492 net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
493
494 a = (unsigned long)ct;
495 b = (unsigned long)ct->master;
496 c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
502#else
503 return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
504#endif
505}
506EXPORT_SYMBOL_GPL(nf_ct_get_id);
507
508static void
509clean_from_lists(struct nf_conn *ct)
510{
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
513
514 /* Destroy all pending expectations */
515 nf_ct_remove_expectations(ct);
516}
517
518#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
519
520/* Released via nf_ct_destroy() */
521struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
522 const struct nf_conntrack_zone *zone,
523 gfp_t flags)
524{
525 struct nf_conn *tmpl, *p;
526
527 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
528 tmpl = kzalloc(size: sizeof(*tmpl) + NFCT_INFOMASK, flags);
529 if (!tmpl)
530 return NULL;
531
532 p = tmpl;
533 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
534 if (tmpl != p) {
535 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
536 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
537 }
538 } else {
539 tmpl = kzalloc(size: sizeof(*tmpl), flags);
540 if (!tmpl)
541 return NULL;
542 }
543
544 tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	refcount_set(&tmpl->ct_general.use, 1);
548
549 return tmpl;
550}
551EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
552
553void nf_ct_tmpl_free(struct nf_conn *tmpl)
554{
	kfree(tmpl->ext);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
561}
562EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
563
564static void destroy_gre_conntrack(struct nf_conn *ct)
565{
566#ifdef CONFIG_NF_CT_PROTO_GRE
567 struct nf_conn *master = ct->master;
568
569 if (master)
570 nf_ct_gre_keymap_destroy(ct: master);
571#endif
572}
573
574void nf_ct_destroy(struct nf_conntrack *nfct)
575{
576 struct nf_conn *ct = (struct nf_conn *)nfct;
577
578 WARN_ON(refcount_read(&nfct->use) != 0);
579
580 if (unlikely(nf_ct_is_template(ct))) {
581 nf_ct_tmpl_free(ct);
582 return;
583 }
584
585 if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
586 destroy_gre_conntrack(ct);
587
588 /* Expectations will have been removed in clean_from_lists,
589 * except TFTP can create an expectation on the first packet,
590 * before connection is in the list, so we need to clean here,
591 * too.
592 */
593 nf_ct_remove_expectations(ct);
594
595 if (ct->master)
596 nf_ct_put(ct: ct->master);
597
598 nf_conntrack_free(ct);
599}
600EXPORT_SYMBOL(nf_ct_destroy);
601
602static void __nf_ct_delete_from_lists(struct nf_conn *ct)
603{
604 struct net *net = nf_ct_net(ct);
605 unsigned int hash, reply_hash;
606 unsigned int sequence;
607
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);
620}
621
622static void nf_ct_delete_from_lists(struct nf_conn *ct)
623{
624 nf_ct_helper_destroy(ct);
625 local_bh_disable();
626
627 __nf_ct_delete_from_lists(ct);
628
629 local_bh_enable();
630}
631
632static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
633{
634#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

	spin_lock(&cnet->ecache.dying_lock);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &cnet->ecache.dying_list);
	spin_unlock(&cnet->ecache.dying_lock);
641#endif
642}
643
644bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
645{
646 struct nf_conn_tstamp *tstamp;
647 struct net *net;
648
	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_helper_destroy(ct);
		local_bh_disable();
		__nf_ct_delete_from_lists(ct);
		nf_ct_add_to_ecache_list(ct);
		local_bh_enable();

		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
681 return true;
682}
683EXPORT_SYMBOL_GPL(nf_ct_delete);
684
685static inline bool
686nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
687 const struct nf_conntrack_tuple *tuple,
688 const struct nf_conntrack_zone *zone,
689 const struct net *net)
690{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with the equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
700}
701
702static inline bool
703nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
704{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
712}
713
714/* caller must hold rcu readlock and none of the nf_conntrack_locks */
715static void nf_ct_gc_expired(struct nf_conn *ct)
716{
717 if (!refcount_inc_not_zero(r: &ct->ct_general.use))
718 return;
719
720 /* load ->status after refcount increase */
721 smp_acquire__after_ctrl_dep();
722
723 if (nf_ct_should_gc(ct))
724 nf_ct_kill(ct);
725
726 nf_ct_put(ct);
727}
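
/* Illustrative sketch (not part of the build): the pattern used here and in
 * the lookup paths below for SLAB_TYPESAFE_BY_RCU objects is, in outline:
 *
 *	if (!refcount_inc_not_zero(&ct->ct_general.use))
 *		return;				// object was freed or recycled
 *	smp_acquire__after_ctrl_dep();		// order later loads after the refcount check
 *	if (!still_the_object_we_wanted(ct))	// hypothetical re-check, e.g. nf_ct_key_equal()
 *		nf_ct_put(ct);			// recycled under us, drop the reference
 */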
728
729/*
730 * Warning :
731 * - Caller must take a reference on returned object
732 * and recheck nf_ct_tuple_equal(tuple, &h->tuple)
733 */
734static struct nf_conntrack_tuple_hash *
735____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
736 const struct nf_conntrack_tuple *tuple, u32 hash)
737{
738 struct nf_conntrack_tuple_hash *h;
739 struct hlist_nulls_head *ct_hash;
740 struct hlist_nulls_node *n;
741 unsigned int bucket, hsize;
742
743begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
751 if (nf_ct_is_expired(ct)) {
752 nf_ct_gc_expired(ct);
753 continue;
754 }
755
756 if (nf_ct_key_equal(h, tuple, zone, net))
757 return h;
758 }
759 /*
760 * if the nulls value we got at the end of this lookup is
761 * not the expected one, we must restart lookup.
762 * We probably met an item that was moved to another chain.
763 */
764 if (get_nulls_value(ptr: n) != bucket) {
765 NF_CT_STAT_INC_ATOMIC(net, search_restart);
766 goto begin;
767 }
768
769 return NULL;
770}
771
772/* Find a connection corresponding to a tuple. */
773static struct nf_conntrack_tuple_hash *
774__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
775 const struct nf_conntrack_tuple *tuple, u32 hash)
776{
777 struct nf_conntrack_tuple_hash *h;
778 struct nf_conn *ct;
779
780 h = ____nf_conntrack_find(net, zone, tuple, hash);
781 if (h) {
782 /* We have a candidate that matches the tuple we're interested
783 * in, try to obtain a reference and re-check tuple
784 */
785 ct = nf_ct_tuplehash_to_ctrack(hash: h);
786 if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
787 /* re-check key after refcount */
788 smp_acquire__after_ctrl_dep();
789
790 if (likely(nf_ct_key_equal(h, tuple, zone, net)))
791 return h;
792
793 /* TYPESAFE_BY_RCU recycled the candidate */
794 nf_ct_put(ct);
795 }
796
797 h = NULL;
798 }
799
800 return h;
801}
802
803struct nf_conntrack_tuple_hash *
804nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
805 const struct nf_conntrack_tuple *tuple)
806{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	rcu_read_lock();

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		goto out_unlock;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		thash = __nf_conntrack_find_get(net, zone, tuple,
						hash_conntrack_raw(tuple, rid, net));
822
823out_unlock:
824 rcu_read_unlock();
825 return thash;
826}
827EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
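
/* Illustrative sketch (not part of the build): a hypothetical lookup that
 * resolves a tuple to its confirmed conntrack and drops the reference when
 * done (the 'tuple' variable is assumed to be filled in by the caller):
 *
 *	struct nf_conntrack_tuple_hash *h;
 *
 *	h = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple);
 *	if (h) {
 *		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
 *
 *		...			// use ct; a reference is held
 *		nf_ct_put(ct);
 *	}
 */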
828
829static void __nf_conntrack_hash_insert(struct nf_conn *ct,
830 unsigned int hash,
831 unsigned int reply_hash)
832{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
837}
838
839static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
840{
841 /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
842 * may contain stale pointers to e.g. helper that has been removed.
843 *
844 * The helper can't clear this because the nf_conn object isn't in
845 * any hash and synchronize_rcu() isn't enough because associated skb
846 * might sit in a queue.
847 */
848 return !ext || ext->gen_id == atomic_read(v: &nf_conntrack_ext_genid);
849}
850
851static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
852{
853 if (!ext)
854 return true;
855
856 if (ext->gen_id != atomic_read(v: &nf_conntrack_ext_genid))
857 return false;
858
859 /* inserted into conntrack table, nf_ct_iterate_cleanup()
860 * will find it. Disable nf_ct_ext_find() id check.
861 */
862 WRITE_ONCE(ext->gen_id, 0);
863 return true;
864}
865
866int
867nf_conntrack_hash_check_insert(struct nf_conn *ct)
868{
869 const struct nf_conntrack_zone *zone;
870 struct net *net = nf_ct_net(ct);
871 unsigned int hash, reply_hash;
872 struct nf_conntrack_tuple_hash *h;
873 struct hlist_nulls_node *n;
874 unsigned int max_chainlen;
875 unsigned int chainlen = 0;
876 unsigned int sequence;
877 int err = -EEXIST;
878
879 zone = nf_ct_zone(ct);
880
	if (!nf_ct_ext_valid_pre(ct->ext))
		return -EAGAIN;

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
894
895 max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
896
897 /* See if there's one in the list already, including reverse */
898 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
899 if (nf_ct_key_equal(h, tuple: &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
900 zone, net))
901 goto out;
902
903 if (chainlen++ > max_chainlen)
904 goto chaintoolong;
905 }
906
907 chainlen = 0;
908
909 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
910 if (nf_ct_key_equal(h, tuple: &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
911 zone, net))
912 goto out;
913 if (chainlen++ > max_chainlen)
914 goto chaintoolong;
915 }
916
917 /* If genid has changed, we can't insert anymore because ct
918 * extensions could have stale pointers and nf_ct_iterate_destroy
919 * might have completed its table scan already.
920 *
921 * Increment of the ext genid right after this check is fine:
922 * nf_ct_iterate_destroy blocks until locks are released.
923 */
924 if (!nf_ct_ext_valid_post(ext: ct->ext)) {
925 err = -EAGAIN;
926 goto out;
927 }
928
929 smp_wmb();
930 /* The caller holds a reference to this object */
	refcount_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
934 NF_CT_STAT_INC(net, insert);
935 local_bh_enable();
936
937 return 0;
938chaintoolong:
939 NF_CT_STAT_INC(net, chaintoolong);
940 err = -ENOSPC;
941out:
942 nf_conntrack_double_unlock(h1: hash, h2: reply_hash);
943 local_bh_enable();
944 return err;
945}
946EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
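
/* Illustrative sketch (not part of the build): ctnetlink-style entry creation
 * uses this helper roughly as follows (error handling trimmed):
 *
 *	ct = nf_conntrack_alloc(net, zone, &orig, &repl, GFP_ATOMIC);
 *	...					// attach extensions, timeout, status bits
 *	err = nf_conntrack_hash_check_insert(ct);
 *	if (err < 0)				// -EEXIST, -ENOSPC or -EAGAIN, see above
 *		nf_ct_put(ct);
 */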
947
948void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
949 unsigned int bytes)
950{
951 struct nf_conn_acct *acct;
952
953 acct = nf_conn_acct_find(ct);
954 if (acct) {
955 struct nf_conn_counter *counter = acct->counter;
956
957 atomic64_add(i: packets, v: &counter[dir].packets);
958 atomic64_add(i: bytes, v: &counter[dir].bytes);
959 }
960}
961EXPORT_SYMBOL_GPL(nf_ct_acct_add);
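
/* Illustrative sketch (not part of the build): per-direction accounting is
 * typically bumped once per tracked packet, e.g.:
 *
 *	nf_ct_acct_add(ct, CTINFO2DIR(ctinfo), 1, skb->len);
 *
 * which is a no-op unless the acct extension was added to this conntrack.
 */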
962
963static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
964 const struct nf_conn *loser_ct)
965{
966 struct nf_conn_acct *acct;
967
968 acct = nf_conn_acct_find(ct: loser_ct);
969 if (acct) {
970 struct nf_conn_counter *counter = acct->counter;
971 unsigned int bytes;
972
973 /* u32 should be fine since we must have seen one packet. */
974 bytes = atomic64_read(v: &counter[CTINFO2DIR(ctinfo)].bytes);
975 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
976 }
977}
978
979static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
980{
981 struct nf_conn_tstamp *tstamp;
982
983 refcount_inc(r: &ct->ct_general.use);
984
985 /* set conntrack timestamp, if enabled. */
986 tstamp = nf_conn_tstamp_find(ct);
987 if (tstamp)
988 tstamp->start = ktime_get_real_ns();
989}
990
991/* caller must hold locks to prevent concurrent changes */
992static int __nf_ct_resolve_clash(struct sk_buff *skb,
993 struct nf_conntrack_tuple_hash *h)
994{
995 /* This is the conntrack entry already in hashes that won race. */
996 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash: h);
997 enum ip_conntrack_info ctinfo;
998 struct nf_conn *loser_ct;
999
1000 loser_ct = nf_ct_get(skb, ctinfo: &ctinfo);
1001
1002 if (nf_ct_is_dying(ct))
1003 return NF_DROP;
1004
1005 if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
1006 nf_ct_match(ct1: ct, ct2: loser_ct)) {
1007 struct net *net = nf_ct_net(ct);
1008
		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);
1014
1015 NF_CT_STAT_INC(net, clash_resolve);
1016 return NF_ACCEPT;
1017 }
1018
1019 return NF_DROP;
1020}
1021
1022/**
1023 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
1024 *
1025 * @skb: skb that causes the collision
1026 * @repl_idx: hash slot for reply direction
1027 *
 * Called when the origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has an identical tuple in both
 * directions.
1032 *
1033 * Caller must hold conntrack table locks to prevent concurrent updates.
1034 *
1035 * Returns NF_DROP if the clash could not be handled.
1036 */
1037static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
1038{
1039 struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
1040 const struct nf_conntrack_zone *zone;
1041 struct nf_conntrack_tuple_hash *h;
1042 struct hlist_nulls_node *n;
1043 struct net *net;
1044
	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
1056 }
1057
1058 /* We want the clashing entry to go away real soon: 1 second timeout. */
1059 WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);
1060
1061 /* IPS_NAT_CLASH removes the entry automatically on the first
1062 * reply. Also prevents UDP tracker from moving the entry to
1063 * ASSURED state, i.e. the entry can always be evicted under
1064 * pressure.
1065 */
1066 loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;
1067
	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table. This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);
1078
1079 NF_CT_STAT_INC(net, clash_resolve);
1080 return NF_ACCEPT;
1081}
1082
1083/**
1084 * nf_ct_resolve_clash - attempt to handle clash without packet drop
1085 *
1086 * @skb: skb that causes the clash
1087 * @h: tuplehash of the clashing entry already in table
1088 * @reply_hash: hash slot for reply direction
1089 *
1090 * A conntrack entry can be inserted to the connection tracking table
1091 * if there is no existing entry with an identical tuple.
1092 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
1094 * to be dropped. In case @skb is retransmitted, next conntrack lookup
1095 * will find the already-existing entry.
1096 *
1097 * The major problem with such packet drop is the extra delay added by
1098 * the packet loss -- it will take some time for a retransmit to occur
1099 * (or the sender to time out when waiting for a reply).
1100 *
1101 * This function attempts to handle the situation without packet drop.
1102 *
1103 * If @skb has no NAT transformation or if the colliding entries are
1104 * exactly the same, only the to-be-confirmed conntrack entry is discarded
1105 * and @skb is associated with the conntrack entry already in the table.
1106 *
1107 * Failing that, the new, unconfirmed conntrack is still added to the table
1108 * provided that the collision only occurs in the ORIGINAL direction.
1109 * The new entry will be added only in the non-clashing REPLY direction,
1110 * so packets in the ORIGINAL direction will continue to match the existing
1111 * entry. The new entry will also have a fixed timeout so it expires --
1112 * due to the collision, it will only see reply traffic.
1113 *
1114 * Returns NF_DROP if the clash could not be resolved.
1115 */
1116static __cold noinline int
1117nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
1118 u32 reply_hash)
1119{
1120 /* This is the conntrack entry already in hashes that won race. */
1121 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash: h);
1122 const struct nf_conntrack_l4proto *l4proto;
1123 enum ip_conntrack_info ctinfo;
1124 struct nf_conn *loser_ct;
1125 struct net *net;
1126 int ret;
1127
	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
1132 if (!l4proto->allow_clash)
1133 goto drop;
1134
1135 ret = __nf_ct_resolve_clash(skb, h);
1136 if (ret == NF_ACCEPT)
1137 return ret;
1138
1139 ret = nf_ct_resolve_clash_harder(skb, repl_idx: reply_hash);
1140 if (ret == NF_ACCEPT)
1141 return ret;
1142
1143drop:
1144 NF_CT_STAT_INC(net, drop);
1145 NF_CT_STAT_INC(net, insert_failed);
1146 return NF_DROP;
1147}
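
/* Worked example (illustrative only): a typical clash is two UDP packets of
 * the same new flow traversing nf_conntrack_in() on different CPUs.  Both
 * allocate an unconfirmed conntrack; the one confirming first wins, and the
 * loser ends up here.  With NAT involved, the harder path above may keep the
 * loser reachable via its (unique) reply tuple only, with IPS_NAT_CLASH set
 * and a one second timeout.
 */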
1148
1149/* Confirm a connection given skb; places it in hash table */
1150int
1151__nf_conntrack_confirm(struct sk_buff *skb)
1152{
1153 unsigned int chainlen = 0, sequence, max_chainlen;
1154 const struct nf_conntrack_zone *zone;
1155 unsigned int hash, reply_hash;
1156 struct nf_conntrack_tuple_hash *h;
1157 struct nf_conn *ct;
1158 struct nf_conn_help *help;
1159 struct hlist_nulls_node *n;
1160 enum ip_conntrack_info ctinfo;
1161 struct net *net;
1162 int ret = NF_DROP;
1163
	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);
1166
1167 /* ipt_REJECT uses nf_conntrack_attach to attach related
1168 ICMP/TCP RST packets in other direction. Actual packet
1169 which created connection will be IP_CT_NEW or for an
1170 expected connection, IP_CT_RELATED. */
1171 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
1172 return NF_ACCEPT;
1173
1174 zone = nf_ct_zone(ct);
1175 local_bh_disable();
1176
1177 do {
1178 sequence = read_seqcount_begin(&nf_conntrack_generation);
1179 /* reuse the hash saved before */
1180 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
1181 hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
1186
1187 /* We're not in hash table, and we refuse to set up related
1188 * connections for unconfirmed conns. But packet copies and
1189 * REJECT will give spurious warnings here.
1190 */
1191
	/* Another skb with the same unconfirmed conntrack may
	 * win the race. This can happen for bridge (br_flood)
	 * or broadcast/multicast packets, which do skb_clone with
	 * an unconfirmed conntrack.
	 */
1197 if (unlikely(nf_ct_is_confirmed(ct))) {
1198 WARN_ON_ONCE(1);
1199 nf_conntrack_double_unlock(h1: hash, h2: reply_hash);
1200 local_bh_enable();
1201 return NF_DROP;
1202 }
1203
1204 if (!nf_ct_ext_valid_pre(ext: ct->ext)) {
1205 NF_CT_STAT_INC(net, insert_failed);
1206 goto dying;
1207 }
1208
1209 /* We have to check the DYING flag after unlink to prevent
1210 * a race against nf_ct_get_next_corpse() possibly called from
1211 * user context, else we insert an already 'dead' hash, blocking
1212 * further use of that particular connection -JM.
1213 */
1214 ct->status |= IPS_CONFIRMED;
1215
1216 if (unlikely(nf_ct_is_dying(ct))) {
1217 NF_CT_STAT_INC(net, insert_failed);
1218 goto dying;
1219 }
1220
1221 max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
1222 /* See if there's one in the list already, including reverse:
1223 NAT could have grabbed it without realizing, since we're
1224 not in the hash. If there is, we lost race. */
1225 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
1226 if (nf_ct_key_equal(h, tuple: &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
1227 zone, net))
1228 goto out;
1229 if (chainlen++ > max_chainlen)
1230 goto chaintoolong;
1231 }
1232
1233 chainlen = 0;
1234 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
1235 if (nf_ct_key_equal(h, tuple: &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
1236 zone, net))
1237 goto out;
1238 if (chainlen++ > max_chainlen) {
1239chaintoolong:
1240 NF_CT_STAT_INC(net, chaintoolong);
1241 NF_CT_STAT_INC(net, insert_failed);
1242 ret = NF_DROP;
1243 goto dying;
1244 }
1245 }
1246
1247 /* Timer relative to confirmation time, not original
1248 setting time, otherwise we'd get timer wrap in
1249 weird delay cases. */
1250 ct->timeout += nfct_time_stamp;
1251
1252 __nf_conntrack_insert_prepare(ct);
1253
1254 /* Since the lookup is lockless, hash insertion must be done after
1255 * starting the timer and setting the CONFIRMED bit. The RCU barriers
1256 * guarantee that no other CPU can find the conntrack before the above
1257 * stores are visible.
1258 */
1259 __nf_conntrack_hash_insert(ct, hash, reply_hash);
1260 nf_conntrack_double_unlock(h1: hash, h2: reply_hash);
1261 local_bh_enable();
1262
	/* ext area is still valid (rcu read lock is held,
	 * but will go out of scope soon); we need to remove
	 * this conntrack again.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
1268 nf_ct_kill(ct);
1269 NF_CT_STAT_INC_ATOMIC(net, drop);
1270 return NF_DROP;
1271 }
1272
1273 help = nfct_help(ct);
1274 if (help && help->helper)
1275 nf_conntrack_event_cache(event: IPCT_HELPER, ct);
1276
1277 nf_conntrack_event_cache(master_ct(ct) ?
1278 IPCT_RELATED : IPCT_NEW, ct);
1279 return NF_ACCEPT;
1280
1281out:
1282 ret = nf_ct_resolve_clash(skb, h, reply_hash);
1283dying:
1284 nf_conntrack_double_unlock(h1: hash, h2: reply_hash);
1285 local_bh_enable();
1286 return ret;
1287}
1288EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
1289
1290/* Returns true if a connection corresponds to the tuple (required
1291 for NAT). */
1292int
1293nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
1294 const struct nf_conn *ignored_conntrack)
1295{
1296 struct net *net = nf_ct_net(ct: ignored_conntrack);
1297 const struct nf_conntrack_zone *zone;
1298 struct nf_conntrack_tuple_hash *h;
1299 struct hlist_nulls_head *ct_hash;
1300 unsigned int hash, hsize;
1301 struct hlist_nulls_node *n;
1302 struct nf_conn *ct;
1303
1304 zone = nf_ct_zone(ct: ignored_conntrack);
1305
1306 rcu_read_lock();
1307 begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
1313
1314 if (ct == ignored_conntrack)
1315 continue;
1316
1317 if (nf_ct_is_expired(ct)) {
1318 nf_ct_gc_expired(ct);
1319 continue;
1320 }
1321
1322 if (nf_ct_key_equal(h, tuple, zone, net)) {
1323 /* Tuple is taken already, so caller will need to find
1324 * a new source port to use.
1325 *
1326 * Only exception:
1327 * If the *original tuples* are identical, then both
1328 * conntracks refer to the same flow.
1329 * This is a rare situation, it can occur e.g. when
1330 * more than one UDP packet is sent from same socket
1331 * in different threads.
1332 *
1333 * Let nf_ct_resolve_clash() deal with this later.
1334 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
1338 continue;
1339
1340 NF_CT_STAT_INC_ATOMIC(net, found);
1341 rcu_read_unlock();
1342 return 1;
1343 }
1344 }
1345
1346 if (get_nulls_value(ptr: n) != hash) {
1347 NF_CT_STAT_INC_ATOMIC(net, search_restart);
1348 goto begin;
1349 }
1350
1351 rcu_read_unlock();
1352
1353 return 0;
1354}
1355EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
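
/* Illustrative sketch (not part of the build): the NAT port allocator probes
 * candidate reply tuples with this helper, along the lines of:
 *
 *	tuple.dst.u.udp.port = htons(port);	// 'port' is a hypothetical candidate
 *	if (!nf_conntrack_tuple_taken(&tuple, ct))
 *		break;				// unique, use this port
 */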
1356
1357#define NF_CT_EVICTION_RANGE 8
1358
1359/* There's a small race here where we may free a just-assured
1360 connection. Too bad: we're in trouble anyway. */
1361static unsigned int early_drop_list(struct net *net,
1362 struct hlist_nulls_head *head)
1363{
1364 struct nf_conntrack_tuple_hash *h;
1365 struct hlist_nulls_node *n;
1366 unsigned int drops = 0;
1367 struct nf_conn *tmp;
1368
1369 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!refcount_inc_not_zero(&tmp->ct_general.use))
1383 continue;
1384
1385 /* load ->ct_net and ->status after refcount increase */
1386 smp_acquire__after_ctrl_dep();
1387
1388 /* kill only if still in same netns -- might have moved due to
1389 * SLAB_TYPESAFE_BY_RCU rules.
1390 *
1391 * We steal the timer reference. If that fails timer has
1392 * already fired or someone else deleted it. Just drop ref
1393 * and move to next entry.
1394 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
1401 }
1402
1403 return drops;
1404}
1405
1406static noinline int early_drop(struct net *net, unsigned int hash)
1407{
1408 unsigned int i, bucket;
1409
1410 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
1411 struct hlist_nulls_head *ct_hash;
1412 unsigned int hsize, drops;
1413
1414 rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
1422 rcu_read_unlock();
1423
1424 if (drops) {
1425 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
1426 return true;
1427 }
1428 }
1429
1430 return false;
1431}
1432
1433static bool gc_worker_skip_ct(const struct nf_conn *ct)
1434{
1435 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
1436}
1437
1438static bool gc_worker_can_early_drop(const struct nf_conn *ct)
1439{
1440 const struct nf_conntrack_l4proto *l4proto;
1441 u8 protonum = nf_ct_protonum(ct);
1442
1443 if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP)
1444 return false;
1445 if (!test_bit(IPS_ASSURED_BIT, &ct->status))
1446 return true;
1447
1448 l4proto = nf_ct_l4proto_find(l4proto: protonum);
1449 if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
1450 return true;
1451
1452 return false;
1453}
1454
1455static void gc_worker(struct work_struct *work)
1456{
1457 unsigned int i, hashsz, nf_conntrack_max95 = 0;
1458 u32 end_time, start_time = nfct_time_stamp;
1459 struct conntrack_gc_work *gc_work;
1460 unsigned int expired_count = 0;
1461 unsigned long next_run;
1462 s32 delta_time;
1463 long count;
1464
1465 gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
1466
1467 i = gc_work->next_bucket;
1468 if (gc_work->early_drop)
1469 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
1470
1471 if (i == 0) {
1472 gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
1473 gc_work->count = GC_SCAN_INITIAL_COUNT;
1474 gc_work->start_time = start_time;
1475 }
1476
1477 next_run = gc_work->avg_timeout;
1478 count = gc_work->count;
1479
1480 end_time = start_time + GC_SCAN_MAX_DURATION;
1481
1482 do {
1483 struct nf_conntrack_tuple_hash *h;
1484 struct hlist_nulls_head *ct_hash;
1485 struct hlist_nulls_node *n;
1486 struct nf_conn *tmp;
1487
1488 rcu_read_lock();
1489
1490 nf_conntrack_get_ht(hash: &ct_hash, hsize: &hashsz);
1491 if (i >= hashsz) {
1492 rcu_read_unlock();
1493 break;
1494 }
1495
1496 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
1497 struct nf_conntrack_net *cnet;
1498 struct net *net;
1499 long expires;
1500
1501 tmp = nf_ct_tuplehash_to_ctrack(hash: h);
1502
1503 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
1504 nf_ct_offload_timeout(ct: tmp);
1505 if (!nf_conntrack_max95)
1506 continue;
1507 }
1508
1509 if (expired_count > GC_SCAN_EXPIRED_MAX) {
1510 rcu_read_unlock();
1511
1512 gc_work->next_bucket = i;
1513 gc_work->avg_timeout = next_run;
1514 gc_work->count = count;
1515
1516 delta_time = nfct_time_stamp - gc_work->start_time;
1517
1518 /* re-sched immediately if total cycle time is exceeded */
1519 next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
1520 goto early_exit;
1521 }
1522
1523 if (nf_ct_is_expired(ct: tmp)) {
1524 nf_ct_gc_expired(ct: tmp);
1525 expired_count++;
1526 continue;
1527 }
1528
1529 expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
1530 expires = (expires - (long)next_run) / ++count;
1531 next_run += expires;
1532
1533 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(ct: tmp))
1534 continue;
1535
1536 net = nf_ct_net(ct: tmp);
1537 cnet = nf_ct_pernet(net);
1538 if (atomic_read(v: &cnet->count) < nf_conntrack_max95)
1539 continue;
1540
1541 /* need to take reference to avoid possible races */
1542 if (!refcount_inc_not_zero(r: &tmp->ct_general.use))
1543 continue;
1544
1545 /* load ->status after refcount increase */
1546 smp_acquire__after_ctrl_dep();
1547
1548 if (gc_worker_skip_ct(ct: tmp)) {
1549 nf_ct_put(ct: tmp);
1550 continue;
1551 }
1552
1553 if (gc_worker_can_early_drop(ct: tmp)) {
1554 nf_ct_kill(ct: tmp);
1555 expired_count++;
1556 }
1557
1558 nf_ct_put(ct: tmp);
1559 }
1560
1561 /* could check get_nulls_value() here and restart if ct
1562 * was moved to another chain. But given gc is best-effort
1563 * we will just continue with next hash slot.
1564 */
1565 rcu_read_unlock();
1566 cond_resched();
1567 i++;
1568
1569 delta_time = nfct_time_stamp - end_time;
1570 if (delta_time > 0 && i < hashsz) {
1571 gc_work->avg_timeout = next_run;
1572 gc_work->count = count;
1573 gc_work->next_bucket = i;
1574 next_run = 0;
1575 goto early_exit;
1576 }
1577 } while (i < hashsz);
1578
1579 gc_work->next_bucket = 0;
1580
1581 next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
1582
1583 delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
1584 if (next_run > (unsigned long)delta_time)
1585 next_run -= delta_time;
1586 else
1587 next_run = 1;
1588
1589early_exit:
1590 if (gc_work->exiting)
1591 return;
1592
1593 if (next_run)
1594 gc_work->early_drop = false;
1595
1596 queue_delayed_work(wq: system_power_efficient_wq, dwork: &gc_work->dwork, delay: next_run);
1597}
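
/* Illustrative sketch (not part of the build): the scan interval computed
 * above is an incremental mean of the clamped per-entry expiry times.  With
 * running average A and count n, each entry with remaining time t updates
 * A += (t - A) / ++n; e.g. with A = 60s and n = 100, one entry with t = 1s
 * only pulls the next run down to roughly 59.4s instead of forcing a 1s
 * wakeup, which is what the GC_SCAN_INITIAL_COUNT bias is for.
 */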
1598
1599static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
1600{
1601 INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
1602 gc_work->exiting = false;
1603}
1604
1605static struct nf_conn *
1606__nf_conntrack_alloc(struct net *net,
1607 const struct nf_conntrack_zone *zone,
1608 const struct nf_conntrack_tuple *orig,
1609 const struct nf_conntrack_tuple *repl,
1610 gfp_t gfp, u32 hash)
1611{
1612 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
1613 unsigned int ct_count;
1614 struct nf_conn *ct;
1615
1616 /* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
1626 }
1627 }
1628
1629 /*
1630 * Do not use kmem_cache_zalloc(), as this cache uses
1631 * SLAB_TYPESAFE_BY_RCU.
1632 */
1633 ct = kmem_cache_alloc(cachep: nf_conntrack_cachep, flags: gfp);
1634 if (ct == NULL)
1635 goto out;
1636
1637 spin_lock_init(&ct->lock);
1638 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
1639 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
1640 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
1641 /* save hash for reusing when confirming */
1642 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
1643 ct->status = 0;
1644 WRITE_ONCE(ct->timeout, 0);
1645 write_pnet(pnet: &ct->ct_net, net);
1646 memset_after(ct, 0, __nfct_init_offset);
1647
1648 nf_ct_zone_add(ct, zone);
1649
1650 /* Because we use RCU lookups, we set ct_general.use to zero before
1651 * this is inserted in any list.
1652 */
	refcount_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
1658}
1659
1660struct nf_conn *nf_conntrack_alloc(struct net *net,
1661 const struct nf_conntrack_zone *zone,
1662 const struct nf_conntrack_tuple *orig,
1663 const struct nf_conntrack_tuple *repl,
1664 gfp_t gfp)
1665{
1666 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, hash: 0);
1667}
1668EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
1669
1670void nf_conntrack_free(struct nf_conn *ct)
1671{
1672 struct net *net = nf_ct_net(ct);
1673 struct nf_conntrack_net *cnet;
1674
1675 /* A freed object has refcnt == 0, that's
1676 * the golden rule for SLAB_TYPESAFE_BY_RCU
1677 */
1678 WARN_ON(refcount_read(&ct->ct_general.use) != 0);
1679
1680 if (ct->status & IPS_SRC_NAT_DONE) {
1681 const struct nf_nat_hook *nat_hook;
1682
1683 rcu_read_lock();
1684 nat_hook = rcu_dereference(nf_nat_hook);
1685 if (nat_hook)
1686 nat_hook->remove_nat_bysrc(ct);
1687 rcu_read_unlock();
1688 }
1689
	kfree(ct->ext);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
1696}
1697EXPORT_SYMBOL_GPL(nf_conntrack_free);
1698
1699
1700/* Allocate a new conntrack: we return -ENOMEM if classification
1701 failed due to stress. Otherwise it really is unclassifiable. */
1702static noinline struct nf_conntrack_tuple_hash *
1703init_conntrack(struct net *net, struct nf_conn *tmpl,
1704 const struct nf_conntrack_tuple *tuple,
1705 struct sk_buff *skb,
1706 unsigned int dataoff, u32 hash)
1707{
1708 struct nf_conn *ct;
1709 struct nf_conn_help *help;
1710 struct nf_conntrack_tuple repl_tuple;
1711#ifdef CONFIG_NF_CONNTRACK_EVENTS
1712 struct nf_conntrack_ecache *ecache;
1713#endif
1714 struct nf_conntrack_expect *exp = NULL;
1715 const struct nf_conntrack_zone *zone;
1716 struct nf_conn_timeout *timeout_ext;
1717 struct nf_conntrack_zone tmp;
1718 struct nf_conntrack_net *cnet;
1719
1720 if (!nf_ct_invert_tuple(&repl_tuple, tuple))
1721 return NULL;
1722
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
1732 }
1733
1734 timeout_ext = tmpl ? nf_ct_timeout_find(ct: tmpl) : NULL;
1735
1736 if (timeout_ext)
1737 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1738 GFP_ATOMIC);
1739
1740 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1741 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1742 nf_ct_labels_ext_add(ct);
1743
1744#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
1754#endif
1755
1756 cnet = nf_ct_pernet(net);
1757 if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
1760 if (exp) {
1761 /* Welcome, Mr. Bond. We've been expecting you... */
1762 __set_bit(IPS_EXPECTED_BIT, &ct->status);
1763 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1764 ct->master = exp->master;
1765 if (exp->helper) {
1766 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
1767 if (help)
1768 rcu_assign_pointer(help->helper, exp->helper);
1769 }
1770
1771#ifdef CONFIG_NF_CONNTRACK_MARK
1772 ct->mark = READ_ONCE(exp->master->mark);
1773#endif
1774#ifdef CONFIG_NF_CONNTRACK_SECMARK
1775 ct->secmark = exp->master->secmark;
1776#endif
1777 NF_CT_STAT_INC(net, expect_new);
1778 }
1779 spin_unlock_bh(lock: &nf_conntrack_expect_lock);
1780 }
1781 if (!exp && tmpl)
1782 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1783
	/* Another CPU might have obtained a pointer to this object before it was
	 * released.  Because refcount is 0, refcount_inc_not_zero() will fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else another
	 * core might observe the CONFIRMED bit, which would mean the entry is
	 * valid and in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);
1796
1797 if (exp) {
1798 if (exp->expectfn)
1799 exp->expectfn(ct, exp);
1800 nf_ct_expect_put(exp);
1801 }
1802
1803 return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1804}
1805
1806/* On success, returns 0, sets skb->_nfct | ctinfo */
1807static int
1808resolve_normal_ct(struct nf_conn *tmpl,
1809 struct sk_buff *skb,
1810 unsigned int dataoff,
1811 u_int8_t protonum,
1812 const struct nf_hook_state *state)
1813{
1814 const struct nf_conntrack_zone *zone;
1815 struct nf_conntrack_tuple tuple;
1816 struct nf_conntrack_tuple_hash *h;
1817 enum ip_conntrack_info ctinfo;
1818 struct nf_conntrack_zone tmp;
1819 u32 hash, zone_id, rid;
1820 struct nf_conn *ct;
1821
	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple))
1825 return 0;
1826
1827 /* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
1846 if (!h)
1847 return 0;
1848 if (IS_ERR(ptr: h))
1849 return PTR_ERR(ptr: h);
1850 }
1851 ct = nf_ct_tuplehash_to_ctrack(hash: h);
1852
1853 /* It exists; we have (non-exclusive) reference. */
1854 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1855 ctinfo = IP_CT_ESTABLISHED_REPLY;
1856 } else {
1857 unsigned long status = READ_ONCE(ct->status);
1858
1859 /* Once we've had two way comms, always ESTABLISHED. */
1860 if (likely(status & IPS_SEEN_REPLY))
1861 ctinfo = IP_CT_ESTABLISHED;
1862 else if (status & IPS_EXPECTED)
1863 ctinfo = IP_CT_RELATED;
1864 else
1865 ctinfo = IP_CT_NEW;
1866 }
1867	nf_ct_set(skb, ct, ctinfo);
1868 return 0;
1869}
1870
1871/*
1872 * icmp packets need special treatment to handle error messages that are
1873 * related to a connection.
1874 *
1875 * Callers need to check if skb has a conntrack assigned when this
1876 * helper returns; in such case skb belongs to an already known connection.
1877 */
1878static unsigned int __cold
1879nf_conntrack_handle_icmp(struct nf_conn *tmpl,
1880 struct sk_buff *skb,
1881 unsigned int dataoff,
1882 u8 protonum,
1883 const struct nf_hook_state *state)
1884{
1885 int ret;
1886
1887 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
1888 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
1889#if IS_ENABLED(CONFIG_IPV6)
1890 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
1891 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
1892#endif
1893 else
1894 return NF_ACCEPT;
1895
1896 if (ret <= 0)
1897 NF_CT_STAT_INC_ATOMIC(state->net, error);
1898
1899 return ret;
1900}
1901
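/* Fallback for layer-4 protocols without a dedicated tracker: there is no
 * state machine to run, so just refresh the timeout (from an attached timeout
 * policy if present, otherwise the per-netns generic default) and accept.
 */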
1902static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
1903 enum ip_conntrack_info ctinfo)
1904{
1905 const unsigned int *timeout = nf_ct_timeout_lookup(ct);
1906
1907 if (!timeout)
1908		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;
1909
1910	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
1911 return NF_ACCEPT;
1912}
1913
1914/* Returns verdict for packet, or -1 for invalid. */
1915static int nf_conntrack_handle_packet(struct nf_conn *ct,
1916 struct sk_buff *skb,
1917 unsigned int dataoff,
1918 enum ip_conntrack_info ctinfo,
1919 const struct nf_hook_state *state)
1920{
1921 switch (nf_ct_protonum(ct)) {
1922 case IPPROTO_TCP:
1923 return nf_conntrack_tcp_packet(ct, skb, dataoff,
1924 ctinfo, state);
1925 case IPPROTO_UDP:
1926 return nf_conntrack_udp_packet(ct, skb, dataoff,
1927 ctinfo, state);
1928 case IPPROTO_ICMP:
1929 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
1930#if IS_ENABLED(CONFIG_IPV6)
1931 case IPPROTO_ICMPV6:
1932 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
1933#endif
1934#ifdef CONFIG_NF_CT_PROTO_UDPLITE
1935 case IPPROTO_UDPLITE:
1936 return nf_conntrack_udplite_packet(ct, skb, dataoff,
1937 ctinfo, state);
1938#endif
1939#ifdef CONFIG_NF_CT_PROTO_SCTP
1940 case IPPROTO_SCTP:
1941 return nf_conntrack_sctp_packet(ct, skb, dataoff,
1942 ctinfo, state);
1943#endif
1944#ifdef CONFIG_NF_CT_PROTO_DCCP
1945 case IPPROTO_DCCP:
1946 return nf_conntrack_dccp_packet(ct, skb, dataoff,
1947 ctinfo, state);
1948#endif
1949#ifdef CONFIG_NF_CT_PROTO_GRE
1950 case IPPROTO_GRE:
1951 return nf_conntrack_gre_packet(ct, skb, dataoff,
1952 ctinfo, state);
1953#endif
1954 }
1955
1956 return generic_packet(ct, skb, ctinfo);
1957}
1958
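/* Main conntrack entry point, invoked from the IPv4/IPv6 PRE_ROUTING and
 * LOCAL_OUT hook wrappers: resolve (or create) the conntrack entry for the
 * packet, run the per-protocol state machine and return an NF_* verdict.
 */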
1959unsigned int
1960nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
1961{
1962 enum ip_conntrack_info ctinfo;
1963 struct nf_conn *ct, *tmpl;
1964 u_int8_t protonum;
1965 int dataoff, ret;
1966
1967	tmpl = nf_ct_get(skb, &ctinfo);
1968 if (tmpl || ctinfo == IP_CT_UNTRACKED) {
1969 /* Previously seen (loopback or untracked)? Ignore. */
1970		if ((tmpl && !nf_ct_is_template(tmpl)) ||
1971 ctinfo == IP_CT_UNTRACKED)
1972 return NF_ACCEPT;
1973 skb->_nfct = 0;
1974 }
1975
1976 /* rcu_read_lock()ed by nf_hook_thresh */
1977	dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
1978 if (dataoff <= 0) {
1979 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
1980 ret = NF_ACCEPT;
1981 goto out;
1982 }
1983
1984 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
1985 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
1986 protonum, state);
1987 if (ret <= 0) {
1988 ret = -ret;
1989 goto out;
1990 }
1991 /* ICMP[v6] protocol trackers may assign one conntrack. */
1992 if (skb->_nfct)
1993 goto out;
1994 }
1995repeat:
1996 ret = resolve_normal_ct(tmpl, skb, dataoff,
1997 protonum, state);
1998 if (ret < 0) {
1999 /* Too stressed to deal. */
2000 NF_CT_STAT_INC_ATOMIC(state->net, drop);
2001 ret = NF_DROP;
2002 goto out;
2003 }
2004
2005	ct = nf_ct_get(skb, &ctinfo);
2006 if (!ct) {
2007 /* Not valid part of a connection */
2008 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
2009 ret = NF_ACCEPT;
2010 goto out;
2011 }
2012
2013 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
2014 if (ret <= 0) {
2015 /* Invalid: inverse of the return code tells
2016 * the netfilter core what to do */
2017 nf_ct_put(ct);
2018 skb->_nfct = 0;
2019 /* Special case: TCP tracker reports an attempt to reopen a
2020 * closed/aborted connection. We have to go back and create a
2021 * fresh conntrack.
2022 */
2023 if (ret == -NF_REPEAT)
2024 goto repeat;
2025
2026 NF_CT_STAT_INC_ATOMIC(state->net, invalid);
2027 if (ret == -NF_DROP)
2028 NF_CT_STAT_INC_ATOMIC(state->net, drop);
2029
2030 ret = -ret;
2031 goto out;
2032 }
2033
2034 if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
2035	    !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
2036		nf_conntrack_event_cache(IPCT_REPLY, ct);
2037out:
2038 if (tmpl)
2039		nf_ct_put(tmpl);
2040
2041 return ret;
2042}
2043EXPORT_SYMBOL_GPL(nf_conntrack_in);
2044
2045/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
2046void __nf_ct_refresh_acct(struct nf_conn *ct,
2047 enum ip_conntrack_info ctinfo,
2048 const struct sk_buff *skb,
2049 u32 extra_jiffies,
2050 bool do_acct)
2051{
2052 /* Only update if this is not a fixed timeout */
2053 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
2054 goto acct;
2055
2056 /* If not in hash table, timer will not be active yet */
2057 if (nf_ct_is_confirmed(ct))
2058 extra_jiffies += nfct_time_stamp;
2059
2060 if (READ_ONCE(ct->timeout) != extra_jiffies)
2061 WRITE_ONCE(ct->timeout, extra_jiffies);
2062acct:
2063 if (do_acct)
2064		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
2065}
2066EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
2067
2068bool nf_ct_kill_acct(struct nf_conn *ct,
2069 enum ip_conntrack_info ctinfo,
2070 const struct sk_buff *skb)
2071{
2072	nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
2073
2074 return nf_ct_delete(ct, 0, 0);
2075}
2076EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
2077
2078#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
2079
2080#include <linux/netfilter/nfnetlink.h>
2081#include <linux/netfilter/nfnetlink_conntrack.h>
2082#include <linux/mutex.h>
2083
2084/* Generic function for tcp/udp/sctp/dccp and alike. */
2085int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
2086 const struct nf_conntrack_tuple *tuple)
2087{
2088	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
2089	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
2090 goto nla_put_failure;
2091 return 0;
2092
2093nla_put_failure:
2094 return -1;
2095}
2096EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
2097
2098const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
2099 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 },
2100 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 },
2101};
2102EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
2103
2104int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
2105 struct nf_conntrack_tuple *t,
2106 u_int32_t flags)
2107{
2108 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
2109 if (!tb[CTA_PROTO_SRC_PORT])
2110 return -EINVAL;
2111
2112		t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
2113 }
2114
2115 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
2116 if (!tb[CTA_PROTO_DST_PORT])
2117 return -EINVAL;
2118
2119		t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
2120 }
2121
2122 return 0;
2123}
2124EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
2125
2126unsigned int nf_ct_port_nlattr_tuple_size(void)
2127{
2128 static unsigned int size __read_mostly;
2129
2130 if (!size)
2131 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
2132
2133 return size;
2134}
2135EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
2136#endif
2137
2138/* Used by ipt_REJECT and ip6t_REJECT. */
2139static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
2140{
2141 struct nf_conn *ct;
2142 enum ip_conntrack_info ctinfo;
2143
2144 /* This ICMP is in reverse direction to the packet which caused it */
2145	ct = nf_ct_get(skb, &ctinfo);
2146 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
2147 ctinfo = IP_CT_RELATED_REPLY;
2148 else
2149 ctinfo = IP_CT_RELATED;
2150
2151 /* Attach to new skbuff, and increment count */
2152	nf_ct_set(nskb, ct, ctinfo);
2153	nf_conntrack_get(skb_nfct(nskb));
2154}
2155
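/* A packet handed to userspace via nf_queue can be reinjected after another
 * packet has already created and confirmed a conntrack for the same flow.
 * Reconstruct the pre-NAT original tuple for this skb, look up the entry that
 * now owns the flow, attach it and replay the NAT manipulations on the skb.
 */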
2156static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
2157 struct nf_conn *ct,
2158 enum ip_conntrack_info ctinfo)
2159{
2160 const struct nf_nat_hook *nat_hook;
2161 struct nf_conntrack_tuple_hash *h;
2162 struct nf_conntrack_tuple tuple;
2163 unsigned int status;
2164 int dataoff;
2165 u16 l3num;
2166 u8 l4num;
2167
2168 l3num = nf_ct_l3num(ct);
2169
2170	dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
2171 if (dataoff <= 0)
2172 return NF_DROP;
2173
2174	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
2175			     l4num, net, &tuple))
2176 return NF_DROP;
2177
2178 if (ct->status & IPS_SRC_NAT) {
2179 memcpy(tuple.src.u3.all,
2180 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
2181 sizeof(tuple.src.u3.all));
2182 tuple.src.u.all =
2183 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
2184 }
2185
2186 if (ct->status & IPS_DST_NAT) {
2187 memcpy(tuple.dst.u3.all,
2188 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
2189 sizeof(tuple.dst.u3.all));
2190 tuple.dst.u.all =
2191 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
2192 }
2193
2194 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
2195 if (!h)
2196 return NF_ACCEPT;
2197
2198	/* Store the NAT status bits of the conntrack this skb was mangled for,
2199	 * so the same manipulations can be redone below on the clashing entry.
2200	 */
2201 status = ct->status;
2202
2203 nf_ct_put(ct);
2204	ct = nf_ct_tuplehash_to_ctrack(h);
2205	nf_ct_set(skb, ct, ctinfo);
2206
2207 nat_hook = rcu_dereference(nf_nat_hook);
2208 if (!nat_hook)
2209 return NF_ACCEPT;
2210
2211 if (status & IPS_SRC_NAT) {
2212 unsigned int verdict = nat_hook->manip_pkt(skb, ct,
2213 NF_NAT_MANIP_SRC,
2214 IP_CT_DIR_ORIGINAL);
2215 if (verdict != NF_ACCEPT)
2216 return verdict;
2217 }
2218
2219 if (status & IPS_DST_NAT) {
2220 unsigned int verdict = nat_hook->manip_pkt(skb, ct,
2221 NF_NAT_MANIP_DST,
2222 IP_CT_DIR_ORIGINAL);
2223 if (verdict != NF_ACCEPT)
2224 return verdict;
2225 }
2226
2227 return NF_ACCEPT;
2228}
2229
2230/* This packet is coming from userspace via nf_queue, complete the packet
2231 * processing after the helper invocation in nf_confirm().
2232 */
2233static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
2234 enum ip_conntrack_info ctinfo)
2235{
2236 const struct nf_conntrack_helper *helper;
2237 const struct nf_conn_help *help;
2238 int protoff;
2239
2240 help = nfct_help(ct);
2241 if (!help)
2242 return NF_ACCEPT;
2243
2244 helper = rcu_dereference(help->helper);
2245 if (!helper)
2246 return NF_ACCEPT;
2247
2248 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
2249 return NF_ACCEPT;
2250
2251 switch (nf_ct_l3num(ct)) {
2252 case NFPROTO_IPV4:
2253 protoff = skb_network_offset(skb) + ip_hdrlen(skb);
2254 break;
2255#if IS_ENABLED(CONFIG_IPV6)
2256 case NFPROTO_IPV6: {
2257 __be16 frag_off;
2258 u8 pnum;
2259
2260 pnum = ipv6_hdr(skb)->nexthdr;
2261		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
2262					   &frag_off);
2263 if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
2264 return NF_ACCEPT;
2265 break;
2266 }
2267#endif
2268 default:
2269 return NF_ACCEPT;
2270 }
2271
2272 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
2273 !nf_is_loopback_packet(skb)) {
2274 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
2275 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
2276 return NF_DROP;
2277 }
2278 }
2279
2280 /* We've seen it coming out the other side: confirm it */
2281 return nf_conntrack_confirm(skb);
2282}
2283
2284static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
2285{
2286 enum ip_conntrack_info ctinfo;
2287 struct nf_conn *ct;
2288
2289	ct = nf_ct_get(skb, &ctinfo);
2290 if (!ct)
2291 return NF_ACCEPT;
2292
2293 if (!nf_ct_is_confirmed(ct)) {
2294 int ret = __nf_conntrack_update(net, skb, ct, ctinfo);
2295
2296 if (ret != NF_ACCEPT)
2297 return ret;
2298
2299		ct = nf_ct_get(skb, &ctinfo);
2300 if (!ct)
2301 return NF_ACCEPT;
2302 }
2303
2304 return nf_confirm_cthelper(skb, ct, ctinfo);
2305}
2306
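/* Fill @dst_tuple for @skb: use the attached conntrack if there is one,
 * otherwise extract an IPv4 tuple from the packet, look it up in the default
 * zone and return the tuple of the direction opposite to the one that matched.
 */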
2307static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
2308 const struct sk_buff *skb)
2309{
2310 const struct nf_conntrack_tuple *src_tuple;
2311 const struct nf_conntrack_tuple_hash *hash;
2312 struct nf_conntrack_tuple srctuple;
2313 enum ip_conntrack_info ctinfo;
2314 struct nf_conn *ct;
2315
2316	ct = nf_ct_get(skb, &ctinfo);
2317 if (ct) {
2318 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
2319 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2320 return true;
2321 }
2322
2323 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
2324			       NFPROTO_IPV4, dev_net(skb->dev),
2325 &srctuple))
2326 return false;
2327
2328	hash = nf_conntrack_find_get(dev_net(skb->dev),
2329 &nf_ct_zone_dflt,
2330 &srctuple);
2331 if (!hash)
2332 return false;
2333
2334 ct = nf_ct_tuplehash_to_ctrack(hash);
2335 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
2336 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
2337 nf_ct_put(ct);
2338
2339 return true;
2340}
2341
2342/* Bring out ya dead! */
2343static struct nf_conn *
2344get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
2345 const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
2346{
2347 struct nf_conntrack_tuple_hash *h;
2348 struct nf_conn *ct;
2349 struct hlist_nulls_node *n;
2350 spinlock_t *lockp;
2351
2352 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2353 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
2354
2355		if (hlist_nulls_empty(hslot))
2356 continue;
2357
2358 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2359 local_bh_disable();
2360 nf_conntrack_lock(lockp);
2361 hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
2362 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2363 continue;
2364			/* All nf_conn objects are added to the hash table twice: once
2365			 * for the original direction tuple, once for the reply tuple.
2366 *
2367 * Exception: In the IPS_NAT_CLASH case, only the reply
2368 * tuple is added (the original tuple already existed for
2369 * a different object).
2370 *
2371 * We only need to call the iterator once for each
2372 * conntrack, so we just use the 'reply' direction
2373 * tuple while iterating.
2374 */
2375			ct = nf_ct_tuplehash_to_ctrack(h);
2376
2377 if (iter_data->net &&
2378			    !net_eq(iter_data->net, nf_ct_net(ct)))
2379 continue;
2380
2381 if (iter(ct, iter_data->data))
2382 goto found;
2383 }
2384		spin_unlock(lockp);
2385 local_bh_enable();
2386 cond_resched();
2387 }
2388
2389 return NULL;
2390found:
2391	refcount_inc(&ct->ct_general.use);
2392	spin_unlock(lockp);
2393 local_bh_enable();
2394 return ct;
2395}
2396
2397static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2398 const struct nf_ct_iter_data *iter_data)
2399{
2400 unsigned int bucket = 0;
2401 struct nf_conn *ct;
2402
2403 might_sleep();
2404
2405 mutex_lock(&nf_conntrack_mutex);
2406	while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
2407		/* Time to push up daisies... */
2408
2409 nf_ct_delete(ct, iter_data->portid, iter_data->report);
2410 nf_ct_put(ct);
2411 cond_resched();
2412 }
2413	mutex_unlock(&nf_conntrack_mutex);
2414}
2415
2416void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
2417 const struct nf_ct_iter_data *iter_data)
2418{
2419 struct net *net = iter_data->net;
2420 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2421
2422 might_sleep();
2423
2424	if (atomic_read(&cnet->count) == 0)
2425 return;
2426
2427 nf_ct_iterate_cleanup(iter, iter_data);
2428}
2429EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
2430
2431/**
2432 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
2433 * @iter: callback to invoke for each conntrack
2434 * @data: data to pass to @iter
2435 *
2436 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
2437 * unconfirmed list as dying (so they will not be inserted into
2438 * main table).
2439 *
2440 * Can only be called in module exit path.
2441 */
2442void
2443nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
2444{
2445 struct nf_ct_iter_data iter_data = {};
2446 struct net *net;
2447
2448	down_read(&net_rwsem);
2449 for_each_net(net) {
2450 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2451
2452		if (atomic_read(&cnet->count) == 0)
2453 continue;
2454 nf_queue_nf_hook_drop(net);
2455 }
2456	up_read(&net_rwsem);
2457
2458	/* Need to wait for the netns cleanup worker to finish, if it is
2459	 * running -- it might have deleted a net namespace from
2460	 * the global list, so the hook drop above might not have
2461	 * affected all namespaces.
2462	 */
2463 net_ns_barrier();
2464
2465	/* An skb with an unconfirmed conntrack could have been reinjected just
2466	 * before we called nf_queue_nf_hook_drop().
2467	 *
2468	 * This makes sure it is inserted into the conntrack table.
2469	 */
2470 synchronize_net();
2471
2472 nf_ct_ext_bump_genid();
2473 iter_data.data = data;
2474	nf_ct_iterate_cleanup(iter, &iter_data);
2475
2476	/* Another CPU might be in an RCU read-side section with an
2477	 * RCU-protected pointer that was cleared in the iter callback
2478	 * or hidden via nf_ct_ext_bump_genid() above.
2479	 *
2480	 * Wait until those are done.
2481	 */
2482 synchronize_rcu();
2483}
2484EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
2485
2486static int kill_all(struct nf_conn *i, void *data)
2487{
2488 return 1;
2489}
2490
2491void nf_conntrack_cleanup_start(void)
2492{
2493 cleanup_nf_conntrack_bpf();
2494 conntrack_gc_work.exiting = true;
2495}
2496
2497void nf_conntrack_cleanup_end(void)
2498{
2499 RCU_INIT_POINTER(nf_ct_hook, NULL);
2500	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2501	kvfree(nf_conntrack_hash);
2502
2503 nf_conntrack_proto_fini();
2504 nf_conntrack_helper_fini();
2505 nf_conntrack_expect_fini();
2506
2507	kmem_cache_destroy(nf_conntrack_cachep);
2508}
2509
2510/*
2511 * Mishearing the voices in his head, our hero wonders how he's
2512 * supposed to kill the mall.
2513 */
2514void nf_conntrack_cleanup_net(struct net *net)
2515{
2516 LIST_HEAD(single);
2517
2518	list_add(&net->exit_list, &single);
2519	nf_conntrack_cleanup_net_list(&single);
2520}
2521
2522void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
2523{
2524 struct nf_ct_iter_data iter_data = {};
2525 struct net *net;
2526 int busy;
2527
2528 /*
2529 * This makes sure all current packets have passed through
2530 * netfilter framework. Roll on, two-stage module
2531 * delete...
2532 */
2533 synchronize_rcu_expedited();
2534i_see_dead_people:
2535 busy = 0;
2536 list_for_each_entry(net, net_exit_list, exit_list) {
2537 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2538
2539 iter_data.net = net;
2540 nf_ct_iterate_cleanup_net(kill_all, &iter_data);
2541		if (atomic_read(&cnet->count) != 0)
2542 busy = 1;
2543 }
2544 if (busy) {
2545 schedule();
2546 goto i_see_dead_people;
2547 }
2548
2549 list_for_each_entry(net, net_exit_list, exit_list) {
2550 nf_conntrack_ecache_pernet_fini(net);
2551 nf_conntrack_expect_pernet_fini(net);
2552		free_percpu(net->ct.stat);
2553 }
2554}
2555
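/* Allocate a hash table with at least *sizep buckets, rounded up so the table
 * fills whole pages; when @nulls is set, each bucket is initialised as an
 * hlist_nulls list whose nulls marker encodes the bucket index.
 */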
2556void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
2557{
2558 struct hlist_nulls_head *hash;
2559 unsigned int nr_slots, i;
2560
2561 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
2562 return NULL;
2563
2564 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
2565 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
2566
2567	hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL);
2568
2569 if (hash && nulls)
2570 for (i = 0; i < nr_slots; i++)
2571 INIT_HLIST_NULLS_HEAD(&hash[i], i);
2572
2573 return hash;
2574}
2575EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
2576
2577int nf_conntrack_hash_resize(unsigned int hashsize)
2578{
2579 int i, bucket;
2580 unsigned int old_size;
2581 struct hlist_nulls_head *hash, *old_hash;
2582 struct nf_conntrack_tuple_hash *h;
2583 struct nf_conn *ct;
2584
2585 if (!hashsize)
2586 return -EINVAL;
2587
2588 hash = nf_ct_alloc_hashtable(&hashsize, 1);
2589 if (!hash)
2590 return -ENOMEM;
2591
2592 mutex_lock(&nf_conntrack_mutex);
2593 old_size = nf_conntrack_htable_size;
2594 if (old_size == hashsize) {
2595		mutex_unlock(&nf_conntrack_mutex);
2596		kvfree(hash);
2597 return 0;
2598 }
2599
2600 local_bh_disable();
2601 nf_conntrack_all_lock();
2602 write_seqcount_begin(&nf_conntrack_generation);
2603
2604	/* Lookups in the old hash might happen in parallel, which means we
2605	 * might get false negatives during connection lookup. New connections
2606	 * created because of a false negative won't make it into the hash
2607	 * though, since doing so requires taking the locks we hold here.
2608	 */
2609
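	/* Walk every old bucket and move each entry to its slot in the new
	 * table; entries are unhooked from the chain head and re-added at the
	 * head of the new chain, so per-chain order is reversed (harmless).
	 */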
2610 for (i = 0; i < nf_conntrack_htable_size; i++) {
2611		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
2612 unsigned int zone_id;
2613
2614 h = hlist_nulls_entry(nf_conntrack_hash[i].first,
2615 struct nf_conntrack_tuple_hash, hnnode);
2616			ct = nf_ct_tuplehash_to_ctrack(h);
2617			hlist_nulls_del_rcu(&h->hnnode);
2618
2619			zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h));
2620			bucket = __hash_conntrack(nf_ct_net(ct),
2621						  &h->tuple, zone_id, hashsize);
2622			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
2623 }
2624 }
2625 old_hash = nf_conntrack_hash;
2626
2627 nf_conntrack_hash = hash;
2628 nf_conntrack_htable_size = hashsize;
2629
2630 write_seqcount_end(&nf_conntrack_generation);
2631 nf_conntrack_all_unlock();
2632 local_bh_enable();
2633
2634	mutex_unlock(&nf_conntrack_mutex);
2635
2636 synchronize_net();
2637	kvfree(old_hash);
2638 return 0;
2639}
2640
2641int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
2642{
2643 unsigned int hashsize;
2644 int rc;
2645
2646 if (current->nsproxy->net_ns != &init_net)
2647 return -EOPNOTSUPP;
2648
2649 /* On boot, we can set this without any fancy locking. */
2650 if (!nf_conntrack_hash)
2651 return param_set_uint(val, kp);
2652
2653	rc = kstrtouint(val, 0, &hashsize);
2654 if (rc)
2655 return rc;
2656
2657 return nf_conntrack_hash_resize(hashsize);
2658}
2659
2660int nf_conntrack_init_start(void)
2661{
2662 unsigned long nr_pages = totalram_pages();
2663 int max_factor = 8;
2664 int ret = -ENOMEM;
2665 int i;
2666
2667 seqcount_spinlock_init(&nf_conntrack_generation,
2668 &nf_conntrack_locks_all_lock);
2669
2670 for (i = 0; i < CONNTRACK_LOCKS; i++)
2671 spin_lock_init(&nf_conntrack_locks[i]);
2672
2673 if (!nf_conntrack_htable_size) {
2674 nf_conntrack_htable_size
2675 = (((nr_pages << PAGE_SHIFT) / 16384)
2676 / sizeof(struct hlist_head));
2677 if (BITS_PER_LONG >= 64 &&
2678 nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
2679 nf_conntrack_htable_size = 262144;
2680 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
2681 nf_conntrack_htable_size = 65536;
2682
2683 if (nf_conntrack_htable_size < 1024)
2684 nf_conntrack_htable_size = 1024;
2685 /* Use a max. factor of one by default to keep the average
2686 * hash chain length at 2 entries. Each entry has to be added
2687 * twice (once for original direction, once for reply).
2688 * When a table size is given we use the old value of 8 to
2689 * avoid implicit reduction of the max entries setting.
2690 */
2691 max_factor = 1;
2692 }
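	/* Example: a 64-bit machine with more than 4 GiB of RAM ends up with
	 * 262144 buckets and, with max_factor == 1, a default nf_conntrack_max
	 * of 262144 entries.
	 */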
2693
2694 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
2695 if (!nf_conntrack_hash)
2696 return -ENOMEM;
2697
2698 nf_conntrack_max = max_factor * nf_conntrack_htable_size;
2699
2700	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
2701						sizeof(struct nf_conn),
2702						NFCT_INFOMASK + 1,
2703						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
2704 if (!nf_conntrack_cachep)
2705 goto err_cachep;
2706
2707 ret = nf_conntrack_expect_init();
2708 if (ret < 0)
2709 goto err_expect;
2710
2711 ret = nf_conntrack_helper_init();
2712 if (ret < 0)
2713 goto err_helper;
2714
2715 ret = nf_conntrack_proto_init();
2716 if (ret < 0)
2717 goto err_proto;
2718
2719	conntrack_gc_work_init(&conntrack_gc_work);
2720	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);
2721
2722 ret = register_nf_conntrack_bpf();
2723 if (ret < 0)
2724 goto err_kfunc;
2725
2726 return 0;
2727
2728err_kfunc:
2729	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2730 nf_conntrack_proto_fini();
2731err_proto:
2732 nf_conntrack_helper_fini();
2733err_helper:
2734 nf_conntrack_expect_fini();
2735err_expect:
2736	kmem_cache_destroy(nf_conntrack_cachep);
2737err_cachep:
2738	kvfree(nf_conntrack_hash);
2739 return ret;
2740}
2741
2742static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
2743{
2744 struct nf_conn *ct = nf_ct_to_nf_conn(nfct);
2745
2746 switch (nf_ct_protonum(ct)) {
2747 case IPPROTO_TCP:
2748 nf_conntrack_tcp_set_closing(ct);
2749 break;
2750 }
2751}
2752
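/* Callbacks published to the netfilter core through the nf_ct_hook pointer
 * (registered in nf_conntrack_init_end() below), so code that cannot link
 * directly against conntrack, e.g. the REJECT targets and nf_queue
 * reinjection, can reach it once the module is loaded.
 */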
2753static const struct nf_ct_hook nf_conntrack_hook = {
2754 .update = nf_conntrack_update,
2755 .destroy = nf_ct_destroy,
2756 .get_tuple_skb = nf_conntrack_get_tuple_skb,
2757 .attach = nf_conntrack_attach,
2758 .set_closing = nf_conntrack_set_closing,
2759 .confirm = __nf_conntrack_confirm,
2760};
2761
2762void nf_conntrack_init_end(void)
2763{
2764 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
2765}
2766
2767/*
2768 * We need to use special "null" values, not used in hash table
2769 */
2770#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
2771
2772int nf_conntrack_init_net(struct net *net)
2773{
2774 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2775 int ret = -ENOMEM;
2776
2777 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
2778 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
2779	atomic_set(&cnet->count, 0);
2780
2781 net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
2782 if (!net->ct.stat)
2783 return ret;
2784
2785 ret = nf_conntrack_expect_pernet_init(net);
2786 if (ret < 0)
2787 goto err_expect;
2788
2789 nf_conntrack_acct_pernet_init(net);
2790 nf_conntrack_tstamp_pernet_init(net);
2791 nf_conntrack_ecache_pernet_init(net);
2792 nf_conntrack_proto_pernet_init(net);
2793
2794 return 0;
2795
2796err_expect:
2797	free_percpu(net->ct.stat);
2798 return ret;
2799}
2800
2801/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
2802
2803int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
2804{
2805 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
2806 return -EPERM;
2807
2808 __nf_ct_set_timeout(ct, timeout);
2809
2810 if (test_bit(IPS_DYING_BIT, &ct->status))
2811 return -ETIME;
2812
2813 return 0;
2814}
2815EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
2816
2817void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
2818{
2819 unsigned int bit;
2820
2821	/* Ignore these unchangeable bits */
2822 on &= ~IPS_UNCHANGEABLE_MASK;
2823 off &= ~IPS_UNCHANGEABLE_MASK;
2824
2825 for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
2826 if (on & (1 << bit))
2827			set_bit(bit, &ct->status);
2828		else if (off & (1 << bit))
2829			clear_bit(bit, &ct->status);
2830 }
2831}
2832EXPORT_SYMBOL_GPL(__nf_ct_change_status);
2833
2834int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
2835{
2836 unsigned long d;
2837
2838 d = ct->status ^ status;
2839
2840 if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
2841 /* unchangeable */
2842 return -EBUSY;
2843
2844	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
2845		/* SEEN_REPLY bit can only be set, never cleared */
2846		return -EBUSY;
2847
2848	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
2849		/* ASSURED bit can only be set, never cleared */
2850		return -EBUSY;
2851
2852 __nf_ct_change_status(ct, status, 0);
2853 return 0;
2854}
2855EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
2856

Source: linux/net/netfilter/nf_conntrack_core.c