1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60
61#include <net/net_namespace.h>
62#include <net/icmp.h>
63#include <net/inet_hashtables.h>
64#include <net/tcp.h>
65#include <net/transp_v6.h>
66#include <net/ipv6.h>
67#include <net/inet_common.h>
68#include <net/timewait_sock.h>
69#include <net/xfrm.h>
70#include <net/secure_seq.h>
71#include <net/busy_poll.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/inetdevice.h>
79#include <linux/btf_ids.h>
80
81#include <crypto/hash.h>
82#include <linux/scatterlist.h>
83
84#include <trace/events/tcp.h>
85
86#ifdef CONFIG_TCP_MD5SIG
87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89#endif
90
91struct inet_hashinfo tcp_hashinfo;
92EXPORT_SYMBOL(tcp_hashinfo);
93
94static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
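/* Note: initial sequence numbers and timestamp offsets are derived from the
 * connection 4-tuple (plus a boot-time secret) via secure_tcp_seq() and
 * secure_tcp_ts_off(), so off-path attackers cannot easily predict them.
 */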
96static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97{
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
102}
103
104static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105{
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107}
108
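/* Called from the connect() path when the chosen 4-tuple is still held by a
 * TIME-WAIT socket: decide whether that TIME-WAIT socket can safely be
 * reused for the new connection. Returning 1 lets the caller take over the
 * port pair.
 */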
109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110{
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 if (reuse == 2) {
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
120 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124#if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 loopback = true;
131 } else
132#endif
133 {
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
136 loopback = true;
137 }
138 if (!loopback)
139 reuse = 0;
140 }
141
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
145
146 Actually, the idea is close to VJ's: only the timestamp cache is
147 held not per host but per port pair, and the TW bucket is used as the
148 state holder.
149
150 If the TW bucket has already been destroyed we fall back to VJ's scheme
151 and use the initial timestamp retrieved from the peer table.
152 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
159 * process.
160 *
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
166 */
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170 if (!seq)
171 seq = 1;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 }
176 sock_hold(sktw);
177 return 1;
178 }
179
180 return 0;
181}
182EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 int addr_len)
186{
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent the BPF program called below from accessing bytes that are
189 * outside the bounds specified by the user in addr_len.
190 */
191 if (addr_len < sizeof(struct sockaddr_in))
192 return -EINVAL;
193
194 sock_owned_by_me(sk);
195
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197}
198
199/* This will initiate an outgoing connection. */
200int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201{
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
207 struct flowi4 *fl4;
208 struct rtable *rt;
209 int err;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212
213 if (addr_len < sizeof(struct sockaddr_in))
214 return -EINVAL;
215
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
218
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
223 if (!daddr)
224 return -EINVAL;
225 nexthop = inet_opt->opt.faddr;
226 }
227
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233 orig_dport, sk);
234 if (IS_ERR(rt)) {
235 err = PTR_ERR(rt);
236 if (err == -ENETUNREACH)
237 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 return err;
239 }
240
241 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 ip_rt_put(rt);
243 return -ENETUNREACH;
244 }
245
246 if (!inet_opt || !inet_opt->opt.srr)
247 daddr = fl4->daddr;
248
249 if (!inet->inet_saddr)
250 inet->inet_saddr = fl4->saddr;
251 sk_rcv_saddr_set(sk, inet->inet_saddr);
252
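	/* If this socket was previously connected to a different destination,
	 * drop the timestamp and sequence state inherited from that connection
	 * so the new one starts from a clean slate (unless TCP repair is active).
	 */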
253 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 /* Reset inherited state */
255 tp->rx_opt.ts_recent = 0;
256 tp->rx_opt.ts_recent_stamp = 0;
257 if (likely(!tp->repair))
258 WRITE_ONCE(tp->write_seq, 0);
259 }
260
261 inet->inet_dport = usin->sin_port;
262 sk_daddr_set(sk, daddr);
263
264 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 if (inet_opt)
266 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267
268 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269
270 /* Socket identity is still unknown (sport may be zero).
271 * However, we set the state to SYN-SENT and, without releasing the
272 * socket lock, select a source port, enter ourselves into the hash
273 * tables and complete initialization after this.
274 */
275 tcp_set_state(sk, TCP_SYN_SENT);
276 err = inet_hash_connect(tcp_death_row, sk);
277 if (err)
278 goto failure;
279
280 sk_set_txhash(sk);
281
282 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 inet->inet_sport, inet->inet_dport, sk);
284 if (IS_ERR(rt)) {
285 err = PTR_ERR(rt);
286 rt = NULL;
287 goto failure;
288 }
289 /* OK, now commit destination to socket. */
290 sk->sk_gso_type = SKB_GSO_TCPV4;
291 sk_setup_caps(sk, &rt->dst);
292 rt = NULL;
293
294 if (likely(!tp->repair)) {
295 if (!tp->write_seq)
296 WRITE_ONCE(tp->write_seq,
297 secure_tcp_seq(inet->inet_saddr,
298 inet->inet_daddr,
299 inet->inet_sport,
300 usin->sin_port));
301 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 inet->inet_saddr,
303 inet->inet_daddr);
304 }
305
306 inet->inet_id = prandom_u32();
307
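	/* With the TCP_FASTOPEN_CONNECT socket option, the SYN (possibly
	 * carrying data) is deferred until the first write;
	 * tcp_fastopen_defer_connect() returns true in that case so
	 * connect() returns early.
	 */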
308 if (tcp_fastopen_defer_connect(sk, &err))
309 return err;
310 if (err)
311 goto failure;
312
313 err = tcp_connect(sk);
314
315 if (err)
316 goto failure;
317
318 return 0;
319
320failure:
321 /*
322 * This unhashes the socket and releases the local port,
323 * if necessary.
324 */
325 tcp_set_state(sk, TCP_CLOSE);
326 ip_rt_put(rt);
327 sk->sk_route_caps = 0;
328 inet->inet_dport = 0;
329 return err;
330}
331EXPORT_SYMBOL(tcp_v4_connect);
332
333/*
334 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
335 * It can be called through tcp_release_cb() if socket was owned by user
336 * at the time tcp_v4_err() was called to handle ICMP message.
337 */
338void tcp_v4_mtu_reduced(struct sock *sk)
339{
340 struct inet_sock *inet = inet_sk(sk);
341 struct dst_entry *dst;
342 u32 mtu;
343
344 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 return;
346 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347 dst = inet_csk_update_pmtu(sk, mtu);
348 if (!dst)
349 return;
350
351 /* Something is about to go wrong... Remember the soft error
352 * in case this connection is not able to recover.
353 */
354 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355 sk->sk_err_soft = EMSGSIZE;
356
357 mtu = dst_mtu(dst);
358
359 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360 ip_sk_accept_pmtu(sk) &&
361 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362 tcp_sync_mss(sk, mtu);
363
364 /* Resend the TCP packet because it's
365 * clear that the old packet has been
366 * dropped. This is the new "fast" path mtu
367 * discovery.
368 */
369 tcp_simple_retransmit(sk);
370 } /* else let the usual retransmit timer handle it */
371}
372EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373
374static void do_redirect(struct sk_buff *skb, struct sock *sk)
375{
376 struct dst_entry *dst = __sk_dst_check(sk, 0);
377
378 if (dst)
379 dst->ops->redirect(dst, sk, skb);
380}
381
382
383/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385{
386 struct request_sock *req = inet_reqsk(sk);
387 struct net *net = sock_net(sk);
388
389 /* ICMPs are not backlogged, hence we cannot get
390 * an established socket here.
391 */
392 if (seq != tcp_rsk(req)->snt_isn) {
393 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 } else if (abort) {
395 /*
396 * Still in SYN_RECV, just remove it silently.
397 * There is no good way to pass the error to the newly
398 * created socket, and POSIX does not want network
399 * errors returned from accept().
400 */
401 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402 tcp_listendrop(req->rsk_listener);
403 }
404 reqsk_put(req);
405}
406EXPORT_SYMBOL(tcp_req_err);
407
408/* TCP-LD (RFC 6069) logic */
409void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410{
411 struct inet_connection_sock *icsk = inet_csk(sk);
412 struct tcp_sock *tp = tcp_sk(sk);
413 struct sk_buff *skb;
414 s32 remaining;
415 u32 delta_us;
416
417 if (sock_owned_by_user(sk))
418 return;
419
420 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
421 !icsk->icsk_backoff)
422 return;
423
424 skb = tcp_rtx_queue_head(sk);
425 if (WARN_ON_ONCE(!skb))
426 return;
427
428 icsk->icsk_backoff--;
429 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431
432 tcp_mstamp_refresh(tp);
433 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435
436 if (remaining > 0) {
437 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 remaining, TCP_RTO_MAX);
439 } else {
440 /* RTO revert clocked out retransmission.
441 * Will retransmit now.
442 */
443 tcp_retransmit_timer(sk);
444 }
445}
446EXPORT_SYMBOL(tcp_ld_RTO_revert);
447
448/*
449 * This routine is called by the ICMP module when it gets some
450 * sort of error condition. If err < 0 then the socket should
451 * be closed and the error returned to the user. If err > 0
452 * it's just the icmp type << 8 | icmp code. After adjustment
453 * header points to the first 8 bytes of the tcp header. We need
454 * to find the appropriate port.
455 *
456 * The locking strategy used here is very "optimistic". When
457 * someone else accesses the socket the ICMP is just dropped
458 * and for some paths there is no check at all.
459 * A more general error queue, used to queue errors for later
460 * handling, would probably be better.
461 *
462 */
463
464int tcp_v4_err(struct sk_buff *skb, u32 info)
465{
466 const struct iphdr *iph = (const struct iphdr *)skb->data;
467 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct tcp_sock *tp;
469 struct inet_sock *inet;
470 const int type = icmp_hdr(skb)->type;
471 const int code = icmp_hdr(skb)->code;
472 struct sock *sk;
473 struct request_sock *fastopen;
474 u32 seq, snd_una;
475 int err;
476 struct net *net = dev_net(skb->dev);
477
478 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479 th->dest, iph->saddr, ntohs(th->source),
480 inet_iif(skb), 0);
481 if (!sk) {
482 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483 return -ENOENT;
484 }
485 if (sk->sk_state == TCP_TIME_WAIT) {
486 inet_twsk_put(inet_twsk(sk));
487 return 0;
488 }
489 seq = ntohl(th->seq);
490 if (sk->sk_state == TCP_NEW_SYN_RECV) {
491 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492 type == ICMP_TIME_EXCEEDED ||
493 (type == ICMP_DEST_UNREACH &&
494 (code == ICMP_NET_UNREACH ||
495 code == ICMP_HOST_UNREACH)));
496 return 0;
497 }
498
499 bh_lock_sock(sk);
500 /* If too many ICMPs get dropped on busy
501 * servers this needs to be solved differently.
502 * We do take care of PMTU discovery (RFC 1191) special case:
503 * we can receive locally generated ICMP messages while socket is held.
504 */
505 if (sock_owned_by_user(sk)) {
506 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 }
509 if (sk->sk_state == TCP_CLOSE)
510 goto out;
511
512 if (static_branch_unlikely(&ip4_min_ttl)) {
513 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
514 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516 goto out;
517 }
518 }
519
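	/* Only act on the ICMP error if the quoted sequence number falls
	 * inside the current send window [snd_una, snd_nxt]; for a Fast Open
	 * server still in SYN-RECV, snd_una is taken from the request's
	 * initial sequence number.
	 */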
520 tp = tcp_sk(sk);
521 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 goto out;
528 }
529
530 switch (type) {
531 case ICMP_REDIRECT:
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
534 goto out;
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
537 goto out;
538 case ICMP_PARAMETERPROB:
539 err = EPROTO;
540 break;
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
543 goto out;
544
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 /* We are not interested in TCP_LISTEN and open_requests
547 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
548 * they should go through unfragmented).
549 */
550 if (sk->sk_state == TCP_LISTEN)
551 goto out;
552
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
556 } else {
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 sock_hold(sk);
559 }
560 goto out;
561 }
562
563 err = icmp_err_convert[code].errno;
564 /* check if this ICMP message allows revert of backoff.
565 * (see RFC 6069)
566 */
567 if (!fastopen &&
568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
570 break;
571 case ICMP_TIME_EXCEEDED:
572 err = EHOSTUNREACH;
573 break;
574 default:
575 goto out;
576 }
577
578 switch (sk->sk_state) {
579 case TCP_SYN_SENT:
580 case TCP_SYN_RECV:
581 /* Only in fast or simultaneous open. If a fast open socket is
582 * already accepted it is treated as a connected one below.
583 */
584 if (fastopen && !fastopen->sk)
585 break;
586
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
600
601 /* If we've already connected we will keep trying
602 * until we time out, or the user gives up.
603 *
604 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
605 * to be considered hard errors (well, FRAG_FAILED too,
606 * but it is obsoleted by PMTU discovery).
607 *
608 * Note that in the modern internet, where routing is unreliable
609 * and broken firewalls sit in every dark corner sending random
610 * errors on behalf of their masters, even these two messages have
611 * lost their original meaning (even Linux sends invalid PORT_UNREACHs).
612 *
613 * Now we are in compliance with RFCs.
614 * --ANK (980905)
615 */
616
617 inet = inet_sk(sk);
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk_error_report(sk);
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
623 }
624
625out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629}
630
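/* Fill in the TCP pseudo-header checksum and point skb->csum_start /
 * skb->csum_offset at the TCP checksum field, so the device (or software
 * fallback) can finish the checksum, as in the CHECKSUM_PARTIAL scheme.
 */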
631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632{
633 struct tcphdr *th = tcp_hdr(skb);
634
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
638}
639
640/* This routine computes an IPv4 TCP checksum. */
641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642{
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646}
647EXPORT_SYMBOL(tcp_v4_send_check);
648
649/*
650 * This routine will send an RST to the other tcp.
651 *
652 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
653 * for the reset?
654 * Answer: if a packet caused the RST, it is not for a socket
655 * existing in our system; if it is matched to a socket,
656 * it is just a duplicate segment or a bug in the other side's TCP.
657 * So we build the reply based only on the parameters
658 * that arrived with the segment.
659 * Exception: precedence violation. We do not implement it in any case.
660 */
661
662#ifdef CONFIG_TCP_MD5SIG
663#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664#else
665#define OPTION_BYTES sizeof(__be32)
666#endif
667
668static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669{
670 const struct tcphdr *th = tcp_hdr(skb);
671 struct {
672 struct tcphdr th;
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674 } rep;
675 struct ip_reply_arg arg;
676#ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
680 int genhash;
681 struct sock *sk1 = NULL;
682#endif
683 u64 transmit_time = 0;
684 struct sock *ctl_sk;
685 struct net *net;
686
687 /* Never send a reset in response to a reset. */
688 if (th->rst)
689 return;
690
691 /* If sk is not NULL, it means we did a successful lookup and the
692 * incoming route had to be correct. prequeue might have dropped our dst.
693 */
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 return;
696
697 /* Swap the send and the receive. */
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
702 rep.th.rst = 1;
703
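	/* Per RFC 793 reset generation: if the incoming segment carried an
	 * ACK, the RST takes its sequence number from that ack_seq; otherwise
	 * the RST itself carries an ACK covering everything the offending
	 * segment occupied in sequence space.
	 */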
704 if (th->ack) {
705 rep.th.seq = th->ack_seq;
706 } else {
707 rep.th.ack = 1;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
710 }
711
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
715
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717#ifdef CONFIG_TCP_MD5SIG
718 rcu_read_lock();
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
722 int l3index;
723
724 /* sdif set, means packet ingressed via a device
725 * in an L3 domain and inet_iif is set to it.
726 */
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
734 int l3index;
735
736 /*
737 * The active side is lost. Try to find the listening socket via the
738 * source port, and then find the md5 key via that listening socket.
739 * We do not lose any security here:
740 * the incoming packet is checked against the md5 hash computed with the
741 * key we find; no RST is generated if the hashes don't match.
742 */
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 ip_hdr(skb)->saddr,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747 /* don't send rst if it can't find key */
748 if (!sk1)
749 goto out;
750
751 /* sdif set, means packet ingressed via a device
752 * in an L3 domain and dif is set to it.
753 */
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 if (!key)
758 goto out;
759
760
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 goto out;
764
765 }
766
767 if (key) {
768 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 (TCPOPT_NOP << 16) |
770 (TCPOPT_MD5SIG << 8) |
771 TCPOLEN_MD5SIG);
772 /* Update length and the length the header thinks exists */
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
775
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
779 }
780#endif
781 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
784
785 if (mrst) {
786 rep.opt[0] = mrst;
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
789 }
790 }
791
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr, /* XXX */
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
798 /* When the socket is gone, all binding information is lost and
799 * routing might fail. No choice here: if we force the input
800 * interface, we will misroute in case of an asymmetric route.
801 */
802 if (sk) {
803 arg.bound_dev_if = sk->sk_bound_dev_if;
804 if (sk_fullsock(sk))
805 trace_tcp_send_reset(sk, skb);
806 }
807
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 local_bh_disable();
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
816 if (sk) {
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
822 xfrm_sk_clone_policy(ctl_sk, sk);
823 }
824 ip_send_unicast_reply(ctl_sk,
825 skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 &arg, arg.iov[0].iov_len,
828 transmit_time);
829
830 ctl_sk->sk_mark = 0;
831 xfrm_sk_free_policy(ctl_sk);
832 sock_net_set(ctl_sk, &init_net);
833 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
834 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
835 local_bh_enable();
836
837#ifdef CONFIG_TCP_MD5SIG
838out:
839 rcu_read_unlock();
840#endif
841}
842
843/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
844 outside of socket context, is certainly ugly. What can I do?
845 */
846
847static void tcp_v4_send_ack(const struct sock *sk,
848 struct sk_buff *skb, u32 seq, u32 ack,
849 u32 win, u32 tsval, u32 tsecr, int oif,
850 struct tcp_md5sig_key *key,
851 int reply_flags, u8 tos)
852{
853 const struct tcphdr *th = tcp_hdr(skb);
854 struct {
855 struct tcphdr th;
856 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
857#ifdef CONFIG_TCP_MD5SIG
858 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
859#endif
860 ];
861 } rep;
862 struct net *net = sock_net(sk);
863 struct ip_reply_arg arg;
864 struct sock *ctl_sk;
865 u64 transmit_time;
866
867 memset(&rep.th, 0, sizeof(struct tcphdr));
868 memset(&arg, 0, sizeof(arg));
869
870 arg.iov[0].iov_base = (unsigned char *)&rep;
871 arg.iov[0].iov_len = sizeof(rep.th);
872 if (tsecr) {
873 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
874 (TCPOPT_TIMESTAMP << 8) |
875 TCPOLEN_TIMESTAMP);
876 rep.opt[1] = htonl(tsval);
877 rep.opt[2] = htonl(tsecr);
878 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
879 }
880
881 /* Swap the send and the receive. */
882 rep.th.dest = th->source;
883 rep.th.source = th->dest;
884 rep.th.doff = arg.iov[0].iov_len / 4;
885 rep.th.seq = htonl(seq);
886 rep.th.ack_seq = htonl(ack);
887 rep.th.ack = 1;
888 rep.th.window = htons(win);
889
890#ifdef CONFIG_TCP_MD5SIG
891 if (key) {
892 int offset = (tsecr) ? 3 : 0;
893
894 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
895 (TCPOPT_NOP << 16) |
896 (TCPOPT_MD5SIG << 8) |
897 TCPOLEN_MD5SIG);
898 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
899 rep.th.doff = arg.iov[0].iov_len/4;
900
901 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
902 key, ip_hdr(skb)->saddr,
903 ip_hdr(skb)->daddr, &rep.th);
904 }
905#endif
906 arg.flags = reply_flags;
907 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
908 ip_hdr(skb)->saddr, /* XXX */
909 arg.iov[0].iov_len, IPPROTO_TCP, 0);
910 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
911 if (oif)
912 arg.bound_dev_if = oif;
913 arg.tos = tos;
914 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
915 local_bh_disable();
916 ctl_sk = this_cpu_read(ipv4_tcp_sk);
917 sock_net_set(ctl_sk, net);
918 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
919 inet_twsk(sk)->tw_mark : sk->sk_mark;
920 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
921 inet_twsk(sk)->tw_priority : sk->sk_priority;
922 transmit_time = tcp_transmit_time(sk);
923 ip_send_unicast_reply(ctl_sk,
924 skb, &TCP_SKB_CB(skb)->header.h4.opt,
925 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
926 &arg, arg.iov[0].iov_len,
927 transmit_time);
928
929 ctl_sk->sk_mark = 0;
930 sock_net_set(ctl_sk, &init_net);
931 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
932 local_bh_enable();
933}
934
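/* Reply on behalf of a TIME-WAIT socket: the ACK is built from the stored
 * timewait state (tw_snd_nxt, tw_rcv_nxt, saved timestamps) rather than
 * from a full socket.
 */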
935static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
936{
937 struct inet_timewait_sock *tw = inet_twsk(sk);
938 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
939
940 tcp_v4_send_ack(sk, skb,
941 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
942 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
943 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
944 tcptw->tw_ts_recent,
945 tw->tw_bound_dev_if,
946 tcp_twsk_md5_key(tcptw),
947 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
948 tw->tw_tos
949 );
950
951 inet_twsk_put(tw);
952}
953
954static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
955 struct request_sock *req)
956{
957 const union tcp_md5_addr *addr;
958 int l3index;
959
960 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
961 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
962 */
963 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
964 tcp_sk(sk)->snd_nxt;
965
966 /* RFC 7323 2.3
967 * The window field (SEG.WND) of every outgoing segment, with the
968 * exception of <SYN> segments, MUST be right-shifted by
969 * Rcv.Wind.Shift bits:
970 */
971 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
972 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
973 tcp_v4_send_ack(sk, skb, seq,
974 tcp_rsk(req)->rcv_nxt,
975 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
976 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
977 req->ts_recent,
978 0,
979 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
980 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
981 ip_hdr(skb)->tos);
982}
983
984/*
985 * Send a SYN-ACK after having received a SYN.
986 * This still operates on a request_sock only, not on a big
987 * socket.
988 */
989static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
990 struct flowi *fl,
991 struct request_sock *req,
992 struct tcp_fastopen_cookie *foc,
993 enum tcp_synack_type synack_type,
994 struct sk_buff *syn_skb)
995{
996 const struct inet_request_sock *ireq = inet_rsk(req);
997 struct flowi4 fl4;
998 int err = -1;
999 struct sk_buff *skb;
1000 u8 tos;
1001
1002 /* First, grab a route. */
1003 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1004 return -1;
1005
1006 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1007
1008 if (skb) {
1009 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010
1011 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1012 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1013 (inet_sk(sk)->tos & INET_ECN_MASK) :
1014 inet_sk(sk)->tos;
1015
1016 if (!INET_ECN_is_capable(tos) &&
1017 tcp_bpf_ca_needs_ecn((struct sock *)req))
1018 tos |= INET_ECN_ECT_0;
1019
1020 rcu_read_lock();
1021 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022 ireq->ir_rmt_addr,
1023 rcu_dereference(ireq->ireq_opt),
1024 tos);
1025 rcu_read_unlock();
1026 err = net_xmit_eval(err);
1027 }
1028
1029 return err;
1030}
1031
1032/*
1033 * IPv4 request_sock destructor.
1034 */
1035static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036{
1037 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1038}
1039
1040#ifdef CONFIG_TCP_MD5SIG
1041/*
1042 * RFC2385 MD5 checksumming requires a mapping of
1043 * IP address->MD5 Key.
1044 * We need to maintain these in the sk structure.
1045 */
1046
1047DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1048EXPORT_SYMBOL(tcp_md5_needed);
1049
1050static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1051{
1052 if (!old)
1053 return true;
1054
1055 /* l3index always overrides non-l3index */
1056 if (old->l3index && new->l3index == 0)
1057 return false;
1058 if (old->l3index == 0 && new->l3index)
1059 return true;
1060
1061 return old->prefixlen < new->prefixlen;
1062}
1063
1064/* Find the Key structure for an address. */
1065struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1066 const union tcp_md5_addr *addr,
1067 int family)
1068{
1069 const struct tcp_sock *tp = tcp_sk(sk);
1070 struct tcp_md5sig_key *key;
1071 const struct tcp_md5sig_info *md5sig;
1072 __be32 mask;
1073 struct tcp_md5sig_key *best_match = NULL;
1074 bool match;
1075
1076 /* caller either holds rcu_read_lock() or socket lock */
1077 md5sig = rcu_dereference_check(tp->md5sig_info,
1078 lockdep_sock_is_held(sk));
1079 if (!md5sig)
1080 return NULL;
1081
1082 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1083 lockdep_sock_is_held(sk)) {
1084 if (key->family != family)
1085 continue;
1086 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087 continue;
1088 if (family == AF_INET) {
1089 mask = inet_make_mask(key->prefixlen);
1090 match = (key->addr.a4.s_addr & mask) ==
1091 (addr->a4.s_addr & mask);
1092#if IS_ENABLED(CONFIG_IPV6)
1093 } else if (family == AF_INET6) {
1094 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1095 key->prefixlen);
1096#endif
1097 } else {
1098 match = false;
1099 }
1100
1101 if (match && better_md5_match(best_match, key))
1102 best_match = key;
1103 }
1104 return best_match;
1105}
1106EXPORT_SYMBOL(__tcp_md5_do_lookup);
1107
1108static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1109 const union tcp_md5_addr *addr,
1110 int family, u8 prefixlen,
1111 int l3index, u8 flags)
1112{
1113 const struct tcp_sock *tp = tcp_sk(sk);
1114 struct tcp_md5sig_key *key;
1115 unsigned int size = sizeof(struct in_addr);
1116 const struct tcp_md5sig_info *md5sig;
1117
1118 /* caller either holds rcu_read_lock() or socket lock */
1119 md5sig = rcu_dereference_check(tp->md5sig_info,
1120 lockdep_sock_is_held(sk));
1121 if (!md5sig)
1122 return NULL;
1123#if IS_ENABLED(CONFIG_IPV6)
1124 if (family == AF_INET6)
1125 size = sizeof(struct in6_addr);
1126#endif
1127 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1128 lockdep_sock_is_held(sk)) {
1129 if (key->family != family)
1130 continue;
1131 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132 continue;
1133 if (key->l3index != l3index)
1134 continue;
1135 if (!memcmp(&key->addr, addr, size) &&
1136 key->prefixlen == prefixlen)
1137 return key;
1138 }
1139 return NULL;
1140}
1141
1142struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1143 const struct sock *addr_sk)
1144{
1145 const union tcp_md5_addr *addr;
1146 int l3index;
1147
1148 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1149 addr_sk->sk_bound_dev_if);
1150 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1151 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152}
1153EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154
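/* MD5 key lists are read under rcu_read_lock() in the packet paths, while
 * the additions and removals below run under the socket lock; hence the
 * _rcu list helpers and kfree_rcu() used throughout.
 */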
1155/* This can be called on a newly created socket, from other files */
1156int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1157 int family, u8 prefixlen, int l3index, u8 flags,
1158 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159{
1160 /* Add Key to the list */
1161 struct tcp_md5sig_key *key;
1162 struct tcp_sock *tp = tcp_sk(sk);
1163 struct tcp_md5sig_info *md5sig;
1164
1165 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166 if (key) {
1167 /* Pre-existing entry - just update that one.
1168 * Note that the key might be used concurrently.
1169 * data_race() tells KCSAN that we do not care about
1170 * key mismatches, since changing the MD5 key on live flows
1171 * can lead to packet drops.
1172 */
1173 data_race(memcpy(key->key, newkey, newkeylen));
1174
1175 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1176 * Also note that a reader could catch new key->keylen value
1177 * but old key->key[], this is the reason we use __GFP_ZERO
1178 * at sock_kmalloc() time below these lines.
1179 */
1180 WRITE_ONCE(key->keylen, newkeylen);
1181
1182 return 0;
1183 }
1184
1185 md5sig = rcu_dereference_protected(tp->md5sig_info,
1186 lockdep_sock_is_held(sk));
1187 if (!md5sig) {
1188 md5sig = kmalloc(sizeof(*md5sig), gfp);
1189 if (!md5sig)
1190 return -ENOMEM;
1191
1192 sk_gso_disable(sk);
1193 INIT_HLIST_HEAD(&md5sig->head);
1194 rcu_assign_pointer(tp->md5sig_info, md5sig);
1195 }
1196
1197 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1198 if (!key)
1199 return -ENOMEM;
1200 if (!tcp_alloc_md5sig_pool()) {
1201 sock_kfree_s(sk, key, sizeof(*key));
1202 return -ENOMEM;
1203 }
1204
1205 memcpy(key->key, newkey, newkeylen);
1206 key->keylen = newkeylen;
1207 key->family = family;
1208 key->prefixlen = prefixlen;
1209 key->l3index = l3index;
1210 key->flags = flags;
1211 memcpy(&key->addr, addr,
1212 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1213 sizeof(struct in_addr));
1214 hlist_add_head_rcu(&key->node, &md5sig->head);
1215 return 0;
1216}
1217EXPORT_SYMBOL(tcp_md5_do_add);
1218
1219int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1220 u8 prefixlen, int l3index, u8 flags)
1221{
1222 struct tcp_md5sig_key *key;
1223
1224 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1225 if (!key)
1226 return -ENOENT;
1227 hlist_del_rcu(&key->node);
1228 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229 kfree_rcu(key, rcu);
1230 return 0;
1231}
1232EXPORT_SYMBOL(tcp_md5_do_del);
1233
1234static void tcp_clear_md5_list(struct sock *sk)
1235{
1236 struct tcp_sock *tp = tcp_sk(sk);
1237 struct tcp_md5sig_key *key;
1238 struct hlist_node *n;
1239 struct tcp_md5sig_info *md5sig;
1240
1241 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242
1243 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1244 hlist_del_rcu(&key->node);
1245 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1246 kfree_rcu(key, rcu);
1247 }
1248}
1249
1250static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1251 sockptr_t optval, int optlen)
1252{
1253 struct tcp_md5sig cmd;
1254 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1255 const union tcp_md5_addr *addr;
1256 u8 prefixlen = 32;
1257 int l3index = 0;
1258 u8 flags;
1259
1260 if (optlen < sizeof(cmd))
1261 return -EINVAL;
1262
1263 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1264 return -EFAULT;
1265
1266 if (sin->sin_family != AF_INET)
1267 return -EINVAL;
1268
1269 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270
1271 if (optname == TCP_MD5SIG_EXT &&
1272 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1273 prefixlen = cmd.tcpm_prefixlen;
1274 if (prefixlen > 32)
1275 return -EINVAL;
1276 }
1277
1278 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1279 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1280 struct net_device *dev;
1281
1282 rcu_read_lock();
1283 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1284 if (dev && netif_is_l3_master(dev))
1285 l3index = dev->ifindex;
1286
1287 rcu_read_unlock();
1288
1289 /* ok to reference set/not set outside of rcu;
1290 * right now device MUST be an L3 master
1291 */
1292 if (!dev || !l3index)
1293 return -EINVAL;
1294 }
1295
1296 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297
1298 if (!cmd.tcpm_keylen)
1299 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300
1301 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1302 return -EINVAL;
1303
1304 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1305 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1306}
1307
1308static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1309 __be32 daddr, __be32 saddr,
1310 const struct tcphdr *th, int nbytes)
1311{
1312 struct tcp4_pseudohdr *bp;
1313 struct scatterlist sg;
1314 struct tcphdr *_th;
1315
1316 bp = hp->scratch;
1317 bp->saddr = saddr;
1318 bp->daddr = daddr;
1319 bp->pad = 0;
1320 bp->protocol = IPPROTO_TCP;
1321 bp->len = cpu_to_be16(nbytes);
1322
1323 _th = (struct tcphdr *)(bp + 1);
1324 memcpy(_th, th, sizeof(*th));
1325 _th->check = 0;
1326
1327 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1328 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1329 sizeof(*bp) + sizeof(*th));
1330 return crypto_ahash_update(hp->md5_req);
1331}
1332
1333static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1334 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335{
1336 struct tcp_md5sig_pool *hp;
1337 struct ahash_request *req;
1338
1339 hp = tcp_get_md5sig_pool();
1340 if (!hp)
1341 goto clear_hash_noput;
1342 req = hp->md5_req;
1343
1344 if (crypto_ahash_init(req))
1345 goto clear_hash;
1346 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347 goto clear_hash;
1348 if (tcp_md5_hash_key(hp, key))
1349 goto clear_hash;
1350 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1351 if (crypto_ahash_final(req))
1352 goto clear_hash;
1353
1354 tcp_put_md5sig_pool();
1355 return 0;
1356
1357clear_hash:
1358 tcp_put_md5sig_pool();
1359clear_hash_noput:
1360 memset(md5_hash, 0, 16);
1361 return 1;
1362}
1363
1364int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1365 const struct sock *sk,
1366 const struct sk_buff *skb)
1367{
1368 struct tcp_md5sig_pool *hp;
1369 struct ahash_request *req;
1370 const struct tcphdr *th = tcp_hdr(skb);
1371 __be32 saddr, daddr;
1372
1373 if (sk) { /* valid for establish/request sockets */
1374 saddr = sk->sk_rcv_saddr;
1375 daddr = sk->sk_daddr;
1376 } else {
1377 const struct iphdr *iph = ip_hdr(skb);
1378 saddr = iph->saddr;
1379 daddr = iph->daddr;
1380 }
1381
1382 hp = tcp_get_md5sig_pool();
1383 if (!hp)
1384 goto clear_hash_noput;
1385 req = hp->md5_req;
1386
1387 if (crypto_ahash_init(req))
1388 goto clear_hash;
1389
1390 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391 goto clear_hash;
1392 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393 goto clear_hash;
1394 if (tcp_md5_hash_key(hp, key))
1395 goto clear_hash;
1396 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1397 if (crypto_ahash_final(req))
1398 goto clear_hash;
1399
1400 tcp_put_md5sig_pool();
1401 return 0;
1402
1403clear_hash:
1404 tcp_put_md5sig_pool();
1405clear_hash_noput:
1406 memset(md5_hash, 0, 16);
1407 return 1;
1408}
1409EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1410
1411#endif
1412
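/* The request_sock ops below plug the IPv4-specific pieces (routing,
 * ISN/timestamp generation, SYN-ACK transmission, MD5 lookup) into the
 * address-family independent tcp_conn_request() path.
 */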
1413static void tcp_v4_init_req(struct request_sock *req,
1414 const struct sock *sk_listener,
1415 struct sk_buff *skb)
1416{
1417 struct inet_request_sock *ireq = inet_rsk(req);
1418 struct net *net = sock_net(sk_listener);
1419
1420 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1421 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1422 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1423}
1424
1425static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1426 struct sk_buff *skb,
1427 struct flowi *fl,
1428 struct request_sock *req)
1429{
1430 tcp_v4_init_req(req, sk, skb);
1431
1432 if (security_inet_conn_request(sk, skb, req))
1433 return NULL;
1434
1435 return inet_csk_route_req(sk, &fl->u.ip4, req);
1436}
1437
1438struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439 .family = PF_INET,
1440 .obj_size = sizeof(struct tcp_request_sock),
1441 .rtx_syn_ack = tcp_rtx_synack,
1442 .send_ack = tcp_v4_reqsk_send_ack,
1443 .destructor = tcp_v4_reqsk_destructor,
1444 .send_reset = tcp_v4_send_reset,
1445 .syn_ack_timeout = tcp_syn_ack_timeout,
1446};
1447
1448const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1449 .mss_clamp = TCP_MSS_DEFAULT,
1450#ifdef CONFIG_TCP_MD5SIG
1451 .req_md5_lookup = tcp_v4_md5_lookup,
1452 .calc_md5_hash = tcp_v4_md5_hash_skb,
1453#endif
1454#ifdef CONFIG_SYN_COOKIES
1455 .cookie_init_seq = cookie_v4_init_sequence,
1456#endif
1457 .route_req = tcp_v4_route_req,
1458 .init_seq = tcp_v4_init_seq,
1459 .init_ts_off = tcp_v4_init_ts_off,
1460 .send_synack = tcp_v4_send_synack,
1461};
1462
1463int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464{
1465 /* Never answer SYNs sent to broadcast or multicast */
1466 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1467 goto drop;
1468
1469 return tcp_conn_request(&tcp_request_sock_ops,
1470 &tcp_request_sock_ipv4_ops, sk, skb);
1471
1472drop:
1473 tcp_listendrop(sk);
1474 return 0;
1475}
1476EXPORT_SYMBOL(tcp_v4_conn_request);
1477
1478
1479/*
1480 * The three way handshake has completed - we got a valid synack -
1481 * now create the new socket.
1482 */
1483struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1484 struct request_sock *req,
1485 struct dst_entry *dst,
1486 struct request_sock *req_unhash,
1487 bool *own_req)
1488{
1489 struct inet_request_sock *ireq;
1490 bool found_dup_sk = false;
1491 struct inet_sock *newinet;
1492 struct tcp_sock *newtp;
1493 struct sock *newsk;
1494#ifdef CONFIG_TCP_MD5SIG
1495 const union tcp_md5_addr *addr;
1496 struct tcp_md5sig_key *key;
1497 int l3index;
1498#endif
1499 struct ip_options_rcu *inet_opt;
1500
1501 if (sk_acceptq_is_full(sk))
1502 goto exit_overflow;
1503
1504 newsk = tcp_create_openreq_child(sk, req, skb);
1505 if (!newsk)
1506 goto exit_nonewsk;
1507
1508 newsk->sk_gso_type = SKB_GSO_TCPV4;
1509 inet_sk_rx_dst_set(newsk, skb);
1510
1511 newtp = tcp_sk(newsk);
1512 newinet = inet_sk(newsk);
1513 ireq = inet_rsk(req);
1514 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1515 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1516 newsk->sk_bound_dev_if = ireq->ir_iif;
1517 newinet->inet_saddr = ireq->ir_loc_addr;
1518 inet_opt = rcu_dereference(ireq->ireq_opt);
1519 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1520 newinet->mc_index = inet_iif(skb);
1521 newinet->mc_ttl = ip_hdr(skb)->ttl;
1522 newinet->rcv_tos = ip_hdr(skb)->tos;
1523 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524 if (inet_opt)
1525 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1526 newinet->inet_id = prandom_u32();
1527
1528 /* Set ToS of the new socket based upon the value of incoming SYN.
1529 * ECT bits are set later in tcp_init_transfer().
1530 */
1531 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1532 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1533
1534 if (!dst) {
1535 dst = inet_csk_route_child_sock(sk, newsk, req);
1536 if (!dst)
1537 goto put_and_exit;
1538 } else {
1539 /* syncookie case : see end of cookie_v4_check() */
1540 }
1541 sk_setup_caps(newsk, dst);
1542
1543 tcp_ca_openreq_child(newsk, dst);
1544
1545 tcp_sync_mss(newsk, dst_mtu(dst));
1546 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547
1548 tcp_initialize_rcv_mss(newsk);
1549
1550#ifdef CONFIG_TCP_MD5SIG
1551 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1552 /* Copy over the MD5 key from the original socket */
1553 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1554 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1555 if (key) {
1556 /*
1557 * We're using one, so create a matching key
1558 * on the newsk structure. If we fail to get
1559 * memory, then we end up not copying the key
1560 * across. Shucks.
1561 */
1562 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1563 key->key, key->keylen, GFP_ATOMIC);
1564 sk_gso_disable(newsk);
1565 }
1566#endif
1567
1568 if (__inet_inherit_port(sk, newsk) < 0)
1569 goto put_and_exit;
1570 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571 &found_dup_sk);
1572 if (likely(*own_req)) {
1573 tcp_move_syn(newtp, req);
1574 ireq->ireq_opt = NULL;
1575 } else {
1576 newinet->inet_opt = NULL;
1577
1578 if (!req_unhash && found_dup_sk) {
1579 /* This code path should be executed only in the
1580 * syncookie case
1581 */
1582 bh_unlock_sock(newsk);
1583 sock_put(newsk);
1584 newsk = NULL;
1585 }
1586 }
1587 return newsk;
1588
1589exit_overflow:
1590 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1591exit_nonewsk:
1592 dst_release(dst);
1593exit:
1594 tcp_listendrop(sk);
1595 return NULL;
1596put_and_exit:
1597 newinet->inet_opt = NULL;
1598 inet_csk_prepare_forced_close(newsk);
1599 tcp_done(newsk);
1600 goto exit;
1601}
1602EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603
1604static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1605{
1606#ifdef CONFIG_SYN_COOKIES
1607 const struct tcphdr *th = tcp_hdr(skb);
1608
1609 if (!th->syn)
1610 sk = cookie_v4_check(sk, skb);
1611#endif
1612 return sk;
1613}
1614
1615u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1616 struct tcphdr *th, u32 *cookie)
1617{
1618 u16 mss = 0;
1619#ifdef CONFIG_SYN_COOKIES
1620 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1621 &tcp_request_sock_ipv4_ops, sk, th);
1622 if (mss) {
1623 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1624 tcp_synq_overflow(sk);
1625 }
1626#endif
1627 return mss;
1628}
1629
1630INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631 u32));
1632/* The socket must have its spinlock held when we get
1633 * here, unless it is a TCP_LISTEN socket.
1634 *
1635 * We have a potential double-lock case here, so even when
1636 * doing backlog processing we use the BH locking scheme.
1637 * This is because we cannot sleep with the original spinlock
1638 * held.
1639 */
1640int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641{
1642 enum skb_drop_reason reason;
1643 struct sock *rsk;
1644
1645 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1646 struct dst_entry *dst;
1647
1648 dst = rcu_dereference_protected(sk->sk_rx_dst,
1649 lockdep_sock_is_held(sk));
1650
1651 sock_rps_save_rxhash(sk, skb);
1652 sk_mark_napi_id(sk, skb);
1653 if (dst) {
1654 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1655 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656 dst, 0)) {
1657 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1658 dst_release(dst);
1659 }
1660 }
1661 tcp_rcv_established(sk, skb);
1662 return 0;
1663 }
1664
1665 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1666 if (tcp_checksum_complete(skb))
1667 goto csum_err;
1668
1669 if (sk->sk_state == TCP_LISTEN) {
1670 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1671
1672 if (!nsk)
1673 goto discard;
1674 if (nsk != sk) {
1675 if (tcp_child_process(sk, nsk, skb)) {
1676 rsk = nsk;
1677 goto reset;
1678 }
1679 return 0;
1680 }
1681 } else
1682 sock_rps_save_rxhash(sk, skb);
1683
1684 if (tcp_rcv_state_process(sk, skb)) {
1685 rsk = sk;
1686 goto reset;
1687 }
1688 return 0;
1689
1690reset:
1691 tcp_v4_send_reset(rsk, skb);
1692discard:
1693 kfree_skb_reason(skb, reason);
1694 /* Be careful here. If this function gets more complicated and
1695 * gcc suffers from register pressure on the x86, sk (in %ebx)
1696 * might be destroyed here. This current version compiles correctly,
1697 * but you have been warned.
1698 */
1699 return 0;
1700
1701csum_err:
1702 reason = SKB_DROP_REASON_TCP_CSUM;
1703 trace_tcp_bad_csum(skb);
1704 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1705 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1706 goto discard;
1707}
1708EXPORT_SYMBOL(tcp_v4_do_rcv);
1709
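/* Early demux: at IP receive time, look up an established socket for this
 * segment and attach it (and its cached rx dst, when still valid) to the
 * skb so the main receive path can skip another lookup.
 */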
1710int tcp_v4_early_demux(struct sk_buff *skb)
1711{
1712 const struct iphdr *iph;
1713 const struct tcphdr *th;
1714 struct sock *sk;
1715
1716 if (skb->pkt_type != PACKET_HOST)
1717 return 0;
1718
1719 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1720 return 0;
1721
1722 iph = ip_hdr(skb);
1723 th = tcp_hdr(skb);
1724
1725 if (th->doff < sizeof(struct tcphdr) / 4)
1726 return 0;
1727
1728 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1729 iph->saddr, th->source,
1730 iph->daddr, ntohs(th->dest),
1731 skb->skb_iif, inet_sdif(skb));
1732 if (sk) {
1733 skb->sk = sk;
1734 skb->destructor = sock_edemux;
1735 if (sk_fullsock(sk)) {
1736 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1737
1738 if (dst)
1739 dst = dst_check(dst, 0);
1740 if (dst &&
1741 sk->sk_rx_dst_ifindex == skb->skb_iif)
1742 skb_dst_set_noref(skb, dst);
1743 }
1744 }
1745 return 0;
1746}
1747
1748bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1749 enum skb_drop_reason *reason)
1750{
1751 u32 limit, tail_gso_size, tail_gso_segs;
1752 struct skb_shared_info *shinfo;
1753 const struct tcphdr *th;
1754 struct tcphdr *thtail;
1755 struct sk_buff *tail;
1756 unsigned int hdrlen;
1757 bool fragstolen;
1758 u32 gso_segs;
1759 u32 gso_size;
1760 int delta;
1761
1762 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1763 * we can fix skb->truesize to its real value to avoid future drops.
1764 * This is valid because skb is not yet charged to the socket.
1765 * It has been noticed pure SACK packets were sometimes dropped
1766 * (if cooked by drivers without copybreak feature).
1767 */
1768 skb_condense(skb);
1769
1770 skb_dst_drop(skb);
1771
1772 if (unlikely(tcp_checksum_complete(skb))) {
1773 bh_unlock_sock(sk);
1774 trace_tcp_bad_csum(skb);
1775 *reason = SKB_DROP_REASON_TCP_CSUM;
1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778 return true;
1779 }
1780
1781 /* Attempt coalescing to last skb in backlog, even if we are
1782 * above the limits.
1783 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784 */
1785 th = (const struct tcphdr *)skb->data;
1786 hdrlen = th->doff * 4;
1787
1788 tail = sk->sk_backlog.tail;
1789 if (!tail)
1790 goto no_coalesce;
1791 thtail = (struct tcphdr *)tail->data;
1792
1793 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1794 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1795 ((TCP_SKB_CB(tail)->tcp_flags |
1796 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1797 !((TCP_SKB_CB(tail)->tcp_flags &
1798 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1799 ((TCP_SKB_CB(tail)->tcp_flags ^
1800 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1801#ifdef CONFIG_TLS_DEVICE
1802 tail->decrypted != skb->decrypted ||
1803#endif
1804 thtail->doff != th->doff ||
1805 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1806 goto no_coalesce;
1807
1808 __skb_pull(skb, hdrlen);
1809
1810 shinfo = skb_shinfo(skb);
1811 gso_size = shinfo->gso_size ?: skb->len;
1812 gso_segs = shinfo->gso_segs ?: 1;
1813
1814 shinfo = skb_shinfo(tail);
1815 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1816 tail_gso_segs = shinfo->gso_segs ?: 1;
1817
1818 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1819 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820
1821 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1822 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1823 thtail->window = th->window;
1824 }
1825
1826 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1827 * thtail->fin, so that the fast path in tcp_rcv_established()
1828 * is not entered if we append a packet with a FIN.
1829 * SYN, RST, URG are not present.
1830 * ACK is set on both packets.
1831 * PSH : we do not really care in TCP stack,
1832 * at least for 'GRO' packets.
1833 */
1834 thtail->fin |= th->fin;
1835 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836
1837 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1838 TCP_SKB_CB(tail)->has_rxtstamp = true;
1839 tail->tstamp = skb->tstamp;
1840 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841 }
1842
1843 /* Not as strict as GRO. We only need to carry mss max value */
1844 shinfo->gso_size = max(gso_size, tail_gso_size);
1845 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846
1847 sk->sk_backlog.len += delta;
1848 __NET_INC_STATS(sock_net(sk),
1849 LINUX_MIB_TCPBACKLOGCOALESCE);
1850 kfree_skb_partial(skb, fragstolen);
1851 return false;
1852 }
1853 __skb_push(skb, hdrlen);
1854
1855no_coalesce:
1856 /* Only the socket owner can try to collapse/prune rx queues
1857 * to reduce memory overhead, so add a little headroom here.
1858 * Only a few socket backlogs are likely to be non-empty at the same time.
1859 */
1860 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1861
1862 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1863 bh_unlock_sock(sk);
1864 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1865 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1866 return true;
1867 }
1868 return false;
1869}
1870EXPORT_SYMBOL(tcp_add_backlog);
1871
1872int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873{
1874 struct tcphdr *th = (struct tcphdr *)skb->data;
1875
1876 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877}
1878EXPORT_SYMBOL(tcp_filter);
1879
1880static void tcp_v4_restore_cb(struct sk_buff *skb)
1881{
1882 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1883 sizeof(struct inet_skb_parm));
1884}
1885
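/* IPCB(skb) and TCP_SKB_CB(skb) share skb->cb[]; tcp_v4_fill_cb() saves the
 * IP control block into header.h4 before the TCP fields are written, so
 * tcp_v4_restore_cb() can put it back when the skb must be re-processed
 * (e.g. fed to a different socket after a lookup race).
 */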
1886static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1887 const struct tcphdr *th)
1888{
1889 /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1890 * barrier() makes sure the compiler won't play fool^Waliasing games.
1891 */
1892 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1893 sizeof(struct inet_skb_parm));
1894 barrier();
1895
1896 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898 skb->len - th->doff * 4);
1899 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1900 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1901 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1902 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1903 TCP_SKB_CB(skb)->sacked = 0;
1904 TCP_SKB_CB(skb)->has_rxtstamp =
1905 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1906}
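
/* [Editor's note -- illustrative only, not part of the kernel source]
 * A worked example of the end_seq arithmetic above, with made-up numbers:
 * SYN and FIN each consume one sequence number, so a segment with
 * seq = 1000, a 20-byte TCP header (doff = 5), 100 bytes of payload
 * (skb->len = 120) and the FIN flag set yields
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + 120 - 20 = 1101
 *
 * while a pure SYN with no payload ends up with end_seq = seq + 1.
 */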
1907
1908/*
1909 * From tcp_input.c
1910 */
1911
1912int tcp_v4_rcv(struct sk_buff *skb)
1913{
1914 struct net *net = dev_net(skb->dev);
1915 enum skb_drop_reason drop_reason;
1916 int sdif = inet_sdif(skb);
1917 int dif = inet_iif(skb);
1918 const struct iphdr *iph;
1919 const struct tcphdr *th;
1920 bool refcounted;
1921 struct sock *sk;
1922 int ret;
1923
1924 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1925 if (skb->pkt_type != PACKET_HOST)
1926 goto discard_it;
1927
1928 /* Count it even if it's bad */
1929 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1930
1931 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1932 goto discard_it;
1933
1934 th = (const struct tcphdr *)skb->data;
1935
1936 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1937 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1938 goto bad_packet;
1939 }
1940 if (!pskb_may_pull(skb, th->doff * 4))
1941 goto discard_it;
1942
1943 /* An explanation is required here, I think.
1944 * Packet length and doff are validated by header prediction,
1945 * provided the th->doff == 0 case has been eliminated.
1946 * So, we defer the checks. */
1947
1948 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1949 goto csum_error;
1950
1951 th = (const struct tcphdr *)skb->data;
1952 iph = ip_hdr(skb);
1953lookup:
1954 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1955 th->dest, sdif, &refcounted);
1956 if (!sk)
1957 goto no_tcp_socket;
1958
1959process:
1960 if (sk->sk_state == TCP_TIME_WAIT)
1961 goto do_time_wait;
1962
1963 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1964 struct request_sock *req = inet_reqsk(sk);
1965 bool req_stolen = false;
1966 struct sock *nsk;
1967
1968 sk = req->rsk_listener;
1969 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1970 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1971 else
1972 drop_reason = tcp_inbound_md5_hash(sk, skb,
1973 &iph->saddr, &iph->daddr,
1974 AF_INET, dif, sdif);
1975 if (unlikely(drop_reason)) {
1976 sk_drops_add(sk, skb);
1977 reqsk_put(req);
1978 goto discard_it;
1979 }
1980 if (tcp_checksum_complete(skb)) {
1981 reqsk_put(req);
1982 goto csum_error;
1983 }
1984 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1985 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1986 if (!nsk) {
1987 inet_csk_reqsk_queue_drop_and_put(sk, req);
1988 goto lookup;
1989 }
1990 sk = nsk;
1991 /* reuseport_migrate_sock() has already taken one sk_refcnt
1992 * reference before returning.
1993 */
1994 } else {
1995 /* We own a reference on the listener; increase it again
1996 * as we might lose it too soon.
1997 */
1998 sock_hold(sk);
1999 }
2000 refcounted = true;
2001 nsk = NULL;
2002 if (!tcp_filter(sk, skb)) {
2003 th = (const struct tcphdr *)skb->data;
2004 iph = ip_hdr(skb);
2005 tcp_v4_fill_cb(skb, iph, th);
2006 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2007 } else {
2008 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2009 }
2010 if (!nsk) {
2011 reqsk_put(req);
2012 if (req_stolen) {
2013 /* Another cpu got exclusive access to req
2014 * and created a full-blown socket.
2015 * Try to feed this packet to that socket
2016 * instead of discarding it.
2017 */
2018 tcp_v4_restore_cb(skb);
2019 sock_put(sk);
2020 goto lookup;
2021 }
2022 goto discard_and_relse;
2023 }
2024 nf_reset_ct(skb);
2025 if (nsk == sk) {
2026 reqsk_put(req);
2027 tcp_v4_restore_cb(skb);
2028 } else if (tcp_child_process(sk, nsk, skb)) {
2029 tcp_v4_send_reset(nsk, skb);
2030 goto discard_and_relse;
2031 } else {
2032 sock_put(sk);
2033 return 0;
2034 }
2035 }
2036
2037 if (static_branch_unlikely(&ip4_min_ttl)) {
2038 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2039 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2040 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2041 goto discard_and_relse;
2042 }
2043 }
2044
2045 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2046 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2047 goto discard_and_relse;
2048 }
2049
2050 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2051 &iph->daddr, AF_INET, dif, sdif);
2052 if (drop_reason)
2053 goto discard_and_relse;
2054
2055 nf_reset_ct(skb);
2056
2057 if (tcp_filter(sk, skb)) {
2058 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2059 goto discard_and_relse;
2060 }
2061 th = (const struct tcphdr *)skb->data;
2062 iph = ip_hdr(skb);
2063 tcp_v4_fill_cb(skb, iph, th);
2064
2065 skb->dev = NULL;
2066
2067 if (sk->sk_state == TCP_LISTEN) {
2068 ret = tcp_v4_do_rcv(sk, skb);
2069 goto put_and_return;
2070 }
2071
2072 sk_incoming_cpu_update(sk);
2073
2074 bh_lock_sock_nested(sk);
2075 tcp_segs_in(tcp_sk(sk), skb);
2076 ret = 0;
2077 if (!sock_owned_by_user(sk)) {
2078 ret = tcp_v4_do_rcv(sk, skb);
2079 } else {
2080 if (tcp_add_backlog(sk, skb, &drop_reason))
2081 goto discard_and_relse;
2082 }
2083 bh_unlock_sock(sk);
2084
2085put_and_return:
2086 if (refcounted)
2087 sock_put(sk);
2088
2089 return ret;
2090
2091no_tcp_socket:
2092 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2093 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2094 goto discard_it;
2095
2096 tcp_v4_fill_cb(skb, iph, th);
2097
2098 if (tcp_checksum_complete(skb)) {
2099csum_error:
2100 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2101 trace_tcp_bad_csum(skb);
2102 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2103bad_packet:
2104 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2105 } else {
2106 tcp_v4_send_reset(NULL, skb);
2107 }
2108
2109discard_it:
2110 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2111 /* Discard frame. */
2112 kfree_skb_reason(skb, drop_reason);
2113 return 0;
2114
2115discard_and_relse:
2116 sk_drops_add(sk, skb);
2117 if (refcounted)
2118 sock_put(sk);
2119 goto discard_it;
2120
2121do_time_wait:
2122 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2123 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2124 inet_twsk_put(inet_twsk(sk));
2125 goto discard_it;
2126 }
2127
2128 tcp_v4_fill_cb(skb, iph, th);
2129
2130 if (tcp_checksum_complete(skb)) {
2131 inet_twsk_put(inet_twsk(sk));
2132 goto csum_error;
2133 }
2134 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2135 case TCP_TW_SYN: {
2136 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2137 &tcp_hashinfo, skb,
2138 __tcp_hdrlen(th),
2139 iph->saddr, th->source,
2140 iph->daddr, th->dest,
2141 inet_iif(skb),
2142 sdif);
2143 if (sk2) {
2144 inet_twsk_deschedule_put(inet_twsk(sk));
2145 sk = sk2;
2146 tcp_v4_restore_cb(skb);
2147 refcounted = false;
2148 goto process;
2149 }
2150 }
2151 /* to ACK */
2152 fallthrough;
2153 case TCP_TW_ACK:
2154 tcp_v4_timewait_ack(sk, skb);
2155 break;
2156 case TCP_TW_RST:
2157 tcp_v4_send_reset(sk, skb);
2158 inet_twsk_deschedule_put(inet_twsk(sk));
2159 goto discard_it;
2160 case TCP_TW_SUCCESS:;
2161 }
2162 goto discard_it;
2163}
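
/* [Editor's note -- descriptive summary, not part of the kernel source]
 * tcp_v4_rcv() above is the IPv4 receive entry point: it validates the
 * header and checksum, looks the skb up in tcp_hashinfo, completes the
 * 3WHS for TCP_NEW_SYN_RECV request sockets via tcp_check_req(), hands
 * TIME_WAIT sockets to tcp_timewait_state_process(), and for full
 * sockets applies the min-TTL, XFRM policy, MD5 and socket-filter
 * checks before either processing the segment directly with
 * tcp_v4_do_rcv() (socket not owned by user) or queueing it to the
 * backlog with tcp_add_backlog() for the owner to process on release.
 */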
2164
2165static struct timewait_sock_ops tcp_timewait_sock_ops = {
2166 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2167 .twsk_unique = tcp_twsk_unique,
2168 .twsk_destructor= tcp_twsk_destructor,
2169};
2170
2171void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2172{
2173 struct dst_entry *dst = skb_dst(skb);
2174
2175 if (dst && dst_hold_safe(dst)) {
2176 rcu_assign_pointer(sk->sk_rx_dst, dst);
2177 sk->sk_rx_dst_ifindex = skb->skb_iif;
2178 }
2179}
2180EXPORT_SYMBOL(inet_sk_rx_dst_set);
2181
2182const struct inet_connection_sock_af_ops ipv4_specific = {
2183 .queue_xmit = ip_queue_xmit,
2184 .send_check = tcp_v4_send_check,
2185 .rebuild_header = inet_sk_rebuild_header,
2186 .sk_rx_dst_set = inet_sk_rx_dst_set,
2187 .conn_request = tcp_v4_conn_request,
2188 .syn_recv_sock = tcp_v4_syn_recv_sock,
2189 .net_header_len = sizeof(struct iphdr),
2190 .setsockopt = ip_setsockopt,
2191 .getsockopt = ip_getsockopt,
2192 .addr2sockaddr = inet_csk_addr2sockaddr,
2193 .sockaddr_len = sizeof(struct sockaddr_in),
2194 .mtu_reduced = tcp_v4_mtu_reduced,
2195};
2196EXPORT_SYMBOL(ipv4_specific);
2197
2198#ifdef CONFIG_TCP_MD5SIG
2199static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2200 .md5_lookup = tcp_v4_md5_lookup,
2201 .calc_md5_hash = tcp_v4_md5_hash_skb,
2202 .md5_parse = tcp_v4_parse_md5_keys,
2203};
2204#endif
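
/* [Editor's note -- illustrative only, not part of the kernel source]
 * The ops above back the RFC 2385 TCP MD5 signature option.  A hedged
 * userspace sketch of installing a key, assuming the UAPI layout of
 * struct tcp_md5sig from <linux/tcp.h> ('fd' and the peer address are
 * placeholders):
 *
 *	#include <linux/tcp.h>
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */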
2205
2206/* NOTE: A lot of fields are already set to zero by the call to
2207 * sk_alloc(), so they need not be initialized here.
2208 */
2209static int tcp_v4_init_sock(struct sock *sk)
2210{
2211 struct inet_connection_sock *icsk = inet_csk(sk);
2212
2213 tcp_init_sock(sk);
2214
2215 icsk->icsk_af_ops = &ipv4_specific;
2216
2217#ifdef CONFIG_TCP_MD5SIG
2218 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2219#endif
2220
2221 return 0;
2222}
2223
2224void tcp_v4_destroy_sock(struct sock *sk)
2225{
2226 struct tcp_sock *tp = tcp_sk(sk);
2227
2228 trace_tcp_destroy_sock(sk);
2229
2230 tcp_clear_xmit_timers(sk);
2231
2232 tcp_cleanup_congestion_control(sk);
2233
2234 tcp_cleanup_ulp(sk);
2235
2236 /* Clean up the write buffer. */
2237 tcp_write_queue_purge(sk);
2238
2239 /* Check if we want to disable active TFO */
2240 tcp_fastopen_active_disable_ofo_check(sk);
2241
2242 /* Cleans up our, hopefully empty, out_of_order_queue. */
2243 skb_rbtree_purge(&tp->out_of_order_queue);
2244
2245#ifdef CONFIG_TCP_MD5SIG
2246 /* Clean up the MD5 key list, if any */
2247 if (tp->md5sig_info) {
2248 tcp_clear_md5_list(sk);
2249 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2250 tp->md5sig_info = NULL;
2251 }
2252#endif
2253
2254 /* Clean up a referenced TCP bind bucket. */
2255 if (inet_csk(sk)->icsk_bind_hash)
2256 inet_put_port(sk);
2257
2258 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2259
2260 /* If socket is aborted during connect operation */
2261 tcp_free_fastopen_req(tp);
2262 tcp_fastopen_destroy_cipher(sk);
2263 tcp_saved_syn_free(tp);
2264
2265 sk_sockets_allocated_dec(sk);
2266}
2267EXPORT_SYMBOL(tcp_v4_destroy_sock);
2268
2269#ifdef CONFIG_PROC_FS
2270/* Proc filesystem TCP sock list dumping. */
2271
2272static unsigned short seq_file_family(const struct seq_file *seq);
2273
2274static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2275{
2276 unsigned short family = seq_file_family(seq);
2277
2278 /* AF_UNSPEC is used as a match-all */
2279 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2280 net_eq(sock_net(sk), seq_file_net(seq)));
2281}
2282
2283/* Find a non-empty bucket (starting from st->bucket)
2284 * and return the first sk from it.
2285 */
2286static void *listening_get_first(struct seq_file *seq)
2287{
2288 struct tcp_iter_state *st = seq->private;
2289
2290 st->offset = 0;
2291 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2292 struct inet_listen_hashbucket *ilb2;
2293 struct hlist_nulls_node *node;
2294 struct sock *sk;
2295
2296 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2297 if (hlist_nulls_empty(&ilb2->nulls_head))
2298 continue;
2299
2300 spin_lock(&ilb2->lock);
2301 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2302 if (seq_sk_match(seq, sk))
2303 return sk;
2304 }
2305 spin_unlock(&ilb2->lock);
2306 }
2307
2308 return NULL;
2309}
2310
2311/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2312 * If "cur" is the last one in the st->bucket,
2313 * call listening_get_first() to return the first sk of the next
2314 * non empty bucket.
2315 */
2316static void *listening_get_next(struct seq_file *seq, void *cur)
2317{
2318 struct tcp_iter_state *st = seq->private;
2319 struct inet_listen_hashbucket *ilb2;
2320 struct hlist_nulls_node *node;
2321 struct sock *sk = cur;
2322
2323 ++st->num;
2324 ++st->offset;
2325
2326 sk = sk_nulls_next(sk);
2327 sk_nulls_for_each_from(sk, node) {
2328 if (seq_sk_match(seq, sk))
2329 return sk;
2330 }
2331
2332 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2333 spin_unlock(&ilb2->lock);
2334 ++st->bucket;
2335 return listening_get_first(seq);
2336}
2337
2338static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2339{
2340 struct tcp_iter_state *st = seq->private;
2341 void *rc;
2342
2343 st->bucket = 0;
2344 st->offset = 0;
2345 rc = listening_get_first(seq);
2346
2347 while (rc && *pos) {
2348 rc = listening_get_next(seq, rc);
2349 --*pos;
2350 }
2351 return rc;
2352}
2353
2354static inline bool empty_bucket(const struct tcp_iter_state *st)
2355{
2356 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2357}
2358
2359/*
2360 * Get first established socket starting from bucket given in st->bucket.
2361 * If st->bucket is zero, the very first socket in the hash is returned.
2362 */
2363static void *established_get_first(struct seq_file *seq)
2364{
2365 struct tcp_iter_state *st = seq->private;
2366
2367 st->offset = 0;
2368 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2369 struct sock *sk;
2370 struct hlist_nulls_node *node;
2371 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2372
2373 /* Lockless fast path for the common case of empty buckets */
2374 if (empty_bucket(st))
2375 continue;
2376
2377 spin_lock_bh(lock);
2378 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2379 if (seq_sk_match(seq, sk))
2380 return sk;
2381 }
2382 spin_unlock_bh(lock);
2383 }
2384
2385 return NULL;
2386}
2387
2388static void *established_get_next(struct seq_file *seq, void *cur)
2389{
2390 struct sock *sk = cur;
2391 struct hlist_nulls_node *node;
2392 struct tcp_iter_state *st = seq->private;
2393
2394 ++st->num;
2395 ++st->offset;
2396
2397 sk = sk_nulls_next(sk);
2398
2399 sk_nulls_for_each_from(sk, node) {
2400 if (seq_sk_match(seq, sk))
2401 return sk;
2402 }
2403
2404 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2405 ++st->bucket;
2406 return established_get_first(seq);
2407}
2408
2409static void *established_get_idx(struct seq_file *seq, loff_t pos)
2410{
2411 struct tcp_iter_state *st = seq->private;
2412 void *rc;
2413
2414 st->bucket = 0;
2415 rc = established_get_first(seq);
2416
2417 while (rc && pos) {
2418 rc = established_get_next(seq, rc);
2419 --pos;
2420 }
2421 return rc;
2422}
2423
2424static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2425{
2426 void *rc;
2427 struct tcp_iter_state *st = seq->private;
2428
2429 st->state = TCP_SEQ_STATE_LISTENING;
2430 rc = listening_get_idx(seq, &pos);
2431
2432 if (!rc) {
2433 st->state = TCP_SEQ_STATE_ESTABLISHED;
2434 rc = established_get_idx(seq, pos);
2435 }
2436
2437 return rc;
2438}
2439
2440static void *tcp_seek_last_pos(struct seq_file *seq)
2441{
2442 struct tcp_iter_state *st = seq->private;
2443 int bucket = st->bucket;
2444 int offset = st->offset;
2445 int orig_num = st->num;
2446 void *rc = NULL;
2447
2448 switch (st->state) {
2449 case TCP_SEQ_STATE_LISTENING:
2450 if (st->bucket > tcp_hashinfo.lhash2_mask)
2451 break;
2452 st->state = TCP_SEQ_STATE_LISTENING;
2453 rc = listening_get_first(seq);
2454 while (offset-- && rc && bucket == st->bucket)
2455 rc = listening_get_next(seq, rc);
2456 if (rc)
2457 break;
2458 st->bucket = 0;
2459 st->state = TCP_SEQ_STATE_ESTABLISHED;
2460 fallthrough;
2461 case TCP_SEQ_STATE_ESTABLISHED:
2462 if (st->bucket > tcp_hashinfo.ehash_mask)
2463 break;
2464 rc = established_get_first(seq);
2465 while (offset-- && rc && bucket == st->bucket)
2466 rc = established_get_next(seq, rc);
2467 }
2468
2469 st->num = orig_num;
2470
2471 return rc;
2472}
2473
2474void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2475{
2476 struct tcp_iter_state *st = seq->private;
2477 void *rc;
2478
2479 if (*pos && *pos == st->last_pos) {
2480 rc = tcp_seek_last_pos(seq);
2481 if (rc)
2482 goto out;
2483 }
2484
2485 st->state = TCP_SEQ_STATE_LISTENING;
2486 st->num = 0;
2487 st->bucket = 0;
2488 st->offset = 0;
2489 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2490
2491out:
2492 st->last_pos = *pos;
2493 return rc;
2494}
2495EXPORT_SYMBOL(tcp_seq_start);
2496
2497void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2498{
2499 struct tcp_iter_state *st = seq->private;
2500 void *rc = NULL;
2501
2502 if (v == SEQ_START_TOKEN) {
2503 rc = tcp_get_idx(seq, 0);
2504 goto out;
2505 }
2506
2507 switch (st->state) {
2508 case TCP_SEQ_STATE_LISTENING:
2509 rc = listening_get_next(seq, v);
2510 if (!rc) {
2511 st->state = TCP_SEQ_STATE_ESTABLISHED;
2512 st->bucket = 0;
2513 st->offset = 0;
2514 rc = established_get_first(seq);
2515 }
2516 break;
2517 case TCP_SEQ_STATE_ESTABLISHED:
2518 rc = established_get_next(seq, v);
2519 break;
2520 }
2521out:
2522 ++*pos;
2523 st->last_pos = *pos;
2524 return rc;
2525}
2526EXPORT_SYMBOL(tcp_seq_next);
2527
2528void tcp_seq_stop(struct seq_file *seq, void *v)
2529{
2530 struct tcp_iter_state *st = seq->private;
2531
2532 switch (st->state) {
2533 case TCP_SEQ_STATE_LISTENING:
2534 if (v != SEQ_START_TOKEN)
2535 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2536 break;
2537 case TCP_SEQ_STATE_ESTABLISHED:
2538 if (v)
2539 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2540 break;
2541 }
2542}
2543EXPORT_SYMBOL(tcp_seq_stop);
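
/* [Editor's note -- descriptive summary, not part of the kernel source]
 * Locking hand-off for the iterators above: listening_get_first() and
 * established_get_first() return with the bucket lock held; the matching
 * *_get_next() helpers drop it only when they move on to the next bucket,
 * and tcp_seq_stop() releases whichever bucket lock is still held when
 * the walk stops in the middle of a bucket.
 */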
2544
2545static void get_openreq4(const struct request_sock *req,
2546 struct seq_file *f, int i)
2547{
2548 const struct inet_request_sock *ireq = inet_rsk(req);
2549 long delta = req->rsk_timer.expires - jiffies;
2550
2551 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2552 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2553 i,
2554 ireq->ir_loc_addr,
2555 ireq->ir_num,
2556 ireq->ir_rmt_addr,
2557 ntohs(ireq->ir_rmt_port),
2558 TCP_SYN_RECV,
2559 0, 0, /* could print option size, but that is af dependent. */
2560 1, /* timers active (only the expire timer) */
2561 jiffies_delta_to_clock_t(delta),
2562 req->num_timeout,
2563 from_kuid_munged(seq_user_ns(f),
2564 sock_i_uid(req->rsk_listener)),
2565 0, /* non standard timer */
2566 0, /* open_requests have no inode */
2567 0,
2568 req);
2569}
2570
2571static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2572{
2573 int timer_active;
2574 unsigned long timer_expires;
2575 const struct tcp_sock *tp = tcp_sk(sk);
2576 const struct inet_connection_sock *icsk = inet_csk(sk);
2577 const struct inet_sock *inet = inet_sk(sk);
2578 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2579 __be32 dest = inet->inet_daddr;
2580 __be32 src = inet->inet_rcv_saddr;
2581 __u16 destp = ntohs(inet->inet_dport);
2582 __u16 srcp = ntohs(inet->inet_sport);
2583 int rx_queue;
2584 int state;
2585
2586 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2587 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2588 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2589 timer_active = 1;
2590 timer_expires = icsk->icsk_timeout;
2591 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2592 timer_active = 4;
2593 timer_expires = icsk->icsk_timeout;
2594 } else if (timer_pending(&sk->sk_timer)) {
2595 timer_active = 2;
2596 timer_expires = sk->sk_timer.expires;
2597 } else {
2598 timer_active = 0;
2599 timer_expires = jiffies;
2600 }
2601
2602 state = inet_sk_state_load(sk);
2603 if (state == TCP_LISTEN)
2604 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2605 else
2606 /* Because we don't lock the socket,
2607 * we might find a transient negative value.
2608 */
2609 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2610 READ_ONCE(tp->copied_seq), 0);
2611
2612 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2613 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2614 i, src, srcp, dest, destp, state,
2615 READ_ONCE(tp->write_seq) - tp->snd_una,
2616 rx_queue,
2617 timer_active,
2618 jiffies_delta_to_clock_t(timer_expires - jiffies),
2619 icsk->icsk_retransmits,
2620 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2621 icsk->icsk_probes_out,
2622 sock_i_ino(sk),
2623 refcount_read(&sk->sk_refcnt), sk,
2624 jiffies_to_clock_t(icsk->icsk_rto),
2625 jiffies_to_clock_t(icsk->icsk_ack.ato),
2626 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2627 tcp_snd_cwnd(tp),
2628 state == TCP_LISTEN ?
2629 fastopenq->max_qlen :
2630 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2631}
2632
2633static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2634 struct seq_file *f, int i)
2635{
2636 long delta = tw->tw_timer.expires - jiffies;
2637 __be32 dest, src;
2638 __u16 destp, srcp;
2639
2640 dest = tw->tw_daddr;
2641 src = tw->tw_rcv_saddr;
2642 destp = ntohs(tw->tw_dport);
2643 srcp = ntohs(tw->tw_sport);
2644
2645 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2646 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2647 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2648 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2649 refcount_read(&tw->tw_refcnt), tw);
2650}
2651
2652#define TMPSZ 150
2653
2654static int tcp4_seq_show(struct seq_file *seq, void *v)
2655{
2656 struct tcp_iter_state *st;
2657 struct sock *sk = v;
2658
2659 seq_setwidth(seq, TMPSZ - 1);
2660 if (v == SEQ_START_TOKEN) {
2661 seq_puts(seq, " sl local_address rem_address st tx_queue "
2662 "rx_queue tr tm->when retrnsmt uid timeout "
2663 "inode");
2664 goto out;
2665 }
2666 st = seq->private;
2667
2668 if (sk->sk_state == TCP_TIME_WAIT)
2669 get_timewait4_sock(v, seq, st->num);
2670 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2671 get_openreq4(v, seq, st->num);
2672 else
2673 get_tcp4_sock(v, seq, st->num);
2674out:
2675 seq_pad(seq, '\n');
2676 return 0;
2677}
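
/* [Editor's note -- illustrative only, not part of the kernel source]
 * tcp4_seq_show() above produces the familiar /proc/net/tcp format.  A
 * hedged userspace sketch of parsing a few fields of each line (the
 * sscanf format is an assumption matching the seq_printf() calls above;
 * note that addresses are hex dumps of the __be32 values, so they look
 * byte-swapped on little-endian hosts):
 *
 *	#include <stdio.h>
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");
 *	char line[256];
 *	unsigned int laddr, lport, raddr, rport, state;
 *
 *	fgets(line, sizeof(line), f);	// skip the header line
 *	while (fgets(line, sizeof(line), f))
 *		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
 *			   &laddr, &lport, &raddr, &rport, &state) == 5)
 *			printf("st %02X local port %u\n", state, lport);
 *	fclose(f);
 */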
2678
2679#ifdef CONFIG_BPF_SYSCALL
2680struct bpf_tcp_iter_state {
2681 struct tcp_iter_state state;
2682 unsigned int cur_sk;
2683 unsigned int end_sk;
2684 unsigned int max_sk;
2685 struct sock **batch;
2686 bool st_bucket_done;
2687};
2688
2689struct bpf_iter__tcp {
2690 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2691 __bpf_md_ptr(struct sock_common *, sk_common);
2692 uid_t uid __aligned(8);
2693};
2694
2695static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2696 struct sock_common *sk_common, uid_t uid)
2697{
2698 struct bpf_iter__tcp ctx;
2699
2700 meta->seq_num--; /* skip SEQ_START_TOKEN */
2701 ctx.meta = meta;
2702 ctx.sk_common = sk_common;
2703 ctx.uid = uid;
2704 return bpf_iter_run_prog(prog, &ctx);
2705}
2706
2707static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2708{
2709 while (iter->cur_sk < iter->end_sk)
2710 sock_put(iter->batch[iter->cur_sk++]);
2711}
2712
2713static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2714 unsigned int new_batch_sz)
2715{
2716 struct sock **new_batch;
2717
2718 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2719 GFP_USER | __GFP_NOWARN);
2720 if (!new_batch)
2721 return -ENOMEM;
2722
2723 bpf_iter_tcp_put_batch(iter);
2724 kvfree(iter->batch);
2725 iter->batch = new_batch;
2726 iter->max_sk = new_batch_sz;
2727
2728 return 0;
2729}
2730
2731static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2732 struct sock *start_sk)
2733{
2734 struct bpf_tcp_iter_state *iter = seq->private;
2735 struct tcp_iter_state *st = &iter->state;
2736 struct hlist_nulls_node *node;
2737 unsigned int expected = 1;
2738 struct sock *sk;
2739
2740 sock_hold(start_sk);
2741 iter->batch[iter->end_sk++] = start_sk;
2742
2743 sk = sk_nulls_next(start_sk);
2744 sk_nulls_for_each_from(sk, node) {
2745 if (seq_sk_match(seq, sk)) {
2746 if (iter->end_sk < iter->max_sk) {
2747 sock_hold(sk);
2748 iter->batch[iter->end_sk++] = sk;
2749 }
2750 expected++;
2751 }
2752 }
2753 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2754
2755 return expected;
2756}
2757
2758static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2759 struct sock *start_sk)
2760{
2761 struct bpf_tcp_iter_state *iter = seq->private;
2762 struct tcp_iter_state *st = &iter->state;
2763 struct hlist_nulls_node *node;
2764 unsigned int expected = 1;
2765 struct sock *sk;
2766
2767 sock_hold(start_sk);
2768 iter->batch[iter->end_sk++] = start_sk;
2769
2770 sk = sk_nulls_next(start_sk);
2771 sk_nulls_for_each_from(sk, node) {
2772 if (seq_sk_match(seq, sk)) {
2773 if (iter->end_sk < iter->max_sk) {
2774 sock_hold(sk);
2775 iter->batch[iter->end_sk++] = sk;
2776 }
2777 expected++;
2778 }
2779 }
2780 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2781
2782 return expected;
2783}
2784
2785static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2786{
2787 struct bpf_tcp_iter_state *iter = seq->private;
2788 struct tcp_iter_state *st = &iter->state;
2789 unsigned int expected;
2790 bool resized = false;
2791 struct sock *sk;
2792
2793 /* The st->bucket is done. Directly advance to the next
2794 * bucket instead of having tcp_seek_last_pos() skip entries
2795 * one by one in the current bucket, only to find out
2796 * it has to advance to the next bucket.
2797 */
2798 if (iter->st_bucket_done) {
2799 st->offset = 0;
2800 st->bucket++;
2801 if (st->state == TCP_SEQ_STATE_LISTENING &&
2802 st->bucket > tcp_hashinfo.lhash2_mask) {
2803 st->state = TCP_SEQ_STATE_ESTABLISHED;
2804 st->bucket = 0;
2805 }
2806 }
2807
2808again:
2809 /* Get a new batch */
2810 iter->cur_sk = 0;
2811 iter->end_sk = 0;
2812 iter->st_bucket_done = false;
2813
2814 sk = tcp_seek_last_pos(seq);
2815 if (!sk)
2816 return NULL; /* Done */
2817
2818 if (st->state == TCP_SEQ_STATE_LISTENING)
2819 expected = bpf_iter_tcp_listening_batch(seq, sk);
2820 else
2821 expected = bpf_iter_tcp_established_batch(seq, sk);
2822
2823 if (iter->end_sk == expected) {
2824 iter->st_bucket_done = true;
2825 return sk;
2826 }
2827
2828 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2829 resized = true;
2830 goto again;
2831 }
2832
2833 return sk;
2834}
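
/* [Editor's note -- descriptive summary, not part of the kernel source]
 * Worked example of the batching above, with made-up numbers: starting
 * from INIT_BATCH_SZ (16), a bucket holding 40 matching sockets fills
 * the batch with 16 entries while the walk counts expected = 40.  Since
 * end_sk != expected, the batch is reallocated to 40 * 3 / 2 = 60
 * entries and the bucket is walked again; the second pass captures the
 * whole bucket and st_bucket_done is set.
 */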
2835
2836static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2837{
2838 /* bpf iter does not support lseek, so it always
2839 * continues from where it was stop()-ped.
2840 */
2841 if (*pos)
2842 return bpf_iter_tcp_batch(seq);
2843
2844 return SEQ_START_TOKEN;
2845}
2846
2847static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2848{
2849 struct bpf_tcp_iter_state *iter = seq->private;
2850 struct tcp_iter_state *st = &iter->state;
2851 struct sock *sk;
2852
2853 /* Whenever seq_next() is called, iter->cur_sk is
2854 * done with seq_show(), so advance to the next sk in
2855 * the batch.
2856 */
2857 if (iter->cur_sk < iter->end_sk) {
2858 /* Keeping st->num consistent in tcp_iter_state.
2859 * bpf_iter_tcp does not use st->num.
2860 * meta.seq_num is used instead.
2861 */
2862 st->num++;
2863 /* Move st->offset to the next sk in the bucket such that
2864 * the future start() will resume at st->offset in
2865 * st->bucket. See tcp_seek_last_pos().
2866 */
2867 st->offset++;
2868 sock_put(iter->batch[iter->cur_sk++]);
2869 }
2870
2871 if (iter->cur_sk < iter->end_sk)
2872 sk = iter->batch[iter->cur_sk];
2873 else
2874 sk = bpf_iter_tcp_batch(seq);
2875
2876 ++*pos;
2877 /* Keeping st->last_pos consistent in tcp_iter_state.
2878 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2879 */
2880 st->last_pos = *pos;
2881 return sk;
2882}
2883
2884static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2885{
2886 struct bpf_iter_meta meta;
2887 struct bpf_prog *prog;
2888 struct sock *sk = v;
2889 bool slow;
2890 uid_t uid;
2891 int ret;
2892
2893 if (v == SEQ_START_TOKEN)
2894 return 0;
2895
2896 if (sk_fullsock(sk))
2897 slow = lock_sock_fast(sk);
2898
2899 if (unlikely(sk_unhashed(sk))) {
2900 ret = SEQ_SKIP;
2901 goto unlock;
2902 }
2903
2904 if (sk->sk_state == TCP_TIME_WAIT) {
2905 uid = 0;
2906 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2907 const struct request_sock *req = v;
2908
2909 uid = from_kuid_munged(seq_user_ns(seq),
2910 sock_i_uid(req->rsk_listener));
2911 } else {
2912 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2913 }
2914
2915 meta.seq = seq;
2916 prog = bpf_iter_get_info(&meta, false);
2917 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2918
2919unlock:
2920 if (sk_fullsock(sk))
2921 unlock_sock_fast(sk, slow);
2922 return ret;
2923
2924}
2925
2926static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2927{
2928 struct bpf_tcp_iter_state *iter = seq->private;
2929 struct bpf_iter_meta meta;
2930 struct bpf_prog *prog;
2931
2932 if (!v) {
2933 meta.seq = seq;
2934 prog = bpf_iter_get_info(&meta, true);
2935 if (prog)
2936 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2937 }
2938
2939 if (iter->cur_sk < iter->end_sk) {
2940 bpf_iter_tcp_put_batch(iter);
2941 iter->st_bucket_done = false;
2942 }
2943}
2944
2945static const struct seq_operations bpf_iter_tcp_seq_ops = {
2946 .show = bpf_iter_tcp_seq_show,
2947 .start = bpf_iter_tcp_seq_start,
2948 .next = bpf_iter_tcp_seq_next,
2949 .stop = bpf_iter_tcp_seq_stop,
2950};
2951#endif
2952static unsigned short seq_file_family(const struct seq_file *seq)
2953{
2954 const struct tcp_seq_afinfo *afinfo;
2955
2956#ifdef CONFIG_BPF_SYSCALL
2957 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2958 if (seq->op == &bpf_iter_tcp_seq_ops)
2959 return AF_UNSPEC;
2960#endif
2961
2962 /* Iterated from proc fs */
2963 afinfo = pde_data(file_inode(seq->file));
2964 return afinfo->family;
2965}
2966
2967static const struct seq_operations tcp4_seq_ops = {
2968 .show = tcp4_seq_show,
2969 .start = tcp_seq_start,
2970 .next = tcp_seq_next,
2971 .stop = tcp_seq_stop,
2972};
2973
2974static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2975 .family = AF_INET,
2976};
2977
2978static int __net_init tcp4_proc_init_net(struct net *net)
2979{
2980 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2981 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2982 return -ENOMEM;
2983 return 0;
2984}
2985
2986static void __net_exit tcp4_proc_exit_net(struct net *net)
2987{
2988 remove_proc_entry("tcp", net->proc_net);
2989}
2990
2991static struct pernet_operations tcp4_net_ops = {
2992 .init = tcp4_proc_init_net,
2993 .exit = tcp4_proc_exit_net,
2994};
2995
2996int __init tcp4_proc_init(void)
2997{
2998 return register_pernet_subsys(&tcp4_net_ops);
2999}
3000
3001void tcp4_proc_exit(void)
3002{
3003 unregister_pernet_subsys(&tcp4_net_ops);
3004}
3005#endif /* CONFIG_PROC_FS */
3006
3007/* @wake is 1 when sk_stream_write_space() calls us.
3008 * In that case, EPOLLOUT is sent only if notsent_bytes is below half the limit.
3009 * This mimics the strategy used in sock_def_write_space().
3010 */
3011bool tcp_stream_memory_free(const struct sock *sk, int wake)
3012{
3013 const struct tcp_sock *tp = tcp_sk(sk);
3014 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3015 READ_ONCE(tp->snd_nxt);
3016
3017 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3018}
3019EXPORT_SYMBOL(tcp_stream_memory_free);
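
/* [Editor's note -- illustrative only, not part of the kernel source]
 * With wake == 1 the test above becomes notsent_bytes * 2 < lowat, i.e.
 * the write-space wakeup fires only once unsent data drops below half of
 * the limit, while plain polling (wake == 0) reports the socket writable
 * whenever notsent_bytes < lowat.  A hedged userspace sketch, assuming a
 * libc that exposes TCP_NOTSENT_LOWAT ('fd' is a placeholder socket):
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *
 *	unsigned int lowat = 128 * 1024;	// example: 128 KB
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *	// EPOLLOUT wakeups now fire once less than 64 KB remain unsent
 */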
3020
3021struct proto tcp_prot = {
3022 .name = "TCP",
3023 .owner = THIS_MODULE,
3024 .close = tcp_close,
3025 .pre_connect = tcp_v4_pre_connect,
3026 .connect = tcp_v4_connect,
3027 .disconnect = tcp_disconnect,
3028 .accept = inet_csk_accept,
3029 .ioctl = tcp_ioctl,
3030 .init = tcp_v4_init_sock,
3031 .destroy = tcp_v4_destroy_sock,
3032 .shutdown = tcp_shutdown,
3033 .setsockopt = tcp_setsockopt,
3034 .getsockopt = tcp_getsockopt,
3035 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3036 .keepalive = tcp_set_keepalive,
3037 .recvmsg = tcp_recvmsg,
3038 .sendmsg = tcp_sendmsg,
3039 .sendpage = tcp_sendpage,
3040 .backlog_rcv = tcp_v4_do_rcv,
3041 .release_cb = tcp_release_cb,
3042 .hash = inet_hash,
3043 .unhash = inet_unhash,
3044 .get_port = inet_csk_get_port,
3045 .put_port = inet_put_port,
3046#ifdef CONFIG_BPF_SYSCALL
3047 .psock_update_sk_prot = tcp_bpf_update_proto,
3048#endif
3049 .enter_memory_pressure = tcp_enter_memory_pressure,
3050 .leave_memory_pressure = tcp_leave_memory_pressure,
3051 .stream_memory_free = tcp_stream_memory_free,
3052 .sockets_allocated = &tcp_sockets_allocated,
3053 .orphan_count = &tcp_orphan_count,
3054
3055 .memory_allocated = &tcp_memory_allocated,
3056 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3057
3058 .memory_pressure = &tcp_memory_pressure,
3059 .sysctl_mem = sysctl_tcp_mem,
3060 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3061 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3062 .max_header = MAX_TCP_HEADER,
3063 .obj_size = sizeof(struct tcp_sock),
3064 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3065 .twsk_prot = &tcp_timewait_sock_ops,
3066 .rsk_prot = &tcp_request_sock_ops,
3067 .h.hashinfo = &tcp_hashinfo,
3068 .no_autobind = true,
3069 .diag_destroy = tcp_abort,
3070};
3071EXPORT_SYMBOL(tcp_prot);
3072
3073static void __net_exit tcp_sk_exit(struct net *net)
3074{
3075 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3076
3077 if (net->ipv4.tcp_congestion_control)
3078 bpf_module_put(net->ipv4.tcp_congestion_control,
3079 net->ipv4.tcp_congestion_control->owner);
3080 if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3081 kfree(tcp_death_row);
3082}
3083
3084static int __net_init tcp_sk_init(struct net *net)
3085{
3086 int cnt;
3087
3088 net->ipv4.sysctl_tcp_ecn = 2;
3089 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3090
3091 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3092 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3093 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3094 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3095 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3096
3097 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3098 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3099 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3100
3101 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3102 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3103 net->ipv4.sysctl_tcp_syncookies = 1;
3104 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3105 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3106 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3107 net->ipv4.sysctl_tcp_orphan_retries = 0;
3108 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3109 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3110 net->ipv4.sysctl_tcp_tw_reuse = 2;
3111 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3112
3113 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3114 if (!net->ipv4.tcp_death_row)
3115 return -ENOMEM;
3116 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3117 cnt = tcp_hashinfo.ehash_mask + 1;
3118 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3119 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3120
3121 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3122 net->ipv4.sysctl_tcp_sack = 1;
3123 net->ipv4.sysctl_tcp_window_scaling = 1;
3124 net->ipv4.sysctl_tcp_timestamps = 1;
3125 net->ipv4.sysctl_tcp_early_retrans = 3;
3126 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3127 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3128 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3129 net->ipv4.sysctl_tcp_max_reordering = 300;
3130 net->ipv4.sysctl_tcp_dsack = 1;
3131 net->ipv4.sysctl_tcp_app_win = 31;
3132 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3133 net->ipv4.sysctl_tcp_frto = 2;
3134 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3135 /* This limits the percentage of the congestion window which we
3136 * will allow a single TSO frame to consume. Building TSO frames
3137 * which are too large can cause TCP streams to be bursty.
3138 */
3139 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3140 /* Default TSQ limit of 16 TSO segments */
3141 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3142 /* rfc5961 challenge ack rate limiting */
3143 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3144 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3145 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3146 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3147 net->ipv4.sysctl_tcp_autocorking = 1;
3148 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3149 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3150 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3151 if (net != &init_net) {
3152 memcpy(net->ipv4.sysctl_tcp_rmem,
3153 init_net.ipv4.sysctl_tcp_rmem,
3154 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3155 memcpy(net->ipv4.sysctl_tcp_wmem,
3156 init_net.ipv4.sysctl_tcp_wmem,
3157 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3158 }
3159 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3160 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3161 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3162 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3163 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3164 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3165
3166 /* Reno is always built-in */
3167 if (!net_eq(net, &init_net) &&
3168 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3169 init_net.ipv4.tcp_congestion_control->owner))
3170 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3171 else
3172 net->ipv4.tcp_congestion_control = &tcp_reno;
3173
3174 return 0;
3175}
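
/* [Editor's note -- illustrative only, not part of the kernel source]
 * The defaults initialized above are per network namespace and are
 * exposed under /proc/sys/net/ipv4/.  A hedged userspace sketch of
 * reading one of them back:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	char buf[32];
 *	int fd = open("/proc/sys/net/ipv4/tcp_syncookies", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf) - 1);
 *
 *	if (n > 0) {
 *		buf[n] = '\0';
 *		printf("tcp_syncookies = %s", buf);	// "1\n" by default
 *	}
 *	close(fd);
 */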
3176
3177static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3178{
3179 struct net *net;
3180
3181 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3182
3183 list_for_each_entry(net, net_exit_list, exit_list)
3184 tcp_fastopen_ctx_destroy(net);
3185}
3186
3187static struct pernet_operations __net_initdata tcp_sk_ops = {
3188 .init = tcp_sk_init,
3189 .exit = tcp_sk_exit,
3190 .exit_batch = tcp_sk_exit_batch,
3191};
3192
3193#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3194DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3195 struct sock_common *sk_common, uid_t uid)
3196
3197#define INIT_BATCH_SZ 16
3198
3199static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3200{
3201 struct bpf_tcp_iter_state *iter = priv_data;
3202 int err;
3203
3204 err = bpf_iter_init_seq_net(priv_data, aux);
3205 if (err)
3206 return err;
3207
3208 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3209 if (err) {
3210 bpf_iter_fini_seq_net(priv_data);
3211 return err;
3212 }
3213
3214 return 0;
3215}
3216
3217static void bpf_iter_fini_tcp(void *priv_data)
3218{
3219 struct bpf_tcp_iter_state *iter = priv_data;
3220
3221 bpf_iter_fini_seq_net(priv_data);
3222 kvfree(iter->batch);
3223}
3224
3225static const struct bpf_iter_seq_info tcp_seq_info = {
3226 .seq_ops = &bpf_iter_tcp_seq_ops,
3227 .init_seq_private = bpf_iter_init_tcp,
3228 .fini_seq_private = bpf_iter_fini_tcp,
3229 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3230};
3231
3232static const struct bpf_func_proto *
3233bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3234 const struct bpf_prog *prog)
3235{
3236 switch (func_id) {
3237 case BPF_FUNC_setsockopt:
3238 return &bpf_sk_setsockopt_proto;
3239 case BPF_FUNC_getsockopt:
3240 return &bpf_sk_getsockopt_proto;
3241 default:
3242 return NULL;
3243 }
3244}
3245
3246static struct bpf_iter_reg tcp_reg_info = {
3247 .target = "tcp",
3248 .ctx_arg_info_size = 1,
3249 .ctx_arg_info = {
3250 { offsetof(struct bpf_iter__tcp, sk_common),
3251 PTR_TO_BTF_ID_OR_NULL },
3252 },
3253 .get_func_proto = bpf_iter_tcp_get_func_proto,
3254 .seq_info = &tcp_seq_info,
3255};
3256
3257static void __init bpf_iter_register(void)
3258{
3259 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3260 if (bpf_iter_reg_target(&tcp_reg_info))
3261 pr_warn("Warning: could not register bpf iterator tcp\n");
3262}
3263
3264#endif
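
/* [Editor's note -- illustrative only, not part of the kernel source]
 * The registration above exposes "tcp" as a bpf_iter target.  A hedged
 * sketch of a minimal iterator program, assuming the libbpf conventions
 * used by the kernel selftests (vmlinux.h types, SEC("iter/tcp") and the
 * BPF_SEQ_PRINTF helper from <bpf/bpf_tracing.h> are assumptions):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%d state=%d\n",
 *			       skc->skc_family, skc->skc_state);
 *		return 0;
 *	}
 *
 * Once compiled, such a program can be pinned with
 * "bpftool iter pin prog.o /sys/fs/bpf/tcp_dump" and read with cat.
 */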
3265
3266void __init tcp_v4_init(void)
3267{
3268 int cpu, res;
3269
3270 for_each_possible_cpu(cpu) {
3271 struct sock *sk;
3272
3273 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3274 IPPROTO_TCP, &init_net);
3275 if (res)
3276 panic("Failed to create the TCP control socket.\n");
3277 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3278
3279 /* Please enforce IP_DF and IPID==0 for RST and
3280 * ACK sent in SYN-RECV and TIME-WAIT state.
3281 */
3282 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3283
3284 per_cpu(ipv4_tcp_sk, cpu) = sk;
3285 }
3286 if (register_pernet_subsys(&tcp_sk_ops))
3287 panic("Failed to create the TCP control socket.\n");
3288
3289#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3290 bpf_iter_register();
3291#endif
3292}
3293

Source: linux/net/ipv4/tcp_ipv4.c