tcp.c source code [linux/net/ipv4/tcp.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* INET An implementation of the TCP/IP protocol suite for the LINUX
4	* operating system. INET is implemented using the BSD Socket
5	* interface as the means of communication with the user level.
6	*
7	* Implementation of the Transmission Control Protocol(TCP).
8	*
9	* Authors: Ross Biro
10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
13	* Florian La Roche, <flla@stud.uni-sb.de>
14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
17	* Matthew Dillon, <dillon@apollo.west.oic.com>
18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
19	* Jorge Cwik, <jorge@laser.satlink.net>
20	*
21	* Fixes:
22	* Alan Cox : Numerous verify_area() calls
23	* Alan Cox : Set the ACK bit on a reset
24	* Alan Cox : Stopped it crashing if it closed while
25	* sk->inuse=1 and was trying to connect
26	* (tcp_err()).
27	* Alan Cox : All icmp error handling was broken
28	* pointers passed were wrong and the
29	* socket was looked up backwards. Nobody
30	* tested any icmp error code obviously.
31	* Alan Cox : tcp_err() now handled properly. It
32	* wakes people on errors. poll
33	* behaves and the icmp error race
34	* has gone by moving it into sock.c
35	* Alan Cox : tcp_send_reset() fixed to work for
36	* everything not just packets for
37	* unknown sockets.
38	* Alan Cox : tcp option processing.
39	* Alan Cox : Reset tweaked (still not 100%) [Had
40	* syn rule wrong]
41	* Herp Rosmanith : More reset fixes
42	* Alan Cox : No longer acks invalid rst frames.
43	* Acking any kind of RST is right out.
44	* Alan Cox : Sets an ignore me flag on an rst
45	* receive otherwise odd bits of prattle
46	* escape still
47	* Alan Cox : Fixed another acking RST frame bug.
48	* Should stop LAN workplace lockups.
49	* Alan Cox : Some tidyups using the new skb list
50	* facilities
51	* Alan Cox : sk->keepopen now seems to work
52	* Alan Cox : Pulls options out correctly on accepts
53	* Alan Cox : Fixed assorted sk->rqueue->next errors
54	* Alan Cox : PSH doesn't end a TCP read. Switched a
55	* bit to skb ops.
56	* Alan Cox : Tidied tcp_data to avoid a potential
57	* nasty.
58	* Alan Cox : Added some better commenting, as the
59	* tcp is hard to follow
60	* Alan Cox : Removed incorrect check for 20 * psh
61	* Michael O'Reilly : ack < copied bug fix.
62	* Johannes Stille : Misc tcp fixes (not all in yet).
63	* Alan Cox : FIN with no memory -> CRASH
64	* Alan Cox : Added socket option proto entries.
65	* Also added awareness of them to accept.
66	* Alan Cox : Added TCP options (SOL_TCP)
67	* Alan Cox : Switched wakeup calls to callbacks,
68	* so the kernel can layer network
69	* sockets.
70	* Alan Cox : Use ip_tos/ip_ttl settings.
71	* Alan Cox : Handle FIN (more) properly (we hope).
72	* Alan Cox : RST frames sent on unsynchronised
73	* state ack error.
74	* Alan Cox : Put in missing check for SYN bit.
75	* Alan Cox : Added tcp_select_window() aka NET2E
76	* window non shrink trick.
77	* Alan Cox : Added a couple of small NET2E timer
78	* fixes
79	* Charles Hedrick : TCP fixes
80	* Toomas Tamm : TCP window fixes
81	* Alan Cox : Small URG fix to rlogin ^C ack fight
82	* Charles Hedrick : Rewrote most of it to actually work
83	* Linus : Rewrote tcp_read() and URG handling
84	* completely
85	* Gerhard Koerting: Fixed some missing timer handling
86	* Matthew Dillon : Reworked TCP machine states as per RFC
87	* Gerhard Koerting: PC/TCP workarounds
88	* Adam Caldwell : Assorted timer/timing errors
89	* Matthew Dillon : Fixed another RST bug
90	* Alan Cox : Move to kernel side addressing changes.
91	* Alan Cox : Beginning work on TCP fastpathing
92	* (not yet usable)
93	* Arnt Gulbrandsen: Turbocharged tcp_check() routine.
94	* Alan Cox : TCP fast path debugging
95	* Alan Cox : Window clamping
96	* Michael Riepe : Bug in tcp_check()
97	* Matt Dillon : More TCP improvements and RST bug fixes
98	* Matt Dillon : Yet more small nasties remove from the
99	* TCP code (Be very nice to this man if
100	* tcp finally works 100%) 8)
101	* Alan Cox : BSD accept semantics.
102	* Alan Cox : Reset on closedown bug.
103	* Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
104	* Michael Pall : Handle poll() after URG properly in
105	* all cases.
106	* Michael Pall : Undo the last fix in tcp_read_urg()
107	* (multi URG PUSH broke rlogin).
108	* Michael Pall : Fix the multi URG PUSH problem in
109	* tcp_readable(), poll() after URG
110	* works now.
111	* Michael Pall : recv(...,MSG_OOB) never blocks in the
112	* BSD api.
113	* Alan Cox : Changed the semantics of sk->socket to
114	* fix a race and a signal problem with
115	* accept() and async I/O.
116	* Alan Cox : Relaxed the rules on tcp_sendto().
117	* Yury Shevchuk : Really fixed accept() blocking problem.
118	* Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
119	* clients/servers which listen in on
120	* fixed ports.
121	* Alan Cox : Cleaned the above up and shrank it to
122	* a sensible code size.
123	* Alan Cox : Self connect lockup fix.
124	* Alan Cox : No connect to multicast.
125	* Ross Biro : Close unaccepted children on master
126	* socket close.
127	* Alan Cox : Reset tracing code.
128	* Alan Cox : Spurious resets on shutdown.
129	* Alan Cox : Giant 15 minute/60 second timer error
130	* Alan Cox : Small whoops in polling before an
131	* accept.
132	* Alan Cox : Kept the state trace facility since
133	* it's handy for debugging.
134	* Alan Cox : More reset handler fixes.
135	* Alan Cox : Started rewriting the code based on
136	* the RFC's for other useful protocol
137	* references see: Comer, KA9Q NOS, and
138	* for a reference on the difference
139	* between specifications and how BSD
140	* works see the 4.4lite source.
141	* A.N.Kuznetsov : Don't time wait on completion of tidy
142	* close.
143	* Linus Torvalds : Fin/Shutdown & copied_seq changes.
144	* Linus Torvalds : Fixed BSD port reuse to work first syn
145	* Alan Cox : Reimplemented timers as per the RFC
146	* and using multiple timers for sanity.
147	* Alan Cox : Small bug fixes, and a lot of new
148	* comments.
149	* Alan Cox : Fixed dual reader crash by locking
150	* the buffers (much like datagram.c)
151	* Alan Cox : Fixed stuck sockets in probe. A probe
152	* now gets fed up of retrying without
153	* (even a no space) answer.
154	* Alan Cox : Extracted closing code better
155	* Alan Cox : Fixed the closing state machine to
156	* resemble the RFC.
157	* Alan Cox : More 'per spec' fixes.
158	* Jorge Cwik : Even faster checksumming.
159	* Alan Cox : tcp_data() doesn't ack illegal PSH
160	* only frames. At least one pc tcp stack
161	* generates them.
162	* Alan Cox : Cache last socket.
163	* Alan Cox : Per route irtt.
164	* Matt Day : poll()->select() match BSD precisely on error
165	* Alan Cox : New buffers
166	* Marc Tamsky : Various sk->prot->retransmits and
167	* sk->retransmits misupdating fixed.
168	* Fixed tcp_write_timeout: stuck close,
169	* and TCP syn retries gets used now.
170	* Mark Yarvis : In tcp_read_wakeup(), don't send an
171	* ack if state is TCP_CLOSED.
172	* Alan Cox : Look up device on a retransmit - routes may
173	* change. Doesn't yet cope with MSS shrink right
174	* but it's a start!
175	* Marc Tamsky : Closing in closing fixes.
176	* Mike Shaver : RFC1122 verifications.
177	* Alan Cox : rcv_saddr errors.
178	* Alan Cox : Block double connect().
179	* Alan Cox : Small hooks for enSKIP.
180	* Alexey Kuznetsov: Path MTU discovery.
181	* Alan Cox : Support soft errors.
182	* Alan Cox : Fix MTU discovery pathological case
183	* when the remote claims no mtu!
184	* Marc Tamsky : TCP_CLOSE fix.
185	* Colin (G3TNE) : Send a reset on syn ack replies in
186	* window but wrong (fixes NT lpd problems)
187	* Pedro Roque : Better TCP window handling, delayed ack.
188	* Joerg Reuter : No modification of locked buffers in
189	* tcp_do_retransmit()
190	* Eric Schenk : Changed receiver side silly window
191	* avoidance algorithm to BSD style
192	* algorithm. This doubles throughput
193	* against machines running Solaris,
194	* and seems to result in general
195	* improvement.
196	* Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
197	* Willy Konynenberg : Transparent proxying support.
198	* Mike McLagan : Routing by source
199	* Keith Owens : Do proper merging with partial SKB's in
200	* tcp_do_sendmsg to avoid burstiness.
201	* Eric Schenk : Fix fast close down bug with
202	* shutdown() followed by close().
203	* Andi Kleen : Make poll agree with SIGIO
204	* Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
205	* lingertime == 0 (RFC 793 ABORT Call)
206	* Hirokazu Takahashi : Use copy_from_user() instead of
207	* csum_and_copy_from_user() if possible.
208	*
209	* Description of States:
210	*
211	* TCP_SYN_SENT sent a connection request, waiting for ack
212	*
213	* TCP_SYN_RECV received a connection request, sent ack,
214	* waiting for final ack in three-way handshake.
215	*
216	* TCP_ESTABLISHED connection established
217	*
218	* TCP_FIN_WAIT1 our side has shutdown, waiting to complete
219	* transmission of remaining buffered data
220	*
221	* TCP_FIN_WAIT2 all buffered data sent, waiting for remote
222	* to shutdown
223	*
224	* TCP_CLOSING both sides have shutdown but we still have
225	* data we have to finish sending
226	*
227	* TCP_TIME_WAIT timeout to catch resent junk before entering
228	* closed, can only be entered from FIN_WAIT2
229	* or CLOSING. Required because the other end
230	* may not have gotten our last ACK causing it
231	* to retransmit the data packet (which we ignore)
232	*
233	* TCP_CLOSE_WAIT remote side has shutdown and is waiting for
234	* us to finish writing our data and to shutdown
235	* (we have to close() to move on to LAST_ACK)
236	*
237	* TCP_LAST_ACK our side has shutdown after remote has
238	* shutdown. There may still be data in our
239	* buffer that we have to finish sending
240	*
241	* TCP_CLOSE socket is finished
242	*/
243
244	#define pr_fmt(fmt) "TCP: " fmt
245
246	#include <crypto/md5.h>
247	#include <linux/kernel.h>
248	#include <linux/module.h>
249	#include <linux/types.h>
250	#include <linux/fcntl.h>
251	#include <linux/poll.h>
252	#include <linux/inet_diag.h>
253	#include <linux/init.h>
254	#include <linux/fs.h>
255	#include <linux/skbuff.h>
256	#include <linux/splice.h>
257	#include <linux/net.h>
258	#include <linux/socket.h>
259	#include <linux/random.h>
260	#include <linux/memblock.h>
261	#include <linux/highmem.h>
262	#include <linux/cache.h>
263	#include <linux/err.h>
264	#include <linux/time.h>
265	#include <linux/slab.h>
266	#include <linux/errqueue.h>
267	#include <linux/static_key.h>
268	#include <linux/btf.h>
269
270	#include <net/icmp.h>
271	#include <net/inet_common.h>
272	#include <net/inet_ecn.h>
273	#include <net/tcp.h>
274	#include <net/tcp_ecn.h>
275	#include <net/mptcp.h>
276	#include <net/proto_memory.h>
277	#include <net/xfrm.h>
278	#include <net/ip.h>
279	#include <net/psp.h>
280	#include <net/sock.h>
281	#include <net/rstreason.h>
282
283	#include <linux/uaccess.h>
284	#include <asm/ioctls.h>
285	#include <net/busy_poll.h>
286	#include <net/hotdata.h>
287	#include <trace/events/tcp.h>
288	#include <net/rps.h>
289
290	#include "../core/devmem.h"
291
/* Track pending CMSGs. Values are distinct bits so they can be OR-ed. */
enum {
	TCP_CMSG_INQ = 1,
	TCP_CMSG_TS = 2
};
297
298	DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
299	EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
300
301	DEFINE_PER_CPU(u32, tcp_tw_isn);
302	EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
303
304	long sysctl_tcp_mem[`3`] __read_mostly;
305	EXPORT_IPV6_MOD(sysctl_tcp_mem);
306
307	DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
308	EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
309
310	#if IS_ENABLED(CONFIG_SMC)
311	DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
312	EXPORT_SYMBOL(tcp_have_smc);
313	#endif
314
315	/*
316	* Current number of TCP sockets.
317	*/
318	struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
319	EXPORT_IPV6_MOD(tcp_sockets_allocated);
320
321	/*
322	* TCP splice context
323	*/
324	struct tcp_splice_state {
325	struct pipe_inode_info *pipe;
326	size_t len;
327	unsigned int flags;
328	};
329
330	/*
331	* Pressure flag: try to collapse.
332	* Technical note: it is used by multiple contexts non atomically.
333	* All the __sk_mem_schedule() is of this nature: accounting
334	* is strict, actions are advisory and have some latency.
335	*/
336	unsigned long tcp_memory_pressure __read_mostly;
337	EXPORT_SYMBOL_GPL(tcp_memory_pressure);
338
339	void tcp_enter_memory_pressure(struct sock *sk)
340	{
341	unsigned long val;
342
343	if (READ_ONCE(tcp_memory_pressure))
344	return;
345	val = jiffies;
346
347	if (!val)
348	val--;
349	if (!cmpxchg(&tcp_memory_pressure, `0`, val))
350	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
351	}
352	EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure);
353
354	void tcp_leave_memory_pressure(struct sock *sk)
355	{
356	unsigned long val;
357
358	if (!READ_ONCE(tcp_memory_pressure))
359	return;
360	val = xchg(&tcp_memory_pressure, `0`);
361	if (val)
362	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
363	jiffies_to_msecs(jiffies - val));
364	}
365	EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure);
366
/* Convert seconds to retransmits based on initial and max timeout.
 *
 * @seconds: time budget; values <= 0 yield 0 retransmits
 * @timeout: initial retransmission timeout
 * @rto_max: upper bound applied to the doubled timeout
 *
 * The timeout doubles after each retransmit and is clamped to @rto_max.
 * Returns the smallest retransmit count whose cumulative timeout covers
 * @seconds, saturating at 255 because the result is a u8.
 */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
{
	u8 res = 0;

	if (seconds > 0) {
		int period = timeout;

		res = 1;
		while (seconds > period && res < 255) {
			res++;
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return res;
}
386
/* Convert retransmits to seconds based on initial and max timeout.
 *
 * @retrans: number of retransmits; 0 yields 0
 * @timeout: initial retransmission timeout
 * @rto_max: upper bound applied to the doubled timeout
 *
 * Returns the sum of the first @retrans timeouts, where each timeout is
 * double the previous one, clamped to @rto_max.
 */
static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
{
	int period = 0;

	if (retrans > 0) {
		period = timeout;
		while (--retrans) {
			timeout <<= 1;
			if (timeout > rto_max)
				timeout = rto_max;
			period += timeout;
		}
	}
	return period;
}
403
404	static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
405	{
406	u32 rate = READ_ONCE(tp->rate_delivered);
407	u32 intv = READ_ONCE(tp->rate_interval_us);
408	u64 rate64 = `0`;
409
410	if (rate && intv) {
411	rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC;
412	do_div(rate64, intv);
413	}
414	return rate64;
415	}
416
#ifdef CONFIG_TCP_MD5SIG
/* Release the MD5 signature state attached to @sk, if any: clear the key
 * list, free md5sig_info (replacing the pointer with NULL) and drop the
 * tcp_md5_needed static branch reference.
 */
void tcp_md5_destruct_sock(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->md5sig_info) {

		tcp_clear_md5_list(sk);
		kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
		static_branch_slow_dec_deferred(&tcp_md5_needed);
	}
}
EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
#endif
431
432	/ Address-family independent initialization for a tcp_sock.*
433	*
434	* NOTE: A lot of things set to zero explicitly by call to
435	* sk_alloc() so need not be done here.
436	*/
437	void tcp_init_sock(struct sock *sk)
438	{
439	struct inet_connection_sock *icsk = inet_csk(sk);
440	struct tcp_sock *tp = tcp_sk(sk);
441	int rto_min_us, rto_max_ms;
442
443	tp->out_of_order_queue = RB_ROOT;
444	sk->tcp_rtx_queue = RB_ROOT;
445	tcp_init_xmit_timers(sk);
446	INIT_LIST_HEAD(list: &tp->tsq_node);
447	INIT_LIST_HEAD(list: &tp->tsorted_sent_queue);
448
449	icsk->icsk_rto = TCP_TIMEOUT_INIT;
450
451	rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms);
452	icsk->icsk_rto_max = msecs_to_jiffies(m: rto_max_ms);
453
454	rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
455	icsk->icsk_rto_min = usecs_to_jiffies(u: rto_min_us);
456	icsk->icsk_delack_max = TCP_DELACK_MAX;
457	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
458	minmax_reset(m: &tp->rtt_min, tcp_jiffies32, meas: ~`0U`);
459
460	/ So many TCP implementations out there (incorrectly) count the*
461	* initial SYN frame in their delayed-ACK and congestion control
462	* algorithms that we must have the following bandaid to talk
463	* efficiently to them. -DaveM
464	*/
465	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
466
467	/ There's a bubble in the pipe until at least the first ACK. /
468	tp->app_limited = ~`0U`;
469	tp->rate_app_limited = `1`;
470
471	/ See draft-stevens-tcpca-spec-01 for discussion of the*
472	* initialization of these values.
473	*/
474	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
475	tp->snd_cwnd_clamp = ~`0`;
476	tp->mss_cache = TCP_MSS_DEFAULT;
477
478	tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
479	tcp_assign_congestion_control(sk);
480
481	tp->tsoffset = `0`;
482	tp->rack.reo_wnd_steps = `1`;
483
484	sk->sk_write_space = sk_stream_write_space;
485	sock_set_flag(sk, flag: SOCK_USE_WRITE_QUEUE);
486
487	icsk->icsk_sync_mss = tcp_sync_mss;
488
489	WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[`1`]));
490	WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`1`]));
491	tcp_scaling_ratio_init(sk);
492
493	set_bit(nr: SOCK_SUPPORT_ZC, addr: &sk->sk_socket->flags);
494	sk_sockets_allocated_inc(sk);
495	xa_init_flags(xa: &sk->sk_user_frags, XA_FLAGS_ALLOC1);
496	}
497	EXPORT_IPV6_MOD(tcp_init_sock);
498
499	static void tcp_tx_timestamp(struct sock sk, struct* sockcm_cookie *sockc)
500	{
501	struct sk_buff *skb = tcp_write_queue_tail(sk);
502	u32 tsflags = sockc->tsflags;
503
504	if (tsflags && skb) {
505	struct skb_shared_info *shinfo = skb_shinfo(skb);
506	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
507
508	sock_tx_timestamp(sk, sockc, tx_flags: &shinfo->tx_flags);
509	if (tsflags & SOF_TIMESTAMPING_TX_ACK)
510	tcb->txstamp_ack \|= TSTAMP_ACK_SK;
511	if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
512	shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - `1`;
513	}
514
515	if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
516	SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb)
517	bpf_skops_tx_timestamping(sk, skb, op: BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
518	}
519
520	static bool tcp_stream_is_readable(struct sock sk, int* target)
521	{
522	if (tcp_epollin_ready(sk, target))
523	return true;
524	return sk_is_readable(sk);
525	}
526
527	/*
528	* Wait for a TCP event.
529	*
530	* Note that we don't need to lock the socket, as the upper poll layers
531	* take care of normal races (between the test and the event) and we don't
532	* go look at any of the socket buffers directly.
533	*/
534	__poll_t tcp_poll(struct file file, struct* socket sock, poll_table wait)
535	{
536	__poll_t mask;
537	struct sock *sk = sock->sk;
538	const struct tcp_sock *tp = tcp_sk(sk);
539	u8 shutdown;
540	int state;
541
542	sock_poll_wait(filp: file, sock, p: wait);
543
544	state = inet_sk_state_load(sk);
545	if (state == TCP_LISTEN)
546	return inet_csk_listen_poll(sk);
547
548	/ Socket is not locked. We are protected from async events*
549	* by poll logic and correct handling of state changes
550	* made by other threads is impossible in any case.
551	*/
552
553	mask = `0`;
554
555	/*
556	* EPOLLHUP is certainly not done right. But poll() doesn't
557	* have a notion of HUP in just one direction, and for a
558	* socket the read side is more interesting.
559	*
560	* Some poll() documentation says that EPOLLHUP is incompatible
561	* with the EPOLLOUT/POLLWR flags, so somebody should check this
562	* all. But careful, it tends to be safer to return too many
563	* bits than too few, and you can easily break real applications
564	* if you don't tell them that something has hung up!
565	*
566	* Check-me.
567	*
568	* Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and
569	* our fs/select.c). It means that after we received EOF,
570	* poll always returns immediately, making impossible poll() on write()
571	* in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP
572	* if and only if shutdown has been made in both directions.
573	* Actually, it is interesting to look how Solaris and DUX
574	* solve this dilemma. I would prefer, if EPOLLHUP were maskable,
575	* then we could set it on SND_SHUTDOWN. BTW examples given
576	* in Stevens' books assume exactly this behaviour, it explains
577	* why EPOLLHUP is incompatible with EPOLLOUT. --ANK
578	*
579	* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
580	* blocking on fresh not-connected or disconnected socket. --ANK
581	*/
582	shutdown = READ_ONCE(sk->sk_shutdown);
583	if (shutdown == SHUTDOWN_MASK \|\| state == TCP_CLOSE)
584	mask \|= EPOLLHUP;
585	if (shutdown & RCV_SHUTDOWN)
586	mask \|= EPOLLIN \| EPOLLRDNORM \| EPOLLRDHUP;
587
588	/ Connected or passive Fast Open socket? /
589	if (state != TCP_SYN_SENT &&
590	(state != TCP_SYN_RECV \|\| rcu_access_pointer(tp->fastopen_rsk))) {
591	int target = sock_rcvlowat(sk, waitall: `0`, INT_MAX);
592	u16 urg_data = READ_ONCE(tp->urg_data);
593
594	if (unlikely(urg_data) &&
595	READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
596	!sock_flag(sk, flag: SOCK_URGINLINE))
597	target++;
598
599	if (tcp_stream_is_readable(sk, target))
600	mask \|= EPOLLIN \| EPOLLRDNORM;
601
602	if (!(shutdown & SEND_SHUTDOWN)) {
603	if (__sk_stream_is_writeable(sk, wake: `1`)) {
604	mask \|= EPOLLOUT \| EPOLLWRNORM;
605	} else { / send SIGIO later /
606	sk_set_bit(nr: SOCKWQ_ASYNC_NOSPACE, sk);
607	set_bit(nr: SOCK_NOSPACE, addr: &sk->sk_socket->flags);
608
609	/ Race breaker. If space is freed after*
610	* wspace test but before the flags are set,
611	* IO signal will be lost. Memory barrier
612	* pairs with the input side.
613	*/
614	smp_mb__after_atomic();
615	if (__sk_stream_is_writeable(sk, wake: `1`))
616	mask \|= EPOLLOUT \| EPOLLWRNORM;
617	}
618	} else
619	mask \|= EPOLLOUT \| EPOLLWRNORM;
620
621	if (urg_data & TCP_URG_VALID)
622	mask \|= EPOLLPRI;
623	} else if (state == TCP_SYN_SENT &&
624	inet_test_bit(DEFER_CONNECT, sk)) {
625	/ Active TCP fastopen socket with defer_connect*
626	* Return EPOLLOUT so application can call write()
627	* in order for kernel to generate SYN+data
628	*/
629	mask \|= EPOLLOUT \| EPOLLWRNORM;
630	}
631	/ This barrier is coupled with smp_wmb() in tcp_done_with_error() /
632	smp_rmb();
633	if (READ_ONCE(sk->sk_err) \|\|
634	!skb_queue_empty_lockless(list: &sk->sk_error_queue))
635	mask \|= EPOLLERR;
636
637	return mask;
638	}
639	EXPORT_SYMBOL(tcp_poll);
640
641	int tcp_ioctl(struct sock sk, int* cmd, int *karg)
642	{
643	struct tcp_sock *tp = tcp_sk(sk);
644	int answ;
645	bool slow;
646
647	switch (cmd) {
648	case SIOCINQ:
649	if (sk->sk_state == TCP_LISTEN)
650	return -EINVAL;
651
652	slow = lock_sock_fast(sk);
653	answ = tcp_inq(sk);
654	unlock_sock_fast(sk, slow);
655	break;
656	case SIOCATMARK:
657	answ = READ_ONCE(tp->urg_data) &&
658	READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
659	break;
660	case SIOCOUTQ:
661	if (sk->sk_state == TCP_LISTEN)
662	return -EINVAL;
663
664	if ((`1` << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
665	answ = `0`;
666	else
667	answ = READ_ONCE(tp->write_seq) - tp->snd_una;
668	break;
669	case SIOCOUTQNSD:
670	if (sk->sk_state == TCP_LISTEN)
671	return -EINVAL;
672
673	if ((`1` << sk->sk_state) & (TCPF_SYN_SENT \| TCPF_SYN_RECV))
674	answ = `0`;
675	else
676	answ = READ_ONCE(tp->write_seq) -
677	READ_ONCE(tp->snd_nxt);
678	break;
679	default:
680	return -ENOIOCTLCMD;
681	}
682
683	*karg = answ;
684	return `0`;
685	}
686	EXPORT_IPV6_MOD(tcp_ioctl);
687
688	void tcp_mark_push(struct tcp_sock tp, struct* sk_buff *skb)
689	{
690	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
691	tp->pushed_seq = tp->write_seq;
692	}
693
694	static inline bool forced_push(const struct tcp_sock *tp)
695	{
696	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> `1`));
697	}
698
699	void tcp_skb_entail(struct sock sk, struct* sk_buff *skb)
700	{
701	struct tcp_sock *tp = tcp_sk(sk);
702	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
703
704	tcb->seq = tcb->end_seq = tp->write_seq;
705	tcb->tcp_flags = TCPHDR_ACK;
706	__skb_header_release(skb);
707	psp_enqueue_set_decrypted(sk, skb);
708	tcp_add_write_queue_tail(sk, skb);
709	sk_wmem_queued_add(sk, val: skb->truesize);
710	sk_mem_charge(sk, size: skb->truesize);
711	if (tp->nonagle & TCP_NAGLE_PUSH)
712	tp->nonagle &= ~TCP_NAGLE_PUSH;
713
714	tcp_slow_start_after_idle_check(sk);
715	}
716
717	static inline void tcp_mark_urg(struct tcp_sock tp, int* flags)
718	{
719	if (flags & MSG_OOB)
720	tp->snd_up = tp->write_seq;
721	}
722
723	/ If a not yet filled skb is pushed, do not send it if*
724	* we have data packets in Qdisc or NIC queues :
725	* Because TX completion will happen shortly, it gives a chance
726	* to coalesce future sendmsg() payload into this skb, without
727	* need for a timer, and with no latency trade off.
728	* As packets containing data payload have a bigger truesize
729	* than pure acks (dataless) packets, the last checks prevent
730	* autocorking if we only have an ACK in Qdisc/NIC queues,
731	* or if TX completion was delayed after we processed ACK packet.
732	*/
733	static bool tcp_should_autocork(struct sock sk, struct* sk_buff *skb,
734	int size_goal)
735	{
736	return skb->len < size_goal &&
737	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
738	!tcp_rtx_queue_empty(sk) &&
739	refcount_read(r: &sk->sk_wmem_alloc) > skb->truesize &&
740	tcp_skb_can_collapse_to(skb);
741	}
742
743	void tcp_push(struct sock sk, int* flags, int mss_now,
744	int nonagle, int size_goal)
745	{
746	struct tcp_sock *tp = tcp_sk(sk);
747	struct sk_buff *skb;
748
749	skb = tcp_write_queue_tail(sk);
750	if (!skb)
751	return;
752	if (!(flags & MSG_MORE) \|\| forced_push(tp))
753	tcp_mark_push(tp, skb);
754
755	tcp_mark_urg(tp, flags);
756
757	if (tcp_should_autocork(sk, skb, size_goal)) {
758
759	/ avoid atomic op if TSQ_THROTTLED bit is already set /
760	if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
761	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
762	set_bit(nr: TSQ_THROTTLED, addr: &sk->sk_tsq_flags);
763	smp_mb__after_atomic();
764	}
765	/ It is possible TX completion already happened*
766	* before we set TSQ_THROTTLED.
767	*/
768	if (refcount_read(r: &sk->sk_wmem_alloc) > skb->truesize)
769	return;
770	}
771
772	if (flags & MSG_MORE)
773	nonagle = TCP_NAGLE_CORK;
774
775	__tcp_push_pending_frames(sk, cur_mss: mss_now, nonagle);
776	}
777
778	static int tcp_splice_data_recv(read_descriptor_t rd_desc, struct* sk_buff *skb,
779	unsigned int offset, size_t len)
780	{
781	struct tcp_splice_state *tss = rd_desc->arg.data;
782	int ret;
783
784	ret = skb_splice_bits(skb, sk: skb->sk, offset, pipe: tss->pipe,
785	min(rd_desc->count, len), flags: tss->flags);
786	if (ret > `0`)
787	rd_desc->count -= ret;
788	return ret;
789	}
790
791	static int __tcp_splice_read(struct sock sk, struct* tcp_splice_state *tss)
792	{
793	/ Store TCP splice context information in read_descriptor_t. /
794	read_descriptor_t rd_desc = {
795	.arg.data = tss,
796	.count = tss->len,
797	};
798
799	return tcp_read_sock(sk, desc: &rd_desc, recv_actor: tcp_splice_data_recv);
800	}
801
802	/**
803	* tcp_splice_read - splice data from TCP socket to a pipe
804	* @sock: socket to splice from
805	* @ppos: position (not valid)
806	* @pipe: pipe to splice to
807	* @len: number of bytes to splice
808	* @flags: splice modifier flags
809	*
810	* Description:
811	* Will read pages from given socket and fill them into a pipe.
812	*
813	**/
814	ssize_t tcp_splice_read(struct socket sock, loff_t ppos,
815	struct pipe_inode_info *pipe, size_t len,
816	unsigned int flags)
817	{
818	struct sock *sk = sock->sk;
819	struct tcp_splice_state tss = {
820	.pipe = pipe,
821	.len = len,
822	.flags = flags,
823	};
824	long timeo;
825	ssize_t spliced;
826	int ret;
827
828	sock_rps_record_flow(sk);
829	/*
830	* We can't seek on a socket input
831	*/
832	if (unlikely(*ppos))
833	return -ESPIPE;
834
835	ret = spliced = `0`;
836
837	lock_sock(sk);
838
839	timeo = sock_rcvtimeo(sk, noblock: sock->file->f_flags & O_NONBLOCK);
840	while (tss.len) {
841	ret = __tcp_splice_read(sk, tss: &tss);
842	if (ret < `0`)
843	break;
844	else if (!ret) {
845	if (spliced)
846	break;
847	if (sock_flag(sk, flag: SOCK_DONE))
848	break;
849	if (sk->sk_err) {
850	ret = sock_error(sk);
851	break;
852	}
853	if (sk->sk_shutdown & RCV_SHUTDOWN)
854	break;
855	if (sk->sk_state == TCP_CLOSE) {
856	/*
857	* This occurs when user tries to read
858	* from never connected socket.
859	*/
860	ret = -ENOTCONN;
861	break;
862	}
863	if (!timeo) {
864	ret = -EAGAIN;
865	break;
866	}
867	/ if __tcp_splice_read() got nothing while we have*
868	* an skb in receive queue, we do not want to loop.
869	* This might happen with URG data.
870	*/
871	if (!skb_queue_empty(list: &sk->sk_receive_queue))
872	break;
873	ret = sk_wait_data(sk, timeo: &timeo, NULL);
874	if (ret < `0`)
875	break;
876	if (signal_pending(current)) {
877	ret = sock_intr_errno(timeo);
878	break;
879	}
880	continue;
881	}
882	tss.len -= ret;
883	spliced += ret;
884
885	if (!tss.len \|\| !timeo)
886	break;
887	release_sock(sk);
888	lock_sock(sk);
889
890	if (sk->sk_err \|\| sk->sk_state == TCP_CLOSE \|\|
891	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
892	signal_pending(current))
893	break;
894	}
895
896	release_sock(sk);
897
898	if (spliced)
899	return spliced;
900
901	return ret;
902	}
903	EXPORT_IPV6_MOD(tcp_splice_read);
904
905	struct sk_buff tcp_stream_alloc_skb(struct* sock *sk, gfp_t gfp,
906	bool force_schedule)
907	{
908	struct sk_buff *skb;
909
910	skb = alloc_skb_fclone(MAX_TCP_HEADER, priority: gfp);
911	if (likely(skb)) {
912	bool mem_scheduled;
913
914	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
915	if (force_schedule) {
916	mem_scheduled = true;
917	sk_forced_mem_schedule(sk, size: skb->truesize);
918	} else {
919	mem_scheduled = sk_wmem_schedule(sk, size: skb->truesize);
920	}
921	if (likely(mem_scheduled)) {
922	skb_reserve(skb, MAX_TCP_HEADER);
923	skb->ip_summed = CHECKSUM_PARTIAL;
924	INIT_LIST_HEAD(list: &skb->tcp_tsorted_anchor);
925	return skb;
926	}
927	__kfree_skb(skb);
928	} else {
929	if (!sk->sk_bypass_prot_mem)
930	tcp_enter_memory_pressure(sk);
931	sk_stream_moderate_sndbuf(sk);
932	}
933	return NULL;
934	}
935
936	static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
937	int large_allowed)
938	{
939	struct tcp_sock *tp = tcp_sk(sk);
940	u32 new_size_goal, size_goal;
941
942	if (!large_allowed)
943	return mss_now;
944
945	/ Note : tcp_tso_autosize() will eventually split this later /
946	new_size_goal = tcp_bound_to_half_wnd(tp, pktsize: sk->sk_gso_max_size);
947
948	/ We try hard to avoid divides here /
949	size_goal = tp->gso_segs * mss_now;
950	if (unlikely(new_size_goal < size_goal \|\|
951	new_size_goal >= size_goal + mss_now)) {
952	tp->gso_segs = min_t(u16, new_size_goal / mss_now,
953	sk->sk_gso_max_segs);
954	size_goal = tp->gso_segs * mss_now;
955	}
956
957	return max(size_goal, mss_now);
958	}
959
960	int tcp_send_mss(struct sock sk, int* size_goal, int* flags)
961	{
962	int mss_now;
963
964	mss_now = tcp_current_mss(sk);
965	*size_goal = tcp_xmit_size_goal(sk, mss_now, large_allowed: !(flags & MSG_OOB));
966
967	return mss_now;
968	}
969
970	/ In some cases, sendmsg() could have added an skb to the write queue,*
971	* but failed adding payload on it. We need to remove it to consume less
972	* memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
973	* epoll() users. Another reason is that tcp_write_xmit() does not like
974	* finding an empty skb in the write queue.
975	*/
976	void tcp_remove_empty_skb(struct sock *sk)
977	{
978	struct sk_buff *skb = tcp_write_queue_tail(sk);
979
980	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
981	tcp_unlink_write_queue(skb, sk);
982	if (tcp_write_queue_empty(sk))
983	tcp_chrono_stop(sk, type: TCP_CHRONO_BUSY);
984	tcp_wmem_free_skb(sk, skb);
985	}
986	}
987
988	/ skb changing from pure zc to mixed, must charge zc /
989	static int tcp_downgrade_zcopy_pure(struct sock sk, struct* sk_buff *skb)
990	{
991	if (unlikely(skb_zcopy_pure(skb))) {
992	u32 extra = skb->truesize -
993	SKB_TRUESIZE(skb_end_offset(skb));
994
995	if (!sk_wmem_schedule(sk, size: extra))
996	return -ENOMEM;
997
998	sk_mem_charge(sk, size: extra);
999	skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
1000	}
1001	return `0`;
1002	}
1003
1004
1005	int tcp_wmem_schedule(struct sock sk, int* copy)
1006	{
1007	int left;
1008
1009	if (likely(sk_wmem_schedule(sk, copy)))
1010	return copy;
1011
1012	/ We could be in trouble if we have nothing queued.*
1013	* Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
1014	* to guarantee some progress.
1015	*/
1016	left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[`0`]) - sk->sk_wmem_queued;
1017	if (left > `0`)
1018	sk_forced_mem_schedule(sk, min(left, copy));
1019	return min(copy, sk->sk_forward_alloc);
1020	}
1021
1022	void tcp_free_fastopen_req(struct tcp_sock *tp)
1023	{
1024	if (tp->fastopen_req) {
1025	kfree(objp: tp->fastopen_req);
1026	tp->fastopen_req = NULL;
1027	}
1028	}
1029
1030	int tcp_sendmsg_fastopen(struct sock sk, struct* msghdr msg, int* *copied,
1031	size_t size, struct ubuf_info *uarg)
1032	{
1033	struct tcp_sock *tp = tcp_sk(sk);
1034	struct inet_sock *inet = inet_sk(sk);
1035	struct sockaddr *uaddr = msg->msg_name;
1036	int err, flags;
1037
1038	if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
1039	TFO_CLIENT_ENABLE) \|\|
1040	(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
1041	uaddr->sa_family == AF_UNSPEC))
1042	return -EOPNOTSUPP;
1043	if (tp->fastopen_req)
1044	return -EALREADY; / Another Fast Open is in progress /
1045
1046	tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request),
1047	sk->sk_allocation);
1048	if (unlikely(!tp->fastopen_req))
1049	return -ENOBUFS;
1050	tp->fastopen_req->data = msg;
1051	tp->fastopen_req->size = size;
1052	tp->fastopen_req->uarg = uarg;
1053
1054	if (inet_test_bit(DEFER_CONNECT, sk)) {
1055	err = tcp_connect(sk);
1056	/ Same failure procedure as in tcp_v4/6_connect /
1057	if (err) {
1058	tcp_set_state(sk, state: TCP_CLOSE);
1059	inet->inet_dport = `0`;
1060	sk->sk_route_caps = `0`;
1061	}
1062	}
1063	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : `0`;
1064	err = __inet_stream_connect(sock: sk->sk_socket, uaddr: (struct sockaddr_unsized *)uaddr,
1065	addr_len: msg->msg_namelen, flags, is_sendmsg: `1`);
1066	/ fastopen_req could already be freed in __inet_stream_connect*
1067	* if the connection times out or gets rst
1068	*/
1069	if (tp->fastopen_req) {
1070	*copied = tp->fastopen_req->copied;
1071	tcp_free_fastopen_req(tp);
1072	inet_clear_bit(DEFER_CONNECT, sk);
1073	}
1074	return err;
1075	}
1076
1077	int tcp_sendmsg_locked(struct sock sk, struct* msghdr *msg, size_t size)
1078	{
1079	struct net_devmem_dmabuf_binding *binding = NULL;
1080	struct tcp_sock *tp = tcp_sk(sk);
1081	struct ubuf_info *uarg = NULL;
1082	struct sk_buff *skb;
1083	struct sockcm_cookie sockc;
1084	int flags, err, copied = `0`;
1085	int mss_now = `0`, size_goal, copied_syn = `0`;
1086	int process_backlog = `0`;
1087	int sockc_err = `0`;
1088	int zc = `0`;
1089	long timeo;
1090
1091	flags = msg->msg_flags;
1092
1093	sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
1094	if (msg->msg_controllen) {
1095	sockc_err = sock_cmsg_send(sk, msg, sockc: &sockc);
1096	/ Don't return error until MSG_FASTOPEN has been processed;*
1097	* that may succeed even if the cmsg is invalid.
1098	*/
1099	}
1100
1101	if ((flags & MSG_ZEROCOPY) && size) {
1102	if (msg->msg_ubuf) {
1103	uarg = msg->msg_ubuf;
1104	if (sk->sk_route_caps & NETIF_F_SG)
1105	zc = MSG_ZEROCOPY;
1106	} else if (sock_flag(sk, flag: SOCK_ZEROCOPY)) {
1107	skb = tcp_write_queue_tail(sk);
1108	uarg = msg_zerocopy_realloc(sk, size, uarg: skb_zcopy(skb),
1109	devmem: !sockc_err && sockc.dmabuf_id);
1110	if (!uarg) {
1111	err = -ENOBUFS;
1112	goto out_err;
1113	}
1114	if (sk->sk_route_caps & NETIF_F_SG)
1115	zc = MSG_ZEROCOPY;
1116	else
1117	uarg_to_msgzc(uarg)->zerocopy = `0`;
1118
1119	if (!sockc_err && sockc.dmabuf_id) {
1120	binding = net_devmem_get_binding(sk, dmabuf_id: sockc.dmabuf_id);
1121	if (IS_ERR(ptr: binding)) {
1122	err = PTR_ERR(ptr: binding);
1123	binding = NULL;
1124	goto out_err;
1125	}
1126	}
1127	}
1128	} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
1129	if (sk->sk_route_caps & NETIF_F_SG)
1130	zc = MSG_SPLICE_PAGES;
1131	}
1132
1133	if (!sockc_err && sockc.dmabuf_id &&
1134	(!(flags & MSG_ZEROCOPY) \|\| !sock_flag(sk, flag: SOCK_ZEROCOPY))) {
1135	err = -EINVAL;
1136	goto out_err;
1137	}
1138
1139	if (unlikely(flags & MSG_FASTOPEN \|\|
1140	inet_test_bit(DEFER_CONNECT, sk)) &&
1141	!tp->repair) {
1142	err = tcp_sendmsg_fastopen(sk, msg, copied: &copied_syn, size, uarg);
1143	if (err == -EINPROGRESS && copied_syn > `0`)
1144	goto out;
1145	else if (err)
1146	goto out_err;
1147	}
1148
1149	timeo = sock_sndtimeo(sk, noblock: flags & MSG_DONTWAIT);
1150
1151	tcp_rate_check_app_limited(sk); / is sending application-limited? /
1152
1153	/ Wait for a connection to finish. One exception is TCP Fast Open*
1154	* (passive side) where data is allowed to be sent before a connection
1155	* is fully established.
1156	*/
1157	if (((`1` << sk->sk_state) & ~(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT)) &&
1158	!tcp_passive_fastopen(sk)) {
1159	err = sk_stream_wait_connect(sk, timeo_p: &timeo);
1160	if (err != `0`)
1161	goto do_error;
1162	}
1163
1164	if (unlikely(tp->repair)) {
1165	if (tp->repair_queue == TCP_RECV_QUEUE) {
1166	copied = tcp_send_rcvq(sk, msg, size);
1167	goto out_nopush;
1168	}
1169
1170	err = -EINVAL;
1171	if (tp->repair_queue == TCP_NO_QUEUE)
1172	goto out_err;
1173
1174	/ 'common' sending to sendq /
1175	}
1176
1177	if (sockc_err) {
1178	err = sockc_err;
1179	goto out_err;
1180	}
1181
1182	/ This should be in poll /
1183	sk_clear_bit(nr: SOCKWQ_ASYNC_NOSPACE, sk);
1184
1185	/ Ok commence sending. /
1186	copied = `0`;
1187
1188	restart:
1189	mss_now = tcp_send_mss(sk, size_goal: &size_goal, flags);
1190
1191	err = -EPIPE;
1192	if (sk->sk_err \|\| (sk->sk_shutdown & SEND_SHUTDOWN))
1193	goto do_error;
1194
1195	while (msg_data_left(msg)) {
1196	int copy = `0`;
1197
1198	skb = tcp_write_queue_tail(sk);
1199	if (skb)
1200	copy = size_goal - skb->len;
1201
1202	trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);
1203
1204	if (copy <= `0` \|\| !tcp_skb_can_collapse_to(skb)) {
1205	bool first_skb;
1206
1207	new_segment:
1208	if (!sk_stream_memory_free(sk))
1209	goto wait_for_space;
1210
1211	if (unlikely(process_backlog >= `16`)) {
1212	process_backlog = `0`;
1213	if (sk_flush_backlog(sk))
1214	goto restart;
1215	}
1216	first_skb = tcp_rtx_and_write_queues_empty(sk);
1217	skb = tcp_stream_alloc_skb(sk, gfp: sk->sk_allocation,
1218	force_schedule: first_skb);
1219	if (!skb)
1220	goto wait_for_space;
1221
1222	process_backlog++;
1223
1224	#ifdef CONFIG_SKB_DECRYPTED
1225	skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
1226	#endif
1227	tcp_skb_entail(sk, skb);
1228	copy = size_goal;
1229
1230	/ All packets are restored as if they have*
1231	* already been sent. skb_mstamp_ns isn't set to
1232	* avoid wrong rtt estimation.
1233	*/
1234	if (tp->repair)
1235	TCP_SKB_CB(skb)->sacked \|= TCPCB_REPAIRED;
1236	}
1237
1238	/ Try to append data to the end of skb. /
1239	if (copy > msg_data_left(msg))
1240	copy = msg_data_left(msg);
1241
1242	if (zc == `0`) {
1243	bool merge = true;
1244	int i = skb_shinfo(skb)->nr_frags;
1245	struct page_frag *pfrag = sk_page_frag(sk);
1246
1247	if (!sk_page_frag_refill(sk, pfrag))
1248	goto wait_for_space;
1249
1250	if (!skb_can_coalesce(skb, i, page: pfrag->page,
1251	off: pfrag->offset)) {
1252	if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
1253	tcp_mark_push(tp, skb);
1254	goto new_segment;
1255	}
1256	merge = false;
1257	}
1258
1259	copy = min_t(int, copy, pfrag->size - pfrag->offset);
1260
1261	if (unlikely(skb_zcopy_pure(skb) \|\| skb_zcopy_managed(skb))) {
1262	if (tcp_downgrade_zcopy_pure(sk, skb))
1263	goto wait_for_space;
1264	skb_zcopy_downgrade_managed(skb);
1265	}
1266
1267	copy = tcp_wmem_schedule(sk, copy);
1268	if (!copy)
1269	goto wait_for_space;
1270
1271	err = skb_copy_to_page_nocache(sk, from: &msg->msg_iter, skb,
1272	page: pfrag->page,
1273	off: pfrag->offset,
1274	copy);
1275	if (err)
1276	goto do_error;
1277
1278	/ Update the skb. /
1279	if (merge) {
1280	skb_frag_size_add(frag: &skb_shinfo(skb)->frags[i - `1`], delta: copy);
1281	} else {
1282	skb_fill_page_desc(skb, i, page: pfrag->page,
1283	off: pfrag->offset, size: copy);
1284	page_ref_inc(page: pfrag->page);
1285	}
1286	pfrag->offset += copy;
1287	} else if (zc == MSG_ZEROCOPY) {
1288	/ First append to a fragless skb builds initial*
1289	* pure zerocopy skb
1290	*/
1291	if (!skb->len)
1292	skb_shinfo(skb)->flags \|= SKBFL_PURE_ZEROCOPY;
1293
1294	if (!skb_zcopy_pure(skb)) {
1295	copy = tcp_wmem_schedule(sk, copy);
1296	if (!copy)
1297	goto wait_for_space;
1298	}
1299
1300	err = skb_zerocopy_iter_stream(sk, skb, msg, len: copy, uarg,
1301	binding);
1302	if (err == -EMSGSIZE \|\| err == -EEXIST) {
1303	tcp_mark_push(tp, skb);
1304	goto new_segment;
1305	}
1306	if (err < `0`)
1307	goto do_error;
1308	copy = err;
1309	} else if (zc == MSG_SPLICE_PAGES) {
1310	/ Splice in data if we can; copy if we can't. /
1311	if (tcp_downgrade_zcopy_pure(sk, skb))
1312	goto wait_for_space;
1313	copy = tcp_wmem_schedule(sk, copy);
1314	if (!copy)
1315	goto wait_for_space;
1316
1317	err = skb_splice_from_iter(skb, iter: &msg->msg_iter, maxsize: copy);
1318	if (err < `0`) {
1319	if (err == -EMSGSIZE) {
1320	tcp_mark_push(tp, skb);
1321	goto new_segment;
1322	}
1323	goto do_error;
1324	}
1325	copy = err;
1326
1327	if (!(flags & MSG_NO_SHARED_FRAGS))
1328	skb_shinfo(skb)->flags \|= SKBFL_SHARED_FRAG;
1329
1330	sk_wmem_queued_add(sk, val: copy);
1331	sk_mem_charge(sk, size: copy);
1332	}
1333
1334	if (!copied)
1335	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
1336
1337	WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
1338	TCP_SKB_CB(skb)->end_seq += copy;
1339	tcp_skb_pcount_set(skb, segs: `0`);
1340
1341	copied += copy;
1342	if (!msg_data_left(msg)) {
1343	if (unlikely(flags & MSG_EOR))
1344	TCP_SKB_CB(skb)->eor = `1`;
1345	goto out;
1346	}
1347
1348	if (skb->len < size_goal \|\| (flags & MSG_OOB) \|\| unlikely(tp->repair))
1349	continue;
1350
1351	if (forced_push(tp)) {
1352	tcp_mark_push(tp, skb);
1353	__tcp_push_pending_frames(sk, cur_mss: mss_now, TCP_NAGLE_PUSH);
1354	} else if (skb == tcp_send_head(sk))
1355	tcp_push_one(sk, mss_now);
1356	continue;
1357
1358	wait_for_space:
1359	set_bit(nr: SOCK_NOSPACE, addr: &sk->sk_socket->flags);
1360	tcp_remove_empty_skb(sk);
1361	if (copied)
1362	tcp_push(sk, flags: flags & ~MSG_MORE, mss_now,
1363	TCP_NAGLE_PUSH, size_goal);
1364
1365	err = sk_stream_wait_memory(sk, timeo_p: &timeo);
1366	if (err != `0`)
1367	goto do_error;
1368
1369	mss_now = tcp_send_mss(sk, size_goal: &size_goal, flags);
1370	}
1371
1372	out:
1373	if (copied) {
1374	tcp_tx_timestamp(sk, sockc: &sockc);
1375	tcp_push(sk, flags, mss_now, nonagle: tp->nonagle, size_goal);
1376	}
1377	out_nopush:
1378	/ msg->msg_ubuf is pinned by the caller so we don't take extra refs /
1379	if (uarg && !msg->msg_ubuf)
1380	net_zcopy_put(uarg);
1381	if (binding)
1382	net_devmem_dmabuf_binding_put(binding);
1383	return copied + copied_syn;
1384
1385	do_error:
1386	tcp_remove_empty_skb(sk);
1387
1388	if (copied + copied_syn)
1389	goto out;
1390	out_err:
1391	/ msg->msg_ubuf is pinned by the caller so we don't take extra refs /
1392	if (uarg && !msg->msg_ubuf)
1393	net_zcopy_put_abort(uarg, have_uref: true);
1394	err = sk_stream_error(sk, flags, err);
1395	/ make sure we wake any epoll edge trigger waiter /
1396	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
1397	sk->sk_write_space(sk);
1398	tcp_chrono_stop(sk, type: TCP_CHRONO_SNDBUF_LIMITED);
1399	}
1400	if (binding)
1401	net_devmem_dmabuf_binding_put(binding);
1402
1403	return err;
1404	}
1405	EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
1406
1407	int tcp_sendmsg(struct sock sk, struct* msghdr *msg, size_t size)
1408	{
1409	int ret;
1410
1411	lock_sock(sk);
1412	ret = tcp_sendmsg_locked(sk, msg, size);
1413	release_sock(sk);
1414
1415	return ret;
1416	}
1417	EXPORT_SYMBOL(tcp_sendmsg);
1418
1419	void tcp_splice_eof(struct socket *sock)
1420	{
1421	struct sock *sk = sock->sk;
1422	struct tcp_sock *tp = tcp_sk(sk);
1423	int mss_now, size_goal;
1424
1425	if (!tcp_write_queue_tail(sk))
1426	return;
1427
1428	lock_sock(sk);
1429	mss_now = tcp_send_mss(sk, size_goal: &size_goal, flags: `0`);
1430	tcp_push(sk, flags: `0`, mss_now, nonagle: tp->nonagle, size_goal);
1431	release_sock(sk);
1432	}
1433	EXPORT_IPV6_MOD_GPL(tcp_splice_eof);
1434
1435	/*
1436	* Handle reading urgent data. BSD has very simple semantics for
1437	* this, no blocking and very strange errors 8)
1438	*/
1439
1440	static int tcp_recv_urg(struct sock sk, struct* msghdr msg, int* len, int flags)
1441	{
1442	struct tcp_sock *tp = tcp_sk(sk);
1443
1444	/ No URG data to read. /
1445	if (sock_flag(sk, flag: SOCK_URGINLINE) \|\| !tp->urg_data \|\|
1446	tp->urg_data == TCP_URG_READ)
1447	return -EINVAL; / Yes this is right ! /
1448
1449	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, flag: SOCK_DONE))
1450	return -ENOTCONN;
1451
1452	if (tp->urg_data & TCP_URG_VALID) {
1453	int err = `0`;
1454	char c = tp->urg_data;
1455
1456	if (!(flags & MSG_PEEK))
1457	WRITE_ONCE(tp->urg_data, TCP_URG_READ);
1458
1459	/ Read urgent data. /
1460	msg->msg_flags \|= MSG_OOB;
1461
1462	if (len > `0`) {
1463	if (!(flags & MSG_TRUNC))
1464	err = memcpy_to_msg(msg, data: &c, len: `1`);
1465	len = `1`;
1466	} else
1467	msg->msg_flags \|= MSG_TRUNC;
1468
1469	return err ? -EFAULT : len;
1470	}
1471
1472	if (sk->sk_state == TCP_CLOSE \|\| (sk->sk_shutdown & RCV_SHUTDOWN))
1473	return `0`;
1474
1475	/ Fixed the recv(..., MSG_OOB) behaviour. BSD docs and*
1476	* the available implementations agree in this case:
1477	* this call should never block, independent of the
1478	* blocking state of the socket.
1479	* Mike <pall@rz.uni-karlsruhe.de>
1480	*/
1481	return -EAGAIN;
1482	}
1483
1484	static int tcp_peek_sndq(struct sock sk, struct* msghdr msg, int* len)
1485	{
1486	struct sk_buff *skb;
1487	int copied = `0`, err = `0`;
1488
1489	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
1490	err = skb_copy_datagram_msg(from: skb, offset: `0`, msg, size: skb->len);
1491	if (err)
1492	return err;
1493	copied += skb->len;
1494	}
1495
1496	skb_queue_walk(&sk->sk_write_queue, skb) {
1497	err = skb_copy_datagram_msg(from: skb, offset: `0`, msg, size: skb->len);
1498	if (err)
1499	break;
1500
1501	copied += skb->len;
1502	}
1503
1504	return err ?: copied;
1505	}
1506
1507	/ Clean up the receive buffer for full frames taken by the user,*
1508	* then send an ACK if necessary. COPIED is the number of bytes
1509	* tcp_recvmsg has given to the user so far, it speeds up the
1510	* calculation of whether or not we must ACK for the sake of
1511	* a window update.
1512	*/
1513	void __tcp_cleanup_rbuf(struct sock sk, int* copied)
1514	{
1515	struct tcp_sock *tp = tcp_sk(sk);
1516	bool time_to_ack = false;
1517
1518	if (inet_csk_ack_scheduled(sk)) {
1519	const struct inet_connection_sock *icsk = inet_csk(sk);
1520
1521	if (/ Once-per-two-segments ACK was not sent by tcp_input.c /
1522	tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss \|\|
1523	/*
1524	* If this read emptied read buffer, we send ACK, if
1525	* connection is not bidirectional, user drained
1526	* receive buffer and there was a small segment
1527	* in queue.
1528	*/
1529	(copied > `0` &&
1530	((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) \|\|
1531	((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1532	!inet_csk_in_pingpong_mode(sk))) &&
1533	!atomic_read(v: &sk->sk_rmem_alloc)))
1534	time_to_ack = true;
1535	}
1536
1537	/ We send an ACK if we can now advertise a non-zero window*
1538	* which has been raised "significantly".
1539	*
1540	* Even if window raised up to infinity, do not send window open ACK
1541	* in states, where we will not receive more. It is useless.
1542	*/
1543	if (copied > `0` && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1544	__u32 rcv_window_now = tcp_receive_window(tp);
1545
1546	/ Optimize, __tcp_select_window() is not cheap. /
1547	if (`2`*rcv_window_now <= tp->window_clamp) {
1548	__u32 new_window = __tcp_select_window(sk);
1549
1550	/ Send ACK now, if this read freed lots of space*
1551	* in our buffer. Certainly, new_window is new window.
1552	* We can advertise it now, if it is not less than current one.
1553	* "Lots" means "at least twice" here.
1554	*/
1555	if (new_window && new_window >= `2` * rcv_window_now)
1556	time_to_ack = true;
1557	}
1558	}
1559	if (time_to_ack) {
1560	tcp_mstamp_refresh(tp);
1561	tcp_send_ack(sk);
1562	}
1563	}
1564
1565	void tcp_cleanup_rbuf(struct sock sk, int* copied)
1566	{
1567	struct sk_buff *skb = skb_peek(list_: &sk->sk_receive_queue);
1568	struct tcp_sock *tp = tcp_sk(sk);
1569
1570	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1571	"cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1572	tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1573	__tcp_cleanup_rbuf(sk, copied);
1574	}
1575
1576	static void tcp_eat_recv_skb(struct sock sk, struct* sk_buff *skb)
1577	{
1578	__skb_unlink(skb, list: &sk->sk_receive_queue);
1579	if (likely(skb->destructor == sock_rfree)) {
1580	sock_rfree(skb);
1581	skb->destructor = NULL;
1582	skb->sk = NULL;
1583	return skb_attempt_defer_free(skb);
1584	}
1585	__kfree_skb(skb);
1586	}
1587
1588	struct sk_buff tcp_recv_skb(struct* sock sk, u32 seq, u32 off)
1589	{
1590	struct sk_buff *skb;
1591	u32 offset;
1592
1593	while ((skb = skb_peek(list_: &sk->sk_receive_queue)) != NULL) {
1594	offset = seq - TCP_SKB_CB(skb)->seq;
1595	if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
1596	pr_err_once("%s: found a SYN, please report !\n", __func__);
1597	offset--;
1598	}
1599	if (offset < skb->len \|\| (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
1600	*off = offset;
1601	return skb;
1602	}
1603	/ This looks weird, but this can happen if TCP collapsing*
1604	* splitted a fat GRO packet, while we released socket lock
1605	* in skb_splice_bits()
1606	*/
1607	tcp_eat_recv_skb(sk, skb);
1608	}
1609	return NULL;
1610	}
1611	EXPORT_SYMBOL(tcp_recv_skb);
1612
1613	/*
1614	* This routine provides an alternative to tcp_recvmsg() for routines
1615	* that would like to handle copying from skbuffs directly in 'sendfile'
1616	* fashion.
1617	* Note:
1618	* - It is assumed that the socket was locked by the caller.
1619	* - The routine does not block.
1620	* - At present, there is no support for reading OOB data
1621	* or for 'peeking' the socket using this routine
1622	* (although both would be easy to implement).
1623	*/
1624	static int __tcp_read_sock(struct sock sk, read_descriptor_t desc,
1625	sk_read_actor_t recv_actor, bool noack,
1626	u32 *copied_seq)
1627	{
1628	struct sk_buff *skb;
1629	struct tcp_sock *tp = tcp_sk(sk);
1630	u32 seq = *copied_seq;
1631	u32 offset;
1632	int copied = `0`;
1633
1634	if (sk->sk_state == TCP_LISTEN)
1635	return -ENOTCONN;
1636	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1637	if (offset < skb->len) {
1638	int used;
1639	size_t len;
1640
1641	len = skb->len - offset;
1642	/ Stop reading if we hit a patch of urgent data /
1643	if (unlikely(tp->urg_data)) {
1644	u32 urg_offset = tp->urg_seq - seq;
1645	if (urg_offset < len)
1646	len = urg_offset;
1647	if (!len)
1648	break;
1649	}
1650	used = recv_actor(desc, skb, offset, len);
1651	if (used <= `0`) {
1652	if (!copied)
1653	copied = used;
1654	break;
1655	}
1656	if (WARN_ON_ONCE(used > len))
1657	used = len;
1658	seq += used;
1659	copied += used;
1660	offset += used;
1661
1662	/ If recv_actor drops the lock (e.g. TCP splice*
1663	* receive) the skb pointer might be invalid when
1664	* getting here: tcp_collapse might have deleted it
1665	* while aggregating skbs from the socket queue.
1666	*/
1667	skb = tcp_recv_skb(sk, seq - `1`, &offset);
1668	if (!skb)
1669	break;
1670	/ TCP coalescing might have appended data to the skb.*
1671	* Try to splice more frags
1672	*/
1673	if (offset + `1` != skb->len)
1674	continue;
1675	}
1676	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1677	tcp_eat_recv_skb(sk, skb);
1678	++seq;
1679	break;
1680	}
1681	tcp_eat_recv_skb(sk, skb);
1682	if (!desc->count)
1683	break;
1684	WRITE_ONCE(*copied_seq, seq);
1685	}
1686	WRITE_ONCE(*copied_seq, seq);
1687
1688	if (noack)
1689	goto out;
1690
1691	tcp_rcv_space_adjust(sk);
1692
1693	/ Clean up data we have read: This will do ACK frames. /
1694	if (copied > `0`) {
1695	tcp_recv_skb(sk, seq, &offset);
1696	tcp_cleanup_rbuf(sk, copied);
1697	}
1698	out:
1699	return copied;
1700	}
1701
1702	int tcp_read_sock(struct sock sk, read_descriptor_t desc,
1703	sk_read_actor_t recv_actor)
1704	{
1705	return __tcp_read_sock(sk, desc, recv_actor, noack: false,
1706	copied_seq: &tcp_sk(sk)->copied_seq);
1707	}
1708	EXPORT_SYMBOL(tcp_read_sock);
1709
1710	int tcp_read_sock_noack(struct sock sk, read_descriptor_t desc,
1711	sk_read_actor_t recv_actor, bool noack,
1712	u32 *copied_seq)
1713	{
1714	return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
1715	}
1716
1717	int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
1718	{
1719	struct sk_buff *skb;
1720	int copied = `0`;
1721
1722	if (sk->sk_state == TCP_LISTEN)
1723	return -ENOTCONN;
1724
1725	while ((skb = skb_peek(list_: &sk->sk_receive_queue)) != NULL) {
1726	u8 tcp_flags;
1727	int used;
1728
1729	__skb_unlink(skb, list: &sk->sk_receive_queue);
1730	WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
1731	tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
1732	used = recv_actor(sk, skb);
1733	if (used < `0`) {
1734	if (!copied)
1735	copied = used;
1736	break;
1737	}
1738	copied += used;
1739
1740	if (tcp_flags & TCPHDR_FIN)
1741	break;
1742	}
1743	return copied;
1744	}
1745	EXPORT_IPV6_MOD(tcp_read_skb);
1746
1747	void tcp_read_done(struct sock *sk, size_t len)
1748	{
1749	struct tcp_sock *tp = tcp_sk(sk);
1750	u32 seq = tp->copied_seq;
1751	struct sk_buff *skb;
1752	size_t left;
1753	u32 offset;
1754
1755	if (sk->sk_state == TCP_LISTEN)
1756	return;
1757
1758	left = len;
1759	while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1760	int used;
1761
1762	used = min_t(size_t, skb->len - offset, left);
1763	seq += used;
1764	left -= used;
1765
1766	if (skb->len > offset + used)
1767	break;
1768
1769	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
1770	tcp_eat_recv_skb(sk, skb);
1771	++seq;
1772	break;
1773	}
1774	tcp_eat_recv_skb(sk, skb);
1775	}
1776	WRITE_ONCE(tp->copied_seq, seq);
1777
1778	tcp_rcv_space_adjust(sk);
1779
1780	/ Clean up data we have read: This will do ACK frames. /
1781	if (left != len)
1782	tcp_cleanup_rbuf(sk, copied: len - left);
1783	}
1784	EXPORT_SYMBOL(tcp_read_done);
1785
1786	int tcp_peek_len(struct socket *sock)
1787	{
1788	return tcp_inq(sk: sock->sk);
1789	}
1790	EXPORT_IPV6_MOD(tcp_peek_len);
1791
1792	/ Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint /
1793	int tcp_set_rcvlowat(struct sock sk, int* val)
1794	{
1795	struct tcp_sock *tp = tcp_sk(sk);
1796	int space, cap;
1797
1798	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1799	cap = sk->sk_rcvbuf >> `1`;
1800	else
1801	cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`2`]) >> `1`;
1802	val = min(val, cap);
1803	WRITE_ONCE(sk->sk_rcvlowat, val ? : `1`);
1804
1805	/ Check if we need to signal EPOLLIN right now /
1806	tcp_data_ready(sk);
1807
1808	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
1809	return `0`;
1810
1811	space = tcp_space_from_win(sk, win: val);
1812	if (space > sk->sk_rcvbuf) {
1813	WRITE_ONCE(sk->sk_rcvbuf, space);
1814
1815	if (tp->window_clamp && tp->window_clamp < val)
1816	WRITE_ONCE(tp->window_clamp, val);
1817	}
1818	return `0`;
1819	}
1820	EXPORT_IPV6_MOD(tcp_set_rcvlowat);
1821
1822	void tcp_update_recv_tstamps(struct sk_buff *skb,
1823	struct scm_timestamping_internal *tss)
1824	{
1825	if (skb->tstamp)
1826	tss->ts[`0`] = ktime_to_timespec64(skb->tstamp);
1827	else
1828	tss->ts[`0`] = (struct timespec64) {`0`};
1829
1830	if (skb_hwtstamps(skb)->hwtstamp)
1831	tss->ts[`2`] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
1832	else
1833	tss->ts[`2`] = (struct timespec64) {`0`};
1834	}
1835
1836	#ifdef CONFIG_MMU
1837	static const struct vm_operations_struct tcp_vm_ops = {
1838	};
1839
1840	int tcp_mmap(struct file file, struct* socket *sock,
1841	struct vm_area_struct *vma)
1842	{
1843	if (vma->vm_flags & (VM_WRITE \| VM_EXEC))
1844	return -EPERM;
1845	vm_flags_clear(vma, VM_MAYWRITE \| VM_MAYEXEC);
1846
1847	/ Instruct vm_insert_page() to not mmap_read_lock(mm) /
1848	vm_flags_set(vma, VM_MIXEDMAP);
1849
1850	vma->vm_ops = &tcp_vm_ops;
1851	return `0`;
1852	}
1853	EXPORT_IPV6_MOD(tcp_mmap);
1854
1855	static skb_frag_t skb_advance_to_frag(struct* sk_buff *skb, u32 offset_skb,
1856	u32 *offset_frag)
1857	{
1858	skb_frag_t *frag;
1859
1860	if (unlikely(offset_skb >= skb->len))
1861	return NULL;
1862
1863	offset_skb -= skb_headlen(skb);
1864	if ((int)offset_skb < `0` \|\| skb_has_frag_list(skb))
1865	return NULL;
1866
1867	frag = skb_shinfo(skb)->frags;
1868	while (offset_skb) {
1869	if (skb_frag_size(frag) > offset_skb) {
1870	*offset_frag = offset_skb;
1871	return frag;
1872	}
1873	offset_skb -= skb_frag_size(frag);
1874	++frag;
1875	}
1876	*offset_frag = `0`;
1877	return frag;
1878	}
1879
1880	static bool can_map_frag(const skb_frag_t *frag)
1881	{
1882	struct page *page;
1883
1884	if (skb_frag_size(frag) != PAGE_SIZE \|\| skb_frag_off(frag))
1885	return false;
1886
1887	page = skb_frag_page(frag);
1888
1889	if (PageCompound(page) \|\| page->mapping)
1890	return false;
1891
1892	return true;
1893	}
1894
1895	static int find_next_mappable_frag(const skb_frag_t *frag,
1896	int remaining_in_skb)
1897	{
1898	int offset = `0`;
1899
1900	if (likely(can_map_frag(frag)))
1901	return `0`;
1902
1903	while (offset < remaining_in_skb && !can_map_frag(frag)) {
1904	offset += skb_frag_size(frag);
1905	++frag;
1906	}
1907	return offset;
1908	}
1909
1910	static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
1911	struct tcp_zerocopy_receive *zc,
1912	struct sk_buff *skb, u32 offset)
1913	{
1914	u32 frag_offset, partial_frag_remainder = `0`;
1915	int mappable_offset;
1916	skb_frag_t *frag;
1917
1918	/ worst case: skip to next skb. try to improve on this case below /
1919	zc->recv_skip_hint = skb->len - offset;
1920
1921	/ Find the frag containing this offset (and how far into that frag) /
1922	frag = skb_advance_to_frag(skb, offset_skb: offset, offset_frag: &frag_offset);
1923	if (!frag)
1924	return;
1925
1926	if (frag_offset) {
1927	struct skb_shared_info *info = skb_shinfo(skb);
1928
1929	/ We read part of the last frag, must recvmsg() rest of skb. /
1930	if (frag == &info->frags[info->nr_frags - `1`])
1931	return;
1932
1933	/ Else, we must at least read the remainder in this frag. /
1934	partial_frag_remainder = skb_frag_size(frag) - frag_offset;
1935	zc->recv_skip_hint -= partial_frag_remainder;
1936	++frag;
1937	}
1938
1939	/ partial_frag_remainder: If part way through a frag, must read rest.*
1940	* mappable_offset: Bytes till next mappable frag, not counting bytes
1941	* in partial_frag_remainder.
1942	*/
1943	mappable_offset = find_next_mappable_frag(frag, remaining_in_skb: zc->recv_skip_hint);
1944	zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
1945	}
1946
1947	static int tcp_recvmsg_locked(struct sock sk, struct* msghdr *msg, size_t len,
1948	int flags, struct scm_timestamping_internal *tss,
1949	int *cmsg_flags);
1950	static int receive_fallback_to_copy(struct sock *sk,
1951	struct tcp_zerocopy_receive zc, int* inq,
1952	struct scm_timestamping_internal *tss)
1953	{
1954	unsigned long copy_address = (unsigned long)zc->copybuf_address;
1955	struct msghdr msg = {};
1956	int err;
1957
1958	zc->length = `0`;
1959	zc->recv_skip_hint = `0`;
1960
1961	if (copy_address != zc->copybuf_address)
1962	return -EINVAL;
1963
1964	err = import_ubuf(ITER_DEST, buf: (void __user *)copy_address, len: inq,
1965	i: &msg.msg_iter);
1966	if (err)
1967	return err;
1968
1969	err = tcp_recvmsg_locked(sk, msg: &msg, len: inq, MSG_DONTWAIT,
1970	tss, cmsg_flags: &zc->msg_flags);
1971	if (err < `0`)
1972	return err;
1973
1974	zc->copybuf_len = err;
1975	if (likely(zc->copybuf_len)) {
1976	struct sk_buff *skb;
1977	u32 offset;
1978
1979	skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
1980	if (skb)
1981	tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
1982	}
1983	return `0`;
1984	}
1985
1986	static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
1987	struct sk_buff *skb, u32 copylen,
1988	u32 offset, u32 seq)
1989	{
1990	unsigned long copy_address = (unsigned long)zc->copybuf_address;
1991	struct msghdr msg = {};
1992	int err;
1993
1994	if (copy_address != zc->copybuf_address)
1995	return -EINVAL;
1996
1997	err = import_ubuf(ITER_DEST, buf: (void __user *)copy_address, len: copylen,
1998	i: &msg.msg_iter);
1999	if (err)
2000	return err;
2001	err = skb_copy_datagram_msg(from: skb, offset: *offset, msg: &msg, size: copylen);
2002	if (err)
2003	return err;
2004	zc->recv_skip_hint -= copylen;
2005	*offset += copylen;
2006	*seq += copylen;
2007	return (__s32)copylen;
2008	}
2009
2010	static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
2011	struct sock *sk,
2012	struct sk_buff *skb,
2013	u32 *seq,
2014	s32 copybuf_len,
2015	struct scm_timestamping_internal *tss)
2016	{
2017	u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
2018
2019	if (!copylen)
2020	return `0`;
2021	/ skb is null if inq < PAGE_SIZE. /
2022	if (skb) {
2023	offset = *seq - TCP_SKB_CB(skb)->seq;
2024	} else {
2025	skb = tcp_recv_skb(sk, *seq, &offset);
2026	if (TCP_SKB_CB(skb)->has_rxtstamp) {
2027	tcp_update_recv_tstamps(skb, tss);
2028	zc->msg_flags \|= TCP_CMSG_TS;
2029	}
2030	}
2031
2032	zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, offset: &offset,
2033	seq);
2034	return zc->copybuf_len < `0` ? `0` : copylen;
2035	}
2036
2037	static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
2038	struct page **pending_pages,
2039	unsigned long pages_remaining,
2040	unsigned long *address,
2041	u32 *length,
2042	u32 *seq,
2043	struct tcp_zerocopy_receive *zc,
2044	u32 total_bytes_to_map,
2045	int err)
2046	{
2047	/ At least one page did not map. Try zapping if we skipped earlier. /
2048	if (err == -EBUSY &&
2049	zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
2050	u32 maybe_zap_len;
2051
2052	maybe_zap_len = total_bytes_to_map - / All bytes to map /
2053	length + /* Mapped or pending /
2054	(pages_remaining * PAGE_SIZE); / Failed map. /
2055	zap_page_range_single(vma, address: *address, size: maybe_zap_len, NULL);
2056	err = `0`;
2057	}
2058
2059	if (!err) {
2060	unsigned long leftover_pages = pages_remaining;
2061	int bytes_mapped;
2062
2063	/ We called zap_page_range_single, try to reinsert. /
2064	err = vm_insert_pages(vma, addr: *address,
2065	pages: pending_pages,
2066	num: &pages_remaining);
2067	bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
2068	*seq += bytes_mapped;
2069	*address += bytes_mapped;
2070	}
2071	if (err) {
2072	/ Either we were unable to zap, OR we zapped, retried an*
2073	* insert, and still had an issue. Either ways, pages_remaining
2074	* is the number of pages we were unable to map, and we unroll
2075	* some state we speculatively touched before.
2076	*/
2077	const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
2078
2079	*length -= bytes_not_mapped;
2080	zc->recv_skip_hint += bytes_not_mapped;
2081	}
2082	return err;
2083	}
2084
2085	static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
2086	struct page **pages,
2087	unsigned int pages_to_map,
2088	unsigned long *address,
2089	u32 *length,
2090	u32 *seq,
2091	struct tcp_zerocopy_receive *zc,
2092	u32 total_bytes_to_map)
2093	{
2094	unsigned long pages_remaining = pages_to_map;
2095	unsigned int pages_mapped;
2096	unsigned int bytes_mapped;
2097	int err;
2098
2099	err = vm_insert_pages(vma, addr: *address, pages, num: &pages_remaining);
2100	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
2101	bytes_mapped = PAGE_SIZE * pages_mapped;
2102	/ Even if vm_insert_pages fails, it may have partially succeeded in*
2103	* mapping (some but not all of the pages).
2104	*/
2105	*seq += bytes_mapped;
2106	*address += bytes_mapped;
2107
2108	if (likely(!err))
2109	return `0`;
2110
2111	/ Error: maybe zap and retry + rollback state for failed inserts. /
2112	return tcp_zerocopy_vm_insert_batch_error(vma, pending_pages: pages + pages_mapped,
2113	pages_remaining, address, length, seq, zc, total_bytes_to_map,
2114	err);
2115	}
2116
2117	#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
2118	static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
2119	struct tcp_zerocopy_receive *zc,
2120	struct scm_timestamping_internal *tss)
2121	{
2122	unsigned long msg_control_addr;
2123	struct msghdr cmsg_dummy;
2124
2125	msg_control_addr = (unsigned long)zc->msg_control;
2126	cmsg_dummy.msg_control_user = (void __user *)msg_control_addr;
2127	cmsg_dummy.msg_controllen =
2128	(__kernel_size_t)zc->msg_controllen;
2129	cmsg_dummy.msg_flags = in_compat_syscall()
2130	? MSG_CMSG_COMPAT : `0`;
2131	cmsg_dummy.msg_control_is_user = true;
2132	zc->msg_flags = `0`;
2133	if (zc->msg_control == msg_control_addr &&
2134	zc->msg_controllen == cmsg_dummy.msg_controllen) {
2135	tcp_recv_timestamp(msg: &cmsg_dummy, sk, tss);
2136	zc->msg_control = (__u64)
2137	((uintptr_t)cmsg_dummy.msg_control_user);
2138	zc->msg_controllen =
2139	(__u64)cmsg_dummy.msg_controllen;
2140	zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
2141	}
2142	}
2143
2144	static struct vm_area_struct find_tcp_vma(struct* mm_struct *mm,
2145	unsigned long address,
2146	bool *mmap_locked)
2147	{
2148	struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
2149
2150	if (vma) {
2151	if (vma->vm_ops != &tcp_vm_ops) {
2152	vma_end_read(vma);
2153	return NULL;
2154	}
2155	*mmap_locked = false;
2156	return vma;
2157	}
2158
2159	mmap_read_lock(mm);
2160	vma = vma_lookup(mm, addr: address);
2161	if (!vma \|\| vma->vm_ops != &tcp_vm_ops) {
2162	mmap_read_unlock(mm);
2163	return NULL;
2164	}
2165	*mmap_locked = true;
2166	return vma;
2167	}
2168
2169	#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
2170	static int tcp_zerocopy_receive(struct sock *sk,
2171	struct tcp_zerocopy_receive *zc,
2172	struct scm_timestamping_internal *tss)
2173	{
2174	u32 length = `0`, offset, vma_len, avail_len, copylen = `0`;
2175	unsigned long address = (unsigned long)zc->address;
2176	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
2177	s32 copybuf_len = zc->copybuf_len;
2178	struct tcp_sock *tp = tcp_sk(sk);
2179	const skb_frag_t *frags = NULL;
2180	unsigned int pages_to_map = `0`;
2181	struct vm_area_struct *vma;
2182	struct sk_buff *skb = NULL;
2183	u32 seq = tp->copied_seq;
2184	u32 total_bytes_to_map;
2185	int inq = tcp_inq(sk);
2186	bool mmap_locked;
2187	int ret;
2188
2189	zc->copybuf_len = `0`;
2190	zc->msg_flags = `0`;
2191
2192	if (address & (PAGE_SIZE - `1`) \|\| address != zc->address)
2193	return -EINVAL;
2194
2195	if (sk->sk_state == TCP_LISTEN)
2196	return -ENOTCONN;
2197
2198	sock_rps_record_flow(sk);
2199
2200	if (inq && inq <= copybuf_len)
2201	return receive_fallback_to_copy(sk, zc, inq, tss);
2202
2203	if (inq < PAGE_SIZE) {
2204	zc->length = `0`;
2205	zc->recv_skip_hint = inq;
2206	if (!inq && sock_flag(sk, flag: SOCK_DONE))
2207	return -EIO;
2208	return `0`;
2209	}
2210
2211	vma = find_tcp_vma(current->mm, address, mmap_locked: &mmap_locked);
2212	if (!vma)
2213	return -EINVAL;
2214
2215	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
2216	avail_len = min_t(u32, vma_len, inq);
2217	total_bytes_to_map = avail_len & ~(PAGE_SIZE - `1`);
2218	if (total_bytes_to_map) {
2219	if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
2220	zap_page_range_single(vma, address, size: total_bytes_to_map,
2221	NULL);
2222	zc->length = total_bytes_to_map;
2223	zc->recv_skip_hint = `0`;
2224	} else {
2225	zc->length = avail_len;
2226	zc->recv_skip_hint = avail_len;
2227	}
2228	ret = `0`;
2229	while (length + PAGE_SIZE <= zc->length) {
2230	int mappable_offset;
2231	struct page *page;
2232
2233	if (zc->recv_skip_hint < PAGE_SIZE) {
2234	u32 offset_frag;
2235
2236	if (skb) {
2237	if (zc->recv_skip_hint > `0`)
2238	break;
2239	skb = skb->next;
2240	offset = seq - TCP_SKB_CB(skb)->seq;
2241	} else {
2242	skb = tcp_recv_skb(sk, seq, &offset);
2243	}
2244
2245	if (!skb_frags_readable(skb))
2246	break;
2247
2248	if (TCP_SKB_CB(skb)->has_rxtstamp) {
2249	tcp_update_recv_tstamps(skb, tss);
2250	zc->msg_flags \|= TCP_CMSG_TS;
2251	}
2252	zc->recv_skip_hint = skb->len - offset;
2253	frags = skb_advance_to_frag(skb, offset_skb: offset, offset_frag: &offset_frag);
2254	if (!frags \|\| offset_frag)
2255	break;
2256	}
2257
2258	mappable_offset = find_next_mappable_frag(frag: frags,
2259	remaining_in_skb: zc->recv_skip_hint);
2260	if (mappable_offset) {
2261	zc->recv_skip_hint = mappable_offset;
2262	break;
2263	}
2264	page = skb_frag_page(frag: frags);
2265	if (WARN_ON_ONCE(!page))
2266	break;
2267
2268	prefetchw(x: page);
2269	pages[pages_to_map++] = page;
2270	length += PAGE_SIZE;
2271	zc->recv_skip_hint -= PAGE_SIZE;
2272	frags++;
2273	if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE \|\|
2274	zc->recv_skip_hint < PAGE_SIZE) {
2275	/ Either full batch, or we're about to go to next skb*
2276	* (and we cannot unroll failed ops across skbs).
2277	*/
2278	ret = tcp_zerocopy_vm_insert_batch(vma, pages,
2279	pages_to_map,
2280	address: &address, length: &length,
2281	seq: &seq, zc,
2282	total_bytes_to_map);
2283	if (ret)
2284	goto out;
2285	pages_to_map = `0`;
2286	}
2287	}
2288	if (pages_to_map) {
2289	ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
2290	address: &address, length: &length, seq: &seq,
2291	zc, total_bytes_to_map);
2292	}
2293	out:
2294	if (mmap_locked)
2295	mmap_read_unlock(current->mm);
2296	else
2297	vma_end_read(vma);
2298	/ Try to copy straggler data. /
2299	if (!ret)
2300	copylen = tcp_zc_handle_leftover(zc, sk, skb, seq: &seq, copybuf_len, tss);
2301
2302	if (length + copylen) {
2303	WRITE_ONCE(tp->copied_seq, seq);
2304	tcp_rcv_space_adjust(sk);
2305
2306	/ Clean up data we have read: This will do ACK frames. /
2307	tcp_recv_skb(sk, seq, &offset);
2308	tcp_cleanup_rbuf(sk, copied: length + copylen);
2309	ret = `0`;
2310	if (length == zc->length)
2311	zc->recv_skip_hint = `0`;
2312	} else {
2313	if (!zc->recv_skip_hint && sock_flag(sk, flag: SOCK_DONE))
2314	ret = -EIO;
2315	}
2316	zc->length = length;
2317	return ret;
2318	}
2319	#endif
2320
2321	/ Similar to __sock_recv_timestamp, but does not require an skb /
2322	void tcp_recv_timestamp(struct msghdr msg, const* struct sock *sk,
2323	struct scm_timestamping_internal *tss)
2324	{
2325	int new_tstamp = sock_flag(sk, flag: SOCK_TSTAMP_NEW);
2326	u32 tsflags = READ_ONCE(sk->sk_tsflags);
2327	bool has_timestamping = false;
2328
2329	if (tss->ts[`0`].tv_sec \|\| tss->ts[`0`].tv_nsec) {
2330	if (sock_flag(sk, flag: SOCK_RCVTSTAMP)) {
2331	if (sock_flag(sk, flag: SOCK_RCVTSTAMPNS)) {
2332	if (new_tstamp) {
2333	struct __kernel_timespec kts = {
2334	.tv_sec = tss->ts[`0`].tv_sec,
2335	.tv_nsec = tss->ts[`0`].tv_nsec,
2336	};
2337	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
2338	len: sizeof(kts), data: &kts);
2339	} else {
2340	struct __kernel_old_timespec ts_old = {
2341	.tv_sec = tss->ts[`0`].tv_sec,
2342	.tv_nsec = tss->ts[`0`].tv_nsec,
2343	};
2344	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
2345	len: sizeof(ts_old), data: &ts_old);
2346	}
2347	} else {
2348	if (new_tstamp) {
2349	struct __kernel_sock_timeval stv = {
2350	.tv_sec = tss->ts[`0`].tv_sec,
2351	.tv_usec = tss->ts[`0`].tv_nsec / `1000`,
2352	};
2353	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
2354	len: sizeof(stv), data: &stv);
2355	} else {
2356	struct __kernel_old_timeval tv = {
2357	.tv_sec = tss->ts[`0`].tv_sec,
2358	.tv_usec = tss->ts[`0`].tv_nsec / `1000`,
2359	};
2360	put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
2361	len: sizeof(tv), data: &tv);
2362	}
2363	}
2364	}
2365
2366	if (tsflags & SOF_TIMESTAMPING_SOFTWARE &&
2367	(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE \|\|
2368	!(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
2369	has_timestamping = true;
2370	else
2371	tss->ts[`0`] = (struct timespec64) {`0`};
2372	}
2373
2374	if (tss->ts[`2`].tv_sec \|\| tss->ts[`2`].tv_nsec) {
2375	if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
2376	(tsflags & SOF_TIMESTAMPING_RX_HARDWARE \|\|
2377	!(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
2378	has_timestamping = true;
2379	else
2380	tss->ts[`2`] = (struct timespec64) {`0`};
2381	}
2382
2383	if (has_timestamping) {
2384	tss->ts[`1`] = (struct timespec64) {`0`};
2385	if (sock_flag(sk, SOCK_TSTAMP_NEW))
2386	put_cmsg_scm_timestamping64(msg, tss);
2387	else
2388	put_cmsg_scm_timestamping(msg, tss);
2389	}
2390	}
2391
2392	static int tcp_inq_hint(struct sock *sk)
2393	{
2394	const struct tcp_sock *tp = tcp_sk(sk);
2395	u32 copied_seq = READ_ONCE(tp->copied_seq);
2396	u32 rcv_nxt = READ_ONCE(tp->rcv_nxt);
2397	int inq;
2398
2399	inq = rcv_nxt - copied_seq;
2400	if (unlikely(inq < `0` \|\| copied_seq != READ_ONCE(tp->copied_seq))) {
2401	lock_sock(sk);
2402	inq = tp->rcv_nxt - tp->copied_seq;
2403	release_sock(sk);
2404	}
2405	/ After receiving a FIN, tell the user-space to continue reading*
2406	* by returning a non-zero inq.
2407	*/
2408	if (inq == `0` && sock_flag(sk, flag: SOCK_DONE))
2409	inq = `1`;
2410	return inq;
2411	}
2412
2413	/ batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. /
2414	struct tcp_xa_pool {
2415	u8 max; / max <= MAX_SKB_FRAGS /
2416	u8 idx; / idx <= max /
2417	__u32 tokens[MAX_SKB_FRAGS];
2418	netmem_ref netmems[MAX_SKB_FRAGS];
2419	};
2420
2421	static void tcp_xa_pool_commit_locked(struct sock sk, struct* tcp_xa_pool *p)
2422	{
2423	int i;
2424
2425	/ Commit part that has been copied to user space. /
2426	for (i = `0`; i < p->idx; i++)
2427	__xa_cmpxchg(&sk->sk_user_frags, index: p->tokens[i], XA_ZERO_ENTRY,
2428	entry: (__force void *)p->netmems[i], GFP_KERNEL);
2429	/ Rollback what has been pre-allocated and is no longer needed. /
2430	for (; i < p->max; i++)
2431	__xa_erase(&sk->sk_user_frags, index: p->tokens[i]);
2432
2433	p->max = `0`;
2434	p->idx = `0`;
2435	}
2436
2437	static void tcp_xa_pool_commit(struct sock sk, struct* tcp_xa_pool *p)
2438	{
2439	if (!p->max)
2440	return;
2441
2442	xa_lock_bh(&sk->sk_user_frags);
2443
2444	tcp_xa_pool_commit_locked(sk, p);
2445
2446	xa_unlock_bh(&sk->sk_user_frags);
2447	}
2448
2449	static int tcp_xa_pool_refill(struct sock sk, struct* tcp_xa_pool *p,
2450	unsigned int max_frags)
2451	{
2452	int err, k;
2453
2454	if (p->idx < p->max)
2455	return `0`;
2456
2457	xa_lock_bh(&sk->sk_user_frags);
2458
2459	tcp_xa_pool_commit_locked(sk, p);
2460
2461	for (k = `0`; k < max_frags; k++) {
2462	err = __xa_alloc(&sk->sk_user_frags, id: &p->tokens[k],
2463	XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
2464	if (err)
2465	break;
2466	}
2467
2468	xa_unlock_bh(&sk->sk_user_frags);
2469
2470	p->max = k;
2471	p->idx = `0`;
2472	return k ? `0` : err;
2473	}
2474
2475	/ On error, returns the -errno. On success, returns number of bytes sent to the*
2476	* user. May not consume all of @remaining_len.
2477	*/
2478	static int tcp_recvmsg_dmabuf(struct sock sk, const* struct sk_buff *skb,
2479	unsigned int offset, struct msghdr *msg,
2480	int remaining_len)
2481	{
2482	struct dmabuf_cmsg dmabuf_cmsg = { `0` };
2483	struct tcp_xa_pool tcp_xa_pool;
2484	unsigned int start;
2485	int i, copy, n;
2486	int sent = `0`;
2487	int err = `0`;
2488
2489	tcp_xa_pool.max = `0`;
2490	tcp_xa_pool.idx = `0`;
2491	do {
2492	start = skb_headlen(skb);
2493
2494	if (skb_frags_readable(skb)) {
2495	err = -ENODEV;
2496	goto out;
2497	}
2498
2499	/ Copy header. /
2500	copy = start - offset;
2501	if (copy > `0`) {
2502	copy = min(copy, remaining_len);
2503
2504	n = copy_to_iter(addr: skb->data + offset, bytes: copy,
2505	i: &msg->msg_iter);
2506	if (n != copy) {
2507	err = -EFAULT;
2508	goto out;
2509	}
2510
2511	offset += copy;
2512	remaining_len -= copy;
2513
2514	/ First a dmabuf_cmsg for # bytes copied to user*
2515	* buffer.
2516	*/
2517	memset(&dmabuf_cmsg, `0`, sizeof(dmabuf_cmsg));
2518	dmabuf_cmsg.frag_size = copy;
2519	err = put_cmsg_notrunc(msg, SOL_SOCKET,
2520	SO_DEVMEM_LINEAR,
2521	len: sizeof(dmabuf_cmsg),
2522	data: &dmabuf_cmsg);
2523	if (err)
2524	goto out;
2525
2526	sent += copy;
2527
2528	if (remaining_len == `0`)
2529	goto out;
2530	}
2531
2532	/ after that, send information of dmabuf pages through a*
2533	* sequence of cmsg
2534	*/
2535	for (i = `0`; i < skb_shinfo(skb)->nr_frags; i++) {
2536	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2537	struct net_iov *niov;
2538	u64 frag_offset;
2539	int end;
2540
2541	/ !skb_frags_readable() should indicate that ALL the*
2542	* frags in this skb are dmabuf net_iovs. We're checking
2543	* for that flag above, but also check individual frags
2544	* here. If the tcp stack is not setting
2545	* skb_frags_readable() correctly, we still don't want
2546	* to crash here.
2547	*/
2548	if (!skb_frag_net_iov(frag)) {
2549	net_err_ratelimited("Found non-dmabuf skb with net_iov");
2550	err = -ENODEV;
2551	goto out;
2552	}
2553
2554	niov = skb_frag_net_iov(frag);
2555	if (!net_is_devmem_iov(niov)) {
2556	err = -ENODEV;
2557	goto out;
2558	}
2559
2560	end = start + skb_frag_size(frag);
2561	copy = end - offset;
2562
2563	if (copy > `0`) {
2564	copy = min(copy, remaining_len);
2565
2566	frag_offset = net_iov_virtual_addr(niov) +
2567	skb_frag_off(frag) + offset -
2568	start;
2569	dmabuf_cmsg.frag_offset = frag_offset;
2570	dmabuf_cmsg.frag_size = copy;
2571	err = tcp_xa_pool_refill(sk, p: &tcp_xa_pool,
2572	skb_shinfo(skb)->nr_frags - i);
2573	if (err)
2574	goto out;
2575
2576	/ Will perform the exchange later /
2577	dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
2578	dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
2579
2580	offset += copy;
2581	remaining_len -= copy;
2582
2583	err = put_cmsg_notrunc(msg, SOL_SOCKET,
2584	SO_DEVMEM_DMABUF,
2585	len: sizeof(dmabuf_cmsg),
2586	data: &dmabuf_cmsg);
2587	if (err)
2588	goto out;
2589
2590	atomic_long_inc(v: &niov->desc.pp_ref_count);
2591	tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
2592
2593	sent += copy;
2594
2595	if (remaining_len == `0`)
2596	goto out;
2597	}
2598	start = end;
2599	}
2600
2601	tcp_xa_pool_commit(sk, p: &tcp_xa_pool);
2602	if (!remaining_len)
2603	goto out;
2604
2605	/ if remaining_len is not satisfied yet, we need to go to the*
2606	* next frag in the frag_list to satisfy remaining_len.
2607	*/
2608	skb = skb_shinfo(skb)->frag_list ?: skb->next;
2609
2610	offset = offset - start;
2611	} while (skb);
2612
2613	if (remaining_len) {
2614	err = -EFAULT;
2615	goto out;
2616	}
2617
2618	out:
2619	tcp_xa_pool_commit(sk, p: &tcp_xa_pool);
2620	if (!sent)
2621	sent = err;
2622
2623	return sent;
2624	}
2625
2626	/*
2627	* This routine copies from a sock struct into the user buffer.
2628	*
2629	* Technical note: in 2.3 we work on _locked_ socket, so that
2630	* tricks with *seq access order and skb->users are not required.
2631	* Probably, code can be easily improved even more.
2632	*/
2633
2634	static int tcp_recvmsg_locked(struct sock sk, struct* msghdr *msg, size_t len,
2635	int flags, struct scm_timestamping_internal *tss,
2636	int *cmsg_flags)
2637	{
2638	struct tcp_sock *tp = tcp_sk(sk);
2639	int last_copied_dmabuf = -`1`; / uninitialized /
2640	int copied = `0`;
2641	u32 peek_seq;
2642	u32 *seq;
2643	unsigned long used;
2644	int err;
2645	int target; / Read at least this many bytes /
2646	long timeo;
2647	struct sk_buff skb, last;
2648	u32 peek_offset = `0`;
2649	u32 urg_hole = `0`;
2650
2651	err = -ENOTCONN;
2652	if (sk->sk_state == TCP_LISTEN)
2653	goto out;
2654
2655	if (tp->recvmsg_inq)
2656	*cmsg_flags = TCP_CMSG_INQ;
2657	timeo = sock_rcvtimeo(sk, noblock: flags & MSG_DONTWAIT);
2658
2659	/ Urgent data needs to be handled specially. /
2660	if (flags & MSG_OOB)
2661	goto recv_urg;
2662
2663	if (unlikely(tp->repair)) {
2664	err = -EPERM;
2665	if (!(flags & MSG_PEEK))
2666	goto out;
2667
2668	if (tp->repair_queue == TCP_SEND_QUEUE)
2669	goto recv_sndq;
2670
2671	err = -EINVAL;
2672	if (tp->repair_queue == TCP_NO_QUEUE)
2673	goto out;
2674
2675	/ 'common' recv queue MSG_PEEK-ing /
2676	}
2677
2678	seq = &tp->copied_seq;
2679	if (flags & MSG_PEEK) {
2680	peek_offset = max(sk_peek_offset(sk, flags), `0`);
2681	peek_seq = tp->copied_seq + peek_offset;
2682	seq = &peek_seq;
2683	}
2684
2685	target = sock_rcvlowat(sk, waitall: flags & MSG_WAITALL, len);
2686
2687	do {
2688	u32 offset;
2689
2690	/ Are we at urgent data? Stop if we have read anything or have SIGURG pending. /
2691	if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
2692	if (copied)
2693	break;
2694	if (signal_pending(current)) {
2695	copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
2696	break;
2697	}
2698	}
2699
2700	/ Next get a buffer. /
2701
2702	last = skb_peek_tail(list_: &sk->sk_receive_queue);
2703	skb_queue_walk(&sk->sk_receive_queue, skb) {
2704	last = skb;
2705	/ Now that we have two receive queues this*
2706	* shouldn't happen.
2707	*/
2708	if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
2709	"TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
2710	*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
2711	flags))
2712	break;
2713
2714	offset = *seq - TCP_SKB_CB(skb)->seq;
2715	if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
2716	pr_err_once("%s: found a SYN, please report !\n", __func__);
2717	offset--;
2718	}
2719	if (offset < skb->len)
2720	goto found_ok_skb;
2721	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2722	goto found_fin_ok;
2723	WARN(!(flags & MSG_PEEK),
2724	"TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
2725	*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
2726	}
2727
2728	/ Well, if we have backlog, try to process it now yet. /
2729
2730	if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
2731	break;
2732
2733	if (copied) {
2734	if (!timeo \|\|
2735	sk->sk_err \|\|
2736	sk->sk_state == TCP_CLOSE \|\|
2737	(sk->sk_shutdown & RCV_SHUTDOWN) \|\|
2738	signal_pending(current))
2739	break;
2740	} else {
2741	if (sock_flag(sk, flag: SOCK_DONE))
2742	break;
2743
2744	if (sk->sk_err) {
2745	copied = sock_error(sk);
2746	break;
2747	}
2748
2749	if (sk->sk_shutdown & RCV_SHUTDOWN)
2750	break;
2751
2752	if (sk->sk_state == TCP_CLOSE) {
2753	/ This occurs when user tries to read*
2754	* from never connected socket.
2755	*/
2756	copied = -ENOTCONN;
2757	break;
2758	}
2759
2760	if (!timeo) {
2761	copied = -EAGAIN;
2762	break;
2763	}
2764
2765	if (signal_pending(current)) {
2766	copied = sock_intr_errno(timeo);
2767	break;
2768	}
2769	}
2770
2771	if (copied >= target) {
2772	/ Do not sleep, just process backlog. /
2773	__sk_flush_backlog(sk);
2774	} else {
2775	tcp_cleanup_rbuf(sk, copied);
2776	err = sk_wait_data(sk, timeo: &timeo, skb: last);
2777	if (err < `0`) {
2778	err = copied ? : err;
2779	goto out;
2780	}
2781	}
2782
2783	if ((flags & MSG_PEEK) &&
2784	(peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) {
2785	net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
2786	current->comm,
2787	task_pid_nr(current));
2788	peek_seq = tp->copied_seq + peek_offset;
2789	}
2790	continue;
2791
2792	found_ok_skb:
2793	/ Ok so how much can we use? /
2794	used = skb->len - offset;
2795	if (len < used)
2796	used = len;
2797
2798	/ Do we have urgent data here? /
2799	if (unlikely(tp->urg_data)) {
2800	u32 urg_offset = tp->urg_seq - *seq;
2801	if (urg_offset < used) {
2802	if (!urg_offset) {
2803	if (!sock_flag(sk, flag: SOCK_URGINLINE)) {
2804	WRITE_ONCE(seq, seq + `1`);
2805	urg_hole++;
2806	offset++;
2807	used--;
2808	if (!used)
2809	goto skip_copy;
2810	}
2811	} else
2812	used = urg_offset;
2813	}
2814	}
2815
2816	if (!(flags & MSG_TRUNC)) {
2817	if (last_copied_dmabuf != -`1` &&
2818	last_copied_dmabuf != !skb_frags_readable(skb))
2819	break;
2820
2821	if (skb_frags_readable(skb)) {
2822	err = skb_copy_datagram_msg(from: skb, offset, msg,
2823	size: used);
2824	if (err) {
2825	/ Exception. Bailout! /
2826	if (!copied)
2827	copied = -EFAULT;
2828	break;
2829	}
2830	} else {
2831	if (!(flags & MSG_SOCK_DEVMEM)) {
2832	/ dmabuf skbs can only be received*
2833	* with the MSG_SOCK_DEVMEM flag.
2834	*/
2835	if (!copied)
2836	copied = -EFAULT;
2837
2838	break;
2839	}
2840
2841	err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
2842	remaining_len: used);
2843	if (err < `0`) {
2844	if (!copied)
2845	copied = err;
2846
2847	break;
2848	}
2849	used = err;
2850	}
2851	}
2852
2853	last_copied_dmabuf = !skb_frags_readable(skb);
2854
2855	WRITE_ONCE(seq, seq + used);
2856	copied += used;
2857	len -= used;
2858	if (flags & MSG_PEEK)
2859	sk_peek_offset_fwd(sk, val: used);
2860	else
2861	sk_peek_offset_bwd(sk, val: used);
2862	tcp_rcv_space_adjust(sk);
2863
2864	skip_copy:
2865	if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
2866	WRITE_ONCE(tp->urg_data, `0`);
2867	tcp_fast_path_check(sk);
2868	}
2869
2870	if (TCP_SKB_CB(skb)->has_rxtstamp) {
2871	tcp_update_recv_tstamps(skb, tss);
2872	*cmsg_flags \|= TCP_CMSG_TS;
2873	}
2874
2875	if (used + offset < skb->len)
2876	continue;
2877
2878	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
2879	goto found_fin_ok;
2880	if (!(flags & MSG_PEEK))
2881	tcp_eat_recv_skb(sk, skb);
2882	continue;
2883
2884	found_fin_ok:
2885	/ Process the FIN. /
2886	WRITE_ONCE(seq, seq + `1`);
2887	if (!(flags & MSG_PEEK))
2888	tcp_eat_recv_skb(sk, skb);
2889	break;
2890	} while (len > `0`);
2891
2892	/ According to UNIX98, msg_name/msg_namelen are ignored*
2893	* on connected socket. I was just happy when found this 8) --ANK
2894	*/
2895
2896	/ Clean up data we have read: This will do ACK frames. /
2897	tcp_cleanup_rbuf(sk, copied);
2898	return copied;
2899
2900	out:
2901	return err;
2902
2903	recv_urg:
2904	err = tcp_recv_urg(sk, msg, len, flags);
2905	goto out;
2906
2907	recv_sndq:
2908	err = tcp_peek_sndq(sk, msg, len);
2909	goto out;
2910	}
2911
2912	int tcp_recvmsg(struct sock sk, struct* msghdr msg, size_t len, int* flags,
2913	int *addr_len)
2914	{
2915	int cmsg_flags = `0`, ret;
2916	struct scm_timestamping_internal tss;
2917
2918	if (unlikely(flags & MSG_ERRQUEUE))
2919	return inet_recv_error(sk, msg, len, addr_len);
2920
2921	if (sk_can_busy_loop(sk) &&
2922	skb_queue_empty_lockless(list: &sk->sk_receive_queue) &&
2923	sk->sk_state == TCP_ESTABLISHED)
2924	sk_busy_loop(sk, nonblock: flags & MSG_DONTWAIT);
2925
2926	lock_sock(sk);
2927	ret = tcp_recvmsg_locked(sk, msg, len, flags, tss: &tss, cmsg_flags: &cmsg_flags);
2928	release_sock(sk);
2929
2930	if ((cmsg_flags \| msg->msg_get_inq) && ret >= `0`) {
2931	if (cmsg_flags & TCP_CMSG_TS)
2932	tcp_recv_timestamp(msg, sk, tss: &tss);
2933	if ((cmsg_flags & TCP_CMSG_INQ) \| msg->msg_get_inq) {
2934	msg->msg_inq = tcp_inq_hint(sk);
2935	if (cmsg_flags & TCP_CMSG_INQ)
2936	put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
2937	len: sizeof(msg->msg_inq), data: &msg->msg_inq);
2938	}
2939	}
2940	return ret;
2941	}
2942	EXPORT_IPV6_MOD(tcp_recvmsg);
2943
2944	void tcp_set_state(struct sock sk, int* state)
2945	{
2946	int oldstate = sk->sk_state;
2947
2948	/ We defined a new enum for TCP states that are exported in BPF*
2949	* so as not force the internal TCP states to be frozen. The
2950	* following checks will detect if an internal state value ever
2951	* differs from the BPF value. If this ever happens, then we will
2952	* need to remap the internal value to the BPF value before calling
2953	* tcp_call_bpf_2arg.
2954	*/
2955	BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
2956	BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
2957	BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
2958	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
2959	BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
2960	BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
2961	BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
2962	BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
2963	BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
2964	BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
2965	BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
2966	BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
2967	BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE);
2968	BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
2969
2970	/ bpf uapi header bpf.h defines an anonymous enum with values*
2971	* BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
2972	* is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
2973	* But clang built vmlinux does not have this enum in DWARF
2974	* since clang removes the above code before generating IR/debuginfo.
2975	* Let us explicitly emit the type debuginfo to ensure the
2976	* above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
2977	* regardless of which compiler is used.
2978	*/
2979	BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
2980
2981	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
2982	tcp_call_bpf_2arg(sk, op: BPF_SOCK_OPS_STATE_CB, arg1: oldstate, arg2: state);
2983
2984	switch (state) {
2985	case TCP_ESTABLISHED:
2986	if (oldstate != TCP_ESTABLISHED)
2987	TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2988	break;
2989	case TCP_CLOSE_WAIT:
2990	if (oldstate == TCP_SYN_RECV)
2991	TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
2992	break;
2993
2994	case TCP_CLOSE:
2995	if (oldstate == TCP_CLOSE_WAIT \|\| oldstate == TCP_ESTABLISHED)
2996	TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS);
2997
2998	sk->sk_prot->unhash(sk);
2999	if (inet_csk(sk)->icsk_bind_hash &&
3000	!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
3001	inet_put_port(sk);
3002	fallthrough;
3003	default:
3004	if (oldstate == TCP_ESTABLISHED \|\| oldstate == TCP_CLOSE_WAIT)
3005	TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
3006	}
3007
3008	/ Change state AFTER socket is unhashed to avoid closed*
3009	* socket sitting in hash tables.
3010	*/
3011	inet_sk_state_store(sk, newstate: state);
3012	}
3013	EXPORT_SYMBOL_GPL(tcp_set_state);
3014
3015	/*
3016	* State processing on a close. This implements the state shift for
3017	* sending our FIN frame. Note that we only send a FIN for some
3018	* states. A shutdown() may have already sent the FIN, or we may be
3019	* closed.
3020	*/
3021
3022	static const unsigned char new_state[`16`] = {
3023	/ current state: new state: action: /
3024	[`0` / (Invalid) /] = TCP_CLOSE,
3025	[TCP_ESTABLISHED] = TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
3026	[TCP_SYN_SENT] = TCP_CLOSE,
3027	[TCP_SYN_RECV] = TCP_FIN_WAIT1 \| TCP_ACTION_FIN,
3028	[TCP_FIN_WAIT1] = TCP_FIN_WAIT1,
3029	[TCP_FIN_WAIT2] = TCP_FIN_WAIT2,
3030	[TCP_TIME_WAIT] = TCP_CLOSE,
3031	[TCP_CLOSE] = TCP_CLOSE,
3032	[TCP_CLOSE_WAIT] = TCP_LAST_ACK \| TCP_ACTION_FIN,
3033	[TCP_LAST_ACK] = TCP_LAST_ACK,
3034	[TCP_LISTEN] = TCP_CLOSE,
3035	[TCP_CLOSING] = TCP_CLOSING,
3036	[TCP_NEW_SYN_RECV] = TCP_CLOSE, / should not happen ! /
3037	};
3038
3039	static int tcp_close_state(struct sock *sk)
3040	{
3041	int next = (int)new_state[sk->sk_state];
3042	int ns = next & TCP_STATE_MASK;
3043
3044	tcp_set_state(sk, ns);
3045
3046	return next & TCP_ACTION_FIN;
3047	}
3048
3049	/*
3050	* Shutdown the sending side of a connection. Much like close except
3051	* that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD).
3052	*/
3053
3054	void tcp_shutdown(struct sock sk, int* how)
3055	{
3056	/ We need to grab some memory, and put together a FIN,*
3057	* and then put it into the queue to be sent.
3058	* Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
3059	*/
3060	if (!(how & SEND_SHUTDOWN))
3061	return;
3062
3063	/ If we've already sent a FIN, or it's a closed state, skip this. /
3064	if ((`1` << sk->sk_state) &
3065	(TCPF_ESTABLISHED \| TCPF_SYN_SENT \|
3066	TCPF_CLOSE_WAIT)) {
3067	/ Clear out any half completed packets. FIN if needed. /
3068	if (tcp_close_state(sk))
3069	tcp_send_fin(sk);
3070	}
3071	}
3072	EXPORT_IPV6_MOD(tcp_shutdown);
3073
3074	int tcp_orphan_count_sum(void)
3075	{
3076	int i, total = `0`;
3077
3078	for_each_possible_cpu(i)
3079	total += per_cpu(tcp_orphan_count, i);
3080
3081	return max(total, `0`);
3082	}
3083
3084	static int tcp_orphan_cache;
3085	static struct timer_list tcp_orphan_timer;
3086	#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
3087
3088	static void tcp_orphan_update(struct timer_list *unused)
3089	{
3090	WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
3091	mod_timer(timer: &tcp_orphan_timer, expires: jiffies + TCP_ORPHAN_TIMER_PERIOD);
3092	}
3093
3094	static bool tcp_too_many_orphans(int shift)
3095	{
3096	return READ_ONCE(tcp_orphan_cache) << shift >
3097	READ_ONCE(sysctl_tcp_max_orphans);
3098	}
3099
3100	static bool tcp_out_of_memory(const struct sock *sk)
3101	{
3102	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
3103	sk_memory_allocated(sk) > sk_prot_mem_limits(sk, index: `2`))
3104	return true;
3105	return false;
3106	}
3107
3108	bool tcp_check_oom(const struct sock sk, int* shift)
3109	{
3110	bool too_many_orphans, out_of_socket_memory;
3111
3112	too_many_orphans = tcp_too_many_orphans(shift);
3113	out_of_socket_memory = tcp_out_of_memory(sk);
3114
3115	if (too_many_orphans)
3116	net_info_ratelimited("too many orphaned sockets\n");
3117	if (out_of_socket_memory)
3118	net_info_ratelimited("out of memory -- consider tuning tcp_mem\n");
3119	return too_many_orphans \|\| out_of_socket_memory;
3120	}
3121
3122	void __tcp_close(struct sock sk, long* timeout)
3123	{
3124	bool data_was_unread = false;
3125	struct sk_buff *skb;
3126	int state;
3127
3128	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
3129
3130	if (sk->sk_state == TCP_LISTEN) {
3131	tcp_set_state(sk, TCP_CLOSE);
3132
3133	/ Special case. /
3134	inet_csk_listen_stop(sk);
3135
3136	goto adjudge_to_death;
3137	}
3138
3139	/ We need to flush the recv. buffs. We do this only on the*
3140	* descriptor close, not protocol-sourced closes, because the
3141	* reader process may not have drained the data yet!
3142	*/
3143	while ((skb = skb_peek(list_: &sk->sk_receive_queue)) != NULL) {
3144	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
3145
3146	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
3147	end_seq--;
3148	if (after(end_seq, tcp_sk(sk)->copied_seq))
3149	data_was_unread = true;
3150	tcp_eat_recv_skb(sk, skb);
3151	}
3152
3153	/ If socket has been already reset (e.g. in tcp_reset()) - kill it. /
3154	if (sk->sk_state == TCP_CLOSE)
3155	goto adjudge_to_death;
3156
3157	/ As outlined in RFC 2525, section 2.17, we send a RST here because*
3158	* data was lost. To witness the awful effects of the old behavior of
3159	* always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
3160	* GET in an FTP client, suspend the process, wait for the client to
3161	* advertise a zero window, then kill -9 the FTP client, wheee...
3162	* Note: timeout is always zero in such a case.
3163	*/
3164	if (unlikely(tcp_sk(sk)->repair)) {
3165	sk->sk_prot->disconnect(sk, `0`);
3166	} else if (data_was_unread) {
3167	/ Unread data was tossed, zap the connection. /
3168	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
3169	tcp_set_state(sk, TCP_CLOSE);
3170	tcp_send_active_reset(sk, priority: sk->sk_allocation,
3171	reason: SK_RST_REASON_TCP_ABORT_ON_CLOSE);
3172	} else if (sock_flag(sk, flag: SOCK_LINGER) && !sk->sk_lingertime) {
3173	/ Check zero linger _after_ checking for unread data. /
3174	sk->sk_prot->disconnect(sk, `0`);
3175	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
3176	} else if (tcp_close_state(sk)) {
3177	/ We FIN if the application ate all the data before*
3178	* zapping the connection.
3179	*/
3180
3181	/ RED-PEN. Formally speaking, we have broken TCP state*
3182	* machine. State transitions:
3183	*
3184	* TCP_ESTABLISHED -> TCP_FIN_WAIT1
3185	* TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult)
3186	* TCP_CLOSE_WAIT -> TCP_LAST_ACK
3187	*
3188	* are legal only when FIN has been sent (i.e. in window),
3189	* rather than queued out of window. Purists blame.
3190	*
3191	* F.e. "RFC state" is ESTABLISHED,
3192	* if Linux state is FIN-WAIT-1, but FIN is still not sent.
3193	*
3194	* The visible declinations are that sometimes
3195	* we enter time-wait state, when it is not required really
3196	* (harmless), do not send active resets, when they are
3197	* required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
3198	* they look as CLOSING or LAST_ACK for Linux)
3199	* Probably, I missed some more holelets.
3200	* --ANK
3201	* XXX (TFO) - To start off we don't support SYN+ACK+FIN
3202	* in a single packet! (May consider it later but will
3203	* probably need API support or TCP_CORK SYN-ACK until
3204	* data is written and socket is closed.)
3205	*/
3206	tcp_send_fin(sk);
3207	}
3208
3209	sk_stream_wait_close(sk, timeo_p: timeout);
3210
3211	adjudge_to_death:
3212	state = sk->sk_state;
3213	sock_hold(sk);
3214	sock_orphan(sk);
3215
3216	local_bh_disable();
3217	bh_lock_sock(sk);
3218	/ remove backlog if any, without releasing ownership. /
3219	__release_sock(sk);
3220
3221	tcp_orphan_count_inc();
3222
3223	/ Have we already been destroyed by a softirq or backlog? /
3224	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
3225	goto out;
3226
3227	/ This is a (useful) BSD violating of the RFC. There is a*
3228	* problem with TCP as specified in that the other end could
3229	* keep a socket open forever with no application left this end.
3230	* We use a 1 minute timeout (about the same as BSD) then kill
3231	* our end. If they send after that then tough - BUT: long enough
3232	* that we won't make the old 4*rto = almost no time - whoops
3233	* reset mistake.
3234	*
3235	* Nope, it was not mistake. It is really desired behaviour
3236	* f.e. on http servers, when such sockets are useless, but
3237	* consume significant resources. Let's do it with special
3238	* linger2 option. --ANK
3239	*/
3240
3241	if (sk->sk_state == TCP_FIN_WAIT2) {
3242	struct tcp_sock *tp = tcp_sk(sk);
3243	if (READ_ONCE(tp->linger2) < `0`) {
3244	tcp_set_state(sk, TCP_CLOSE);
3245	tcp_send_active_reset(sk, GFP_ATOMIC,
3246	reason: SK_RST_REASON_TCP_ABORT_ON_LINGER);
3247	__NET_INC_STATS(sock_net(sk),
3248	LINUX_MIB_TCPABORTONLINGER);
3249	} else {
3250	const int tmo = tcp_fin_time(sk);
3251
3252	if (tmo > TCP_TIMEWAIT_LEN) {
3253	tcp_reset_keepalive_timer(sk,
3254	timeout: tmo - TCP_TIMEWAIT_LEN);
3255	} else {
3256	tcp_time_wait(sk, state: TCP_FIN_WAIT2, timeo: tmo);
3257	goto out;
3258	}
3259	}
3260	}
3261	if (sk->sk_state != TCP_CLOSE) {
3262	if (tcp_check_oom(sk, shift: `0`)) {
3263	tcp_set_state(sk, TCP_CLOSE);
3264	tcp_send_active_reset(sk, GFP_ATOMIC,
3265	reason: SK_RST_REASON_TCP_ABORT_ON_MEMORY);
3266	__NET_INC_STATS(sock_net(sk),
3267	LINUX_MIB_TCPABORTONMEMORY);
3268	} else if (!check_net(net: sock_net(sk))) {
3269	/ Not possible to send reset; just close /
3270	tcp_set_state(sk, TCP_CLOSE);
3271	}
3272	}
3273
3274	if (sk->sk_state == TCP_CLOSE) {
3275	struct request_sock *req;
3276
3277	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
3278	lockdep_sock_is_held(sk));
3279	/ We could get here with a non-NULL req if the socket is*
3280	* aborted (e.g., closed with unread data) before 3WHS
3281	* finishes.
3282	*/
3283	if (req)
3284	reqsk_fastopen_remove(sk, req, reset: false);
3285	inet_csk_destroy_sock(sk);
3286	}
3287	/ Otherwise, socket is reprieved until protocol close. /
3288
3289	out:
3290	bh_unlock_sock(sk);
3291	local_bh_enable();
3292	}
3293
3294	void tcp_close(struct sock sk, long* timeout)
3295	{
3296	lock_sock(sk);
3297	__tcp_close(sk, timeout);
3298	release_sock(sk);
3299	if (!sk->sk_net_refcnt)
3300	inet_csk_clear_xmit_timers_sync(sk);
3301	sock_put(sk);
3302	}
3303	EXPORT_SYMBOL(tcp_close);
3304
3305	/ These states need RST on ABORT according to RFC793 /
3306
3307	static inline bool tcp_need_reset(int state)
3308	{
3309	return (`1` << state) &
3310	(TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT \| TCPF_FIN_WAIT1 \|
3311	TCPF_FIN_WAIT2 \| TCPF_SYN_RECV);
3312	}
3313
3314	static void tcp_rtx_queue_purge(struct sock *sk)
3315	{
3316	struct rb_node *p = rb_first(root: &sk->tcp_rtx_queue);
3317
3318	tcp_sk(sk)->highest_sack = NULL;
3319	while (p) {
3320	struct sk_buff *skb = rb_to_skb(p);
3321
3322	p = rb_next(p);
3323	/ Since we are deleting whole queue, no need to*
3324	* list_del(&skb->tcp_tsorted_anchor)
3325	*/
3326	tcp_rtx_queue_unlink(skb, sk);
3327	tcp_wmem_free_skb(sk, skb);
3328	}
3329	}
3330
3331	void tcp_write_queue_purge(struct sock *sk)
3332	{
3333	struct sk_buff *skb;
3334
3335	tcp_chrono_stop(sk, type: TCP_CHRONO_BUSY);
3336	while ((skb = __skb_dequeue(list: &sk->sk_write_queue)) != NULL) {
3337	tcp_skb_tsorted_anchor_cleanup(skb);
3338	tcp_wmem_free_skb(sk, skb);
3339	}
3340	tcp_rtx_queue_purge(sk);
3341	INIT_LIST_HEAD(list: &tcp_sk(sk)->tsorted_sent_queue);
3342	tcp_clear_all_retrans_hints(tcp_sk(sk));
3343	tcp_sk(sk)->packets_out = `0`;
3344	inet_csk(sk)->icsk_backoff = `0`;
3345	}
3346
3347	int tcp_disconnect(struct sock sk, int* flags)
3348	{
3349	struct inet_sock *inet = inet_sk(sk);
3350	struct inet_connection_sock *icsk = inet_csk(sk);
3351	struct tcp_sock *tp = tcp_sk(sk);
3352	int old_state = sk->sk_state;
3353	struct request_sock *req;
3354	u32 seq;
3355
3356	if (old_state != TCP_CLOSE)
3357	tcp_set_state(sk, TCP_CLOSE);
3358
3359	/ ABORT function of RFC793 /
3360	if (old_state == TCP_LISTEN) {
3361	inet_csk_listen_stop(sk);
3362	} else if (unlikely(tp->repair)) {
3363	WRITE_ONCE(sk->sk_err, ECONNABORTED);
3364	} else if (tcp_need_reset(state: old_state)) {
3365	tcp_send_active_reset(sk, priority: gfp_any(), reason: SK_RST_REASON_TCP_STATE);
3366	WRITE_ONCE(sk->sk_err, ECONNRESET);
3367	} else if (tp->snd_nxt != tp->write_seq &&
3368	(`1` << old_state) & (TCPF_CLOSING \| TCPF_LAST_ACK)) {
3369	/ The last check adjusts for discrepancy of Linux wrt. RFC*
3370	* states
3371	*/
3372	tcp_send_active_reset(sk, priority: gfp_any(),
3373	reason: SK_RST_REASON_TCP_DISCONNECT_WITH_DATA);
3374	WRITE_ONCE(sk->sk_err, ECONNRESET);
3375	} else if (old_state == TCP_SYN_SENT)
3376	WRITE_ONCE(sk->sk_err, ECONNRESET);
3377
3378	tcp_clear_xmit_timers(sk);
3379	__skb_queue_purge(list: &sk->sk_receive_queue);
3380	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3381	WRITE_ONCE(tp->urg_data, `0`);
3382	sk_set_peek_off(sk, val: -`1`);
3383	tcp_write_queue_purge(sk);
3384	tcp_fastopen_active_disable_ofo_check(sk);
3385	skb_rbtree_purge(root: &tp->out_of_order_queue);
3386
3387	inet->inet_dport = `0`;
3388
3389	inet_bhash2_reset_saddr(sk);
3390
3391	WRITE_ONCE(sk->sk_shutdown, `0`);
3392	sock_reset_flag(sk, flag: SOCK_DONE);
3393	tp->srtt_us = `0`;
3394	tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
3395	tp->rcv_rtt_last_tsecr = `0`;
3396
3397	seq = tp->write_seq + tp->max_window + `2`;
3398	if (!seq)
3399	seq = `1`;
3400	WRITE_ONCE(tp->write_seq, seq);
3401
3402	icsk->icsk_backoff = `0`;
3403	WRITE_ONCE(icsk->icsk_probes_out, `0`);
3404	icsk->icsk_probes_tstamp = `0`;
3405	icsk->icsk_rto = TCP_TIMEOUT_INIT;
3406	WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
3407	WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
3408	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
3409	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
3410	tp->snd_cwnd_cnt = `0`;
3411	tp->is_cwnd_limited = `0`;
3412	tp->max_packets_out = `0`;
3413	tp->window_clamp = `0`;
3414	tp->delivered = `0`;
3415	tp->delivered_ce = `0`;
3416	tp->accecn_fail_mode = `0`;
3417	tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
3418	tcp_accecn_init_counters(tp);
3419	tp->prev_ecnfield = `0`;
3420	tp->accecn_opt_tstamp = `0`;
3421	if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
3422	icsk->icsk_ca_ops->release(sk);
3423	memset(icsk->icsk_ca_priv, `0`, sizeof(icsk->icsk_ca_priv));
3424	icsk->icsk_ca_initialized = `0`;
3425	tcp_set_ca_state(sk, ca_state: TCP_CA_Open);
3426	tp->is_sack_reneg = `0`;
3427	tcp_clear_retrans(tp);
3428	tp->total_retrans = `0`;
3429	inet_csk_delack_init(sk);
3430	/ Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0*
3431	* issue in __tcp_select_window()
3432	*/
3433	icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
3434	memset(&tp->rx_opt, `0`, sizeof(tp->rx_opt));
3435	__sk_dst_reset(sk);
3436	dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL)));
3437	tcp_saved_syn_free(tp);
3438	tp->compressed_ack = `0`;
3439	tp->segs_in = `0`;
3440	tp->segs_out = `0`;
3441	tp->bytes_sent = `0`;
3442	tp->bytes_acked = `0`;
3443	tp->bytes_received = `0`;
3444	tp->bytes_retrans = `0`;
3445	tp->data_segs_in = `0`;
3446	tp->data_segs_out = `0`;
3447	tp->duplicate_sack[`0`].start_seq = `0`;
3448	tp->duplicate_sack[`0`].end_seq = `0`;
3449	tp->dsack_dups = `0`;
3450	tp->reord_seen = `0`;
3451	tp->retrans_out = `0`;
3452	tp->sacked_out = `0`;
3453	tp->tlp_high_seq = `0`;
3454	tp->last_oow_ack_time = `0`;
3455	tp->plb_rehash = `0`;
3456	/ There's a bubble in the pipe until at least the first ACK. /
3457	tp->app_limited = ~`0U`;
3458	tp->rate_app_limited = `1`;
3459	tp->rack.mstamp = `0`;
3460	tp->rack.advanced = `0`;
3461	tp->rack.reo_wnd_steps = `1`;
3462	tp->rack.last_delivered = `0`;
3463	tp->rack.reo_wnd_persist = `0`;
3464	tp->rack.dsack_seen = `0`;
3465	tp->syn_data_acked = `0`;
3466	tp->syn_fastopen_child = `0`;
3467	tp->rx_opt.saw_tstamp = `0`;
3468	tp->rx_opt.dsack = `0`;
3469	tp->rx_opt.num_sacks = `0`;
3470	tp->rcv_ooopack = `0`;
3471
3472
3473	/ Clean up fastopen related fields /
3474	req = rcu_dereference_protected(tp->fastopen_rsk,
3475	lockdep_sock_is_held(sk));
3476	if (req)
3477	reqsk_fastopen_remove(sk, req, reset: false);
3478	tcp_free_fastopen_req(tp);
3479	inet_clear_bit(DEFER_CONNECT, sk);
3480	tp->fastopen_client_fail = `0`;
3481
3482	WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
3483
3484	if (sk->sk_frag.page) {
3485	put_page(page: sk->sk_frag.page);
3486	sk->sk_frag.page = NULL;
3487	sk->sk_frag.offset = `0`;
3488	}
3489	sk_error_report(sk);
3490	return `0`;
3491	}
3492	EXPORT_SYMBOL(tcp_disconnect);
3493
3494	static inline bool tcp_can_repair_sock(const struct sock *sk)
3495	{
3496	return sockopt_ns_capable(ns: sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
3497	(sk->sk_state != TCP_LISTEN);
3498	}
3499
3500	static int tcp_repair_set_window(struct tcp_sock tp, sockptr_t optbuf, int* len)
3501	{
3502	struct tcp_repair_window opt;
3503
3504	if (!tp->repair)
3505	return -EPERM;
3506
3507	if (len != sizeof(opt))
3508	return -EINVAL;
3509
3510	if (copy_from_sockptr(dst: &opt, src: optbuf, size: sizeof(opt)))
3511	return -EFAULT;
3512
3513	if (opt.max_window < opt.snd_wnd)
3514	return -EINVAL;
3515
3516	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
3517	return -EINVAL;
3518
3519	if (after(opt.rcv_wup, tp->rcv_nxt))
3520	return -EINVAL;
3521
3522	tp->snd_wl1 = opt.snd_wl1;
3523	tp->snd_wnd = opt.snd_wnd;
3524	tp->max_window = opt.max_window;
3525
3526	tp->rcv_wnd = opt.rcv_wnd;
3527	tp->rcv_wup = opt.rcv_wup;
3528
3529	return `0`;
3530	}
3531
3532	static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
3533	unsigned int len)
3534	{
3535	struct tcp_sock *tp = tcp_sk(sk);
3536	struct tcp_repair_opt opt;
3537	size_t offset = `0`;
3538
3539	while (len >= sizeof(opt)) {
3540	if (copy_from_sockptr_offset(dst: &opt, src: optbuf, offset, size: sizeof(opt)))
3541	return -EFAULT;
3542
3543	offset += sizeof(opt);
3544	len -= sizeof(opt);
3545
3546	switch (opt.opt_code) {
3547	case TCPOPT_MSS:
3548	tp->rx_opt.mss_clamp = opt.opt_val;
3549	tcp_mtup_init(sk);
3550	break;
3551	case TCPOPT_WINDOW:
3552	{
3553	u16 snd_wscale = opt.opt_val & `0xFFFF`;
3554	u16 rcv_wscale = opt.opt_val >> `16`;
3555
3556	if (snd_wscale > TCP_MAX_WSCALE \|\| rcv_wscale > TCP_MAX_WSCALE)
3557	return -EFBIG;
3558
3559	tp->rx_opt.snd_wscale = snd_wscale;
3560	tp->rx_opt.rcv_wscale = rcv_wscale;
3561	tp->rx_opt.wscale_ok = `1`;
3562	}
3563	break;
3564	case TCPOPT_SACK_PERM:
3565	if (opt.opt_val != `0`)
3566	return -EINVAL;
3567
3568	tp->rx_opt.sack_ok \|= TCP_SACK_SEEN;
3569	break;
3570	case TCPOPT_TIMESTAMP:
3571	if (opt.opt_val != `0`)
3572	return -EINVAL;
3573
3574	tp->rx_opt.tstamp_ok = `1`;
3575	break;
3576	}
3577	}
3578
3579	return `0`;
3580	}
3581
3582	DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
3583	EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
3584
3585	static void tcp_enable_tx_delay(struct sock sk, int* val)
3586	{
3587	struct tcp_sock *tp = tcp_sk(sk);
3588	s32 delta = (val - tp->tcp_tx_delay) << `3`;
3589
3590	if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) {
3591	static int __tcp_tx_delay_enabled = `0`;
3592
3593	if (cmpxchg(&__tcp_tx_delay_enabled, `0`, `1`) == `0`) {
3594	static_branch_enable(&tcp_tx_delay_enabled);
3595	pr_info("TCP_TX_DELAY enabled\n");
3596	}
3597	}
3598	/ If we change tcp_tx_delay on a live flow, adjust tp->srtt_us,*
3599	* tp->rtt_min, icsk_rto and sk->sk_pacing_rate.
3600	* This is best effort.
3601	*/
3602	if (delta && sk->sk_state == TCP_ESTABLISHED) {
3603	s64 srtt = (s64)tp->srtt_us + delta;
3604
3605	tp->srtt_us = clamp_t(s64, srtt, `1`, ~`0U`);
3606
3607	/ Note: does not deal with non zero icsk_backoff /
3608	tcp_set_rto(sk);
3609
3610	minmax_reset(m: &tp->rtt_min, tcp_jiffies32, meas: ~`0U`);
3611
3612	tcp_update_pacing_rate(sk);
3613	}
3614	}
3615
3616	/ When set indicates to always queue non-full frames. Later the user clears*
3617	* this option and we transmit any pending partial frames in the queue. This is
3618	* meant to be used alongside sendfile() to get properly filled frames when the
3619	* user (for example) must write out headers with a write() call first and then
3620	* use sendfile to send out the data parts.
3621	*
3622	* TCP_CORK can be set together with TCP_NODELAY and it is stronger than
3623	* TCP_NODELAY.
3624	*/
3625	void __tcp_sock_set_cork(struct sock *sk, bool on)
3626	{
3627	struct tcp_sock *tp = tcp_sk(sk);
3628
3629	if (on) {
3630	tp->nonagle \|= TCP_NAGLE_CORK;
3631	} else {
3632	tp->nonagle &= ~TCP_NAGLE_CORK;
3633	if (tp->nonagle & TCP_NAGLE_OFF)
3634	tp->nonagle \|= TCP_NAGLE_PUSH;
3635	tcp_push_pending_frames(sk);
3636	}
3637	}
3638
3639	void tcp_sock_set_cork(struct sock *sk, bool on)
3640	{
3641	lock_sock(sk);
3642	__tcp_sock_set_cork(sk, on);
3643	release_sock(sk);
3644	}
3645	EXPORT_SYMBOL(tcp_sock_set_cork);
3646
3647	/ TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is*
3648	* remembered, but it is not activated until cork is cleared.
3649	*
3650	* However, when TCP_NODELAY is set we make an explicit push, which overrides
3651	* even TCP_CORK for currently queued segments.
3652	*/
3653	void __tcp_sock_set_nodelay(struct sock *sk, bool on)
3654	{
3655	if (on) {
3656	tcp_sk(sk)->nonagle \|= TCP_NAGLE_OFF\|TCP_NAGLE_PUSH;
3657	tcp_push_pending_frames(sk);
3658	} else {
3659	tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
3660	}
3661	}
3662
3663	void tcp_sock_set_nodelay(struct sock *sk)
3664	{
3665	lock_sock(sk);
3666	__tcp_sock_set_nodelay(sk, on: true);
3667	release_sock(sk);
3668	}
3669	EXPORT_SYMBOL(tcp_sock_set_nodelay);
3670
3671	static void __tcp_sock_set_quickack(struct sock sk, int* val)
3672	{
3673	if (!val) {
3674	inet_csk_enter_pingpong_mode(sk);
3675	return;
3676	}
3677
3678	inet_csk_exit_pingpong_mode(sk);
3679	if ((`1` << sk->sk_state) & (TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT) &&
3680	inet_csk_ack_scheduled(sk)) {
3681	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_PUSHED;
3682	tcp_cleanup_rbuf(sk, copied: `1`);
3683	if (!(val & `1`))
3684	inet_csk_enter_pingpong_mode(sk);
3685	}
3686	}
3687
/**
 * tcp_sock_set_quickack - apply a TCP_QUICKACK value to a socket
 * @sk: socket to modify
 * @val: option value, interpreted by __tcp_sock_set_quickack()
 *
 * Takes the socket lock around __tcp_sock_set_quickack().
 */
void tcp_sock_set_quickack(struct sock *sk, int val)
{
	lock_sock(sk);
	__tcp_sock_set_quickack(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(tcp_sock_set_quickack);
3695
3696	int tcp_sock_set_syncnt(struct sock sk, int* val)
3697	{
3698	if (val < `1` \|\| val > MAX_TCP_SYNCNT)
3699	return -EINVAL;
3700
3701	WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
3702	return `0`;
3703	}
3704	EXPORT_SYMBOL(tcp_sock_set_syncnt);
3705
3706	int tcp_sock_set_user_timeout(struct sock sk, int* val)
3707	{
3708	/ Cap the max time in ms TCP will retry or probe the window*
3709	* before giving up and aborting (ETIMEDOUT) a connection.
3710	*/
3711	if (val < `0`)
3712	return -EINVAL;
3713
3714	WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
3715	return `0`;
3716	}
3717	EXPORT_SYMBOL(tcp_sock_set_user_timeout);
3718
3719	int tcp_sock_set_keepidle_locked(struct sock sk, int* val)
3720	{
3721	struct tcp_sock *tp = tcp_sk(sk);
3722
3723	if (val < `1` \|\| val > MAX_TCP_KEEPIDLE)
3724	return -EINVAL;
3725
3726	/ Paired with WRITE_ONCE() in keepalive_time_when() /
3727	WRITE_ONCE(tp->keepalive_time, val * HZ);
3728	if (sock_flag(sk, flag: SOCK_KEEPOPEN) &&
3729	!((`1` << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN))) {
3730	u32 elapsed = keepalive_time_elapsed(tp);
3731
3732	if (tp->keepalive_time > elapsed)
3733	elapsed = tp->keepalive_time - elapsed;
3734	else
3735	elapsed = `0`;
3736	tcp_reset_keepalive_timer(sk, timeout: elapsed);
3737	}
3738
3739	return `0`;
3740	}
3741
/**
 * tcp_sock_set_keepidle - set TCP_KEEPIDLE, taking the socket lock
 * @sk: socket to modify
 * @val: idle time in seconds
 *
 * Return: the result of tcp_sock_set_keepidle_locked().
 */
int tcp_sock_set_keepidle(struct sock *sk, int val)
{
	int ret;

	lock_sock(sk);
	ret = tcp_sock_set_keepidle_locked(sk, val);
	release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(tcp_sock_set_keepidle);
3752
3753	int tcp_sock_set_keepintvl(struct sock sk, int* val)
3754	{
3755	if (val < `1` \|\| val > MAX_TCP_KEEPINTVL)
3756	return -EINVAL;
3757
3758	WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
3759	return `0`;
3760	}
3761	EXPORT_SYMBOL(tcp_sock_set_keepintvl);
3762
3763	int tcp_sock_set_keepcnt(struct sock sk, int* val)
3764	{
3765	if (val < `1` \|\| val > MAX_TCP_KEEPCNT)
3766	return -EINVAL;
3767
3768	/ Paired with READ_ONCE() in keepalive_probes() /
3769	WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
3770	return `0`;
3771	}
3772	EXPORT_SYMBOL(tcp_sock_set_keepcnt);
3773
3774	int tcp_set_window_clamp(struct sock sk, int* val)
3775	{
3776	u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh;
3777	struct tcp_sock *tp = tcp_sk(sk);
3778
3779	if (!val) {
3780	if (sk->sk_state != TCP_CLOSE)
3781	return -EINVAL;
3782	WRITE_ONCE(tp->window_clamp, `0`);
3783	return `0`;
3784	}
3785
3786	old_window_clamp = tp->window_clamp;
3787	new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / `2`, val);
3788
3789	if (new_window_clamp == old_window_clamp)
3790	return `0`;
3791
3792	WRITE_ONCE(tp->window_clamp, new_window_clamp);
3793
3794	/ Need to apply the reserved mem provisioning only*
3795	* when shrinking the window clamp.
3796	*/
3797	if (new_window_clamp < old_window_clamp) {
3798	__tcp_adjust_rcv_ssthresh(sk, new_ssthresh: new_window_clamp);
3799	} else {
3800	new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp);
3801	tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh);
3802	}
3803	return `0`;
3804	}
3805
3806	int tcp_sock_set_maxseg(struct sock sk, int* val)
3807	{
3808	/ Values greater than interface MTU won't take effect. However*
3809	* at the point when this call is done we typically don't yet
3810	* know which interface is going to be used
3811	*/
3812	if (val && (val < TCP_MIN_MSS \|\| val > MAX_TCP_WINDOW))
3813	return -EINVAL;
3814
3815	WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val);
3816	return `0`;
3817	}
3818
3819	/*
3820	* Socket option code for TCP.
3821	*/
3822	int do_tcp_setsockopt(struct sock sk, int* level, int optname,
3823	sockptr_t optval, unsigned int optlen)
3824	{
3825	struct tcp_sock *tp = tcp_sk(sk);
3826	struct inet_connection_sock *icsk = inet_csk(sk);
3827	struct net *net = sock_net(sk);
3828	int val;
3829	int err = `0`;
3830
3831	/ These are data/string values, all the others are ints /
3832	switch (optname) {
3833	case TCP_CONGESTION: {
3834	char name[TCP_CA_NAME_MAX];
3835
3836	if (optlen < `1`)
3837	return -EINVAL;
3838
3839	val = strncpy_from_sockptr(dst: name, src: optval,
3840	min_t(long, TCP_CA_NAME_MAX-`1`, optlen));
3841	if (val < `0`)
3842	return -EFAULT;
3843	name[val] = `0`;
3844
3845	sockopt_lock_sock(sk);
3846	err = tcp_set_congestion_control(sk, name, load: !has_current_bpf_ctx(),
3847	cap_net_admin: sockopt_ns_capable(ns: sock_net(sk)->user_ns,
3848	CAP_NET_ADMIN));
3849	sockopt_release_sock(sk);
3850	return err;
3851	}
3852	case TCP_ULP: {
3853	char name[TCP_ULP_NAME_MAX];
3854
3855	if (optlen < `1`)
3856	return -EINVAL;
3857
3858	val = strncpy_from_sockptr(dst: name, src: optval,
3859	min_t(long, TCP_ULP_NAME_MAX - `1`,
3860	optlen));
3861	if (val < `0`)
3862	return -EFAULT;
3863	name[val] = `0`;
3864
3865	sockopt_lock_sock(sk);
3866	err = tcp_set_ulp(sk, name);
3867	sockopt_release_sock(sk);
3868	return err;
3869	}
3870	case TCP_FASTOPEN_KEY: {
3871	__u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
3872	__u8 *backup_key = NULL;
3873
3874	/ Allow a backup key as well to facilitate key rotation*
3875	* First key is the active one.
3876	*/
3877	if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
3878	optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
3879	return -EINVAL;
3880
3881	if (copy_from_sockptr(dst: key, src: optval, size: optlen))
3882	return -EFAULT;
3883
3884	if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
3885	backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
3886
3887	return tcp_fastopen_reset_cipher(net, sk, primary_key: key, backup_key);
3888	}
3889	default:
3890	/ fallthru /
3891	break;
3892	}
3893
3894	if (optlen < sizeof(int))
3895	return -EINVAL;
3896
3897	if (copy_from_sockptr(dst: &val, src: optval, size: sizeof(val)))
3898	return -EFAULT;
3899
3900	/ Handle options that can be set without locking the socket. /
3901	switch (optname) {
3902	case TCP_SYNCNT:
3903	return tcp_sock_set_syncnt(sk, val);
3904	case TCP_USER_TIMEOUT:
3905	return tcp_sock_set_user_timeout(sk, val);
3906	case TCP_KEEPINTVL:
3907	return tcp_sock_set_keepintvl(sk, val);
3908	case TCP_KEEPCNT:
3909	return tcp_sock_set_keepcnt(sk, val);
3910	case TCP_LINGER2:
3911	if (val < `0`)
3912	WRITE_ONCE(tp->linger2, -`1`);
3913	else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
3914	WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
3915	else
3916	WRITE_ONCE(tp->linger2, val * HZ);
3917	return `0`;
3918	case TCP_DEFER_ACCEPT:
3919	/ Translate value in seconds to number of retransmits /
3920	WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
3921	secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
3922	TCP_RTO_MAX / HZ));
3923	return `0`;
3924	case TCP_RTO_MAX_MS:
3925	if (val < MSEC_PER_SEC \|\| val > TCP_RTO_MAX_SEC * MSEC_PER_SEC)
3926	return -EINVAL;
3927	WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val));
3928	return `0`;
3929	case TCP_RTO_MIN_US: {
3930	int rto_min = usecs_to_jiffies(u: val);
3931
3932	if (rto_min > TCP_RTO_MIN \|\| rto_min < TCP_TIMEOUT_MIN)
3933	return -EINVAL;
3934	WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min);
3935	return `0`;
3936	}
3937	case TCP_DELACK_MAX_US: {
3938	int delack_max = usecs_to_jiffies(u: val);
3939
3940	if (delack_max > TCP_DELACK_MAX \|\| delack_max < TCP_TIMEOUT_MIN)
3941	return -EINVAL;
3942	WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
3943	return `0`;
3944	}
3945	case TCP_MAXSEG:
3946	return tcp_sock_set_maxseg(sk, val);
3947	}
3948
3949	sockopt_lock_sock(sk);
3950
3951	switch (optname) {
3952	case TCP_NODELAY:
3953	__tcp_sock_set_nodelay(sk, on: val);
3954	break;
3955
3956	case TCP_THIN_LINEAR_TIMEOUTS:
3957	if (val < `0` \|\| val > `1`)
3958	err = -EINVAL;
3959	else
3960	tp->thin_lto = val;
3961	break;
3962
3963	case TCP_THIN_DUPACK:
3964	if (val < `0` \|\| val > `1`)
3965	err = -EINVAL;
3966	break;
3967
3968	case TCP_REPAIR:
3969	if (!tcp_can_repair_sock(sk))
3970	err = -EPERM;
3971	else if (val == TCP_REPAIR_ON) {
3972	tp->repair = `1`;
3973	sk->sk_reuse = SK_FORCE_REUSE;
3974	tp->repair_queue = TCP_NO_QUEUE;
3975	} else if (val == TCP_REPAIR_OFF) {
3976	tp->repair = `0`;
3977	sk->sk_reuse = SK_NO_REUSE;
3978	tcp_send_window_probe(sk);
3979	} else if (val == TCP_REPAIR_OFF_NO_WP) {
3980	tp->repair = `0`;
3981	sk->sk_reuse = SK_NO_REUSE;
3982	} else
3983	err = -EINVAL;
3984
3985	break;
3986
3987	case TCP_REPAIR_QUEUE:
3988	if (!tp->repair)
3989	err = -EPERM;
3990	else if ((unsigned int)val < TCP_QUEUES_NR)
3991	tp->repair_queue = val;
3992	else
3993	err = -EINVAL;
3994	break;
3995
3996	case TCP_QUEUE_SEQ:
3997	if (sk->sk_state != TCP_CLOSE) {
3998	err = -EPERM;
3999	} else if (tp->repair_queue == TCP_SEND_QUEUE) {
4000	if (!tcp_rtx_queue_empty(sk))
4001	err = -EPERM;
4002	else
4003	WRITE_ONCE(tp->write_seq, val);
4004	} else if (tp->repair_queue == TCP_RECV_QUEUE) {
4005	if (tp->rcv_nxt != tp->copied_seq) {
4006	err = -EPERM;
4007	} else {
4008	WRITE_ONCE(tp->rcv_nxt, val);
4009	WRITE_ONCE(tp->copied_seq, val);
4010	}
4011	} else {
4012	err = -EINVAL;
4013	}
4014	break;
4015
4016	case TCP_REPAIR_OPTIONS:
4017	if (!tp->repair)
4018	err = -EINVAL;
4019	else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
4020	err = tcp_repair_options_est(sk, optbuf: optval, len: optlen);
4021	else
4022	err = -EPERM;
4023	break;
4024
4025	case TCP_CORK:
4026	__tcp_sock_set_cork(sk, on: val);
4027	break;
4028
4029	case TCP_KEEPIDLE:
4030	err = tcp_sock_set_keepidle_locked(sk, val);
4031	break;
4032	case TCP_SAVE_SYN:
4033	/ 0: disable, 1: enable, 2: start from ether_header /
4034	if (val < `0` \|\| val > `2`)
4035	err = -EINVAL;
4036	else
4037	tp->save_syn = val;
4038	break;
4039
4040	case TCP_WINDOW_CLAMP:
4041	err = tcp_set_window_clamp(sk, val);
4042	break;
4043
4044	case TCP_QUICKACK:
4045	__tcp_sock_set_quickack(sk, val);
4046	break;
4047
4048	case TCP_AO_REPAIR:
4049	if (!tcp_can_repair_sock(sk)) {
4050	err = -EPERM;
4051	break;
4052	}
4053	err = tcp_ao_set_repair(sk, optval, optlen);
4054	break;
4055	#ifdef CONFIG_TCP_AO
4056	case TCP_AO_ADD_KEY:
4057	case TCP_AO_DEL_KEY:
4058	case TCP_AO_INFO: {
4059	/ If this is the first TCP-AO setsockopt() on the socket,*
4060	* sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR
4061	* in any state.
4062	*/
4063	if ((`1` << sk->sk_state) & (TCPF_LISTEN \| TCPF_CLOSE))
4064	goto ao_parse;
4065	if (rcu_dereference_protected(tcp_sk(sk)->ao_info,
4066	lockdep_sock_is_held(sk)))
4067	goto ao_parse;
4068	if (tp->repair)
4069	goto ao_parse;
4070	err = -EISCONN;
4071	break;
4072	ao_parse:
4073	err = tp->af_specific->ao_parse(sk, optname, optval, optlen);
4074	break;
4075	}
4076	#endif
4077	#ifdef CONFIG_TCP_MD5SIG
4078	case TCP_MD5SIG:
4079	case TCP_MD5SIG_EXT:
4080	err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
4081	break;
4082	#endif
4083	case TCP_FASTOPEN:
4084	if (val >= `0` && ((`1` << sk->sk_state) & (TCPF_CLOSE \|
4085	TCPF_LISTEN))) {
4086	tcp_fastopen_init_key_once(net);
4087
4088	fastopen_queue_tune(sk, backlog: val);
4089	} else {
4090	err = -EINVAL;
4091	}
4092	break;
4093	case TCP_FASTOPEN_CONNECT:
4094	if (val > `1` \|\| val < `0`) {
4095	err = -EINVAL;
4096	} else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
4097	TFO_CLIENT_ENABLE) {
4098	if (sk->sk_state == TCP_CLOSE)
4099	tp->fastopen_connect = val;
4100	else
4101	err = -EINVAL;
4102	} else {
4103	err = -EOPNOTSUPP;
4104	}
4105	break;
4106	case TCP_FASTOPEN_NO_COOKIE:
4107	if (val > `1` \|\| val < `0`)
4108	err = -EINVAL;
4109	else if (!((`1` << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
4110	err = -EINVAL;
4111	else
4112	tp->fastopen_no_cookie = val;
4113	break;
4114	case TCP_TIMESTAMP:
4115	if (!tp->repair) {
4116	err = -EPERM;
4117	break;
4118	}
4119	/ val is an opaque field,*
4120	* and low order bit contains usec_ts enable bit.
4121	* Its a best effort, and we do not care if user makes an error.
4122	*/
4123	tp->tcp_usec_ts = val & `1`;
4124	WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
4125	break;
4126	case TCP_REPAIR_WINDOW:
4127	err = tcp_repair_set_window(tp, optbuf: optval, len: optlen);
4128	break;
4129	case TCP_NOTSENT_LOWAT:
4130	WRITE_ONCE(tp->notsent_lowat, val);
4131	sk->sk_write_space(sk);
4132	break;
4133	case TCP_INQ:
4134	if (val > `1` \|\| val < `0`)
4135	err = -EINVAL;
4136	else
4137	tp->recvmsg_inq = val;
4138	break;
4139	case TCP_TX_DELAY:
4140	/ tp->srtt_us is u32, and is shifted by 3 /
4141	if (val < `0` \|\| val >= (`1U` << (`31` - `3`))) {
4142	err = -EINVAL;
4143	break;
4144	}
4145	tcp_enable_tx_delay(sk, val);
4146	WRITE_ONCE(tp->tcp_tx_delay, val);
4147	break;
4148	default:
4149	err = -ENOPROTOOPT;
4150	break;
4151	}
4152
4153	sockopt_release_sock(sk);
4154	return err;
4155	}
4156
4157	int tcp_setsockopt(struct sock sk, int* level, int optname, sockptr_t optval,
4158	unsigned int optlen)
4159	{
4160	const struct inet_connection_sock *icsk = inet_csk(sk);
4161
4162	if (level != SOL_TCP)
4163	/ Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() /
4164	return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname,
4165	optval, optlen);
4166	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
4167	}
4168	EXPORT_IPV6_MOD(tcp_setsockopt);
4169
4170	static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
4171	struct tcp_info *info)
4172	{
4173	u64 stats[__TCP_CHRONO_MAX], total = `0`;
4174	enum tcp_chrono i;
4175
4176	for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) {
4177	stats[i] = tp->chrono_stat[i - `1`];
4178	if (i == tp->chrono_type)
4179	stats[i] += tcp_jiffies32 - tp->chrono_start;
4180	stats[i] *= USEC_PER_SEC / HZ;
4181	total += stats[i];
4182	}
4183
4184	info->tcpi_busy_time = total;
4185	info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED];
4186	info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED];
4187	}
4188
4189	/ Return information about state of tcp endpoint in API format. /
4190	void tcp_get_info(struct sock sk, struct* tcp_info *info)
4191	{
4192	const struct tcp_sock tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM /
4193	const struct inet_connection_sock *icsk = inet_csk(sk);
4194	const u8 ect1_idx = INET_ECN_ECT_1 - `1`;
4195	const u8 ect0_idx = INET_ECN_ECT_0 - `1`;
4196	const u8 ce_idx = INET_ECN_CE - `1`;
4197	unsigned long rate;
4198	u32 now;
4199	u64 rate64;
4200	bool slow;
4201
4202	memset(info, `0`, sizeof(*info));
4203	if (sk->sk_type != SOCK_STREAM)
4204	return;
4205
4206	info->tcpi_state = inet_sk_state_load(sk);
4207
4208	/ Report meaningful fields for all TCP states, including listeners /
4209	rate = READ_ONCE(sk->sk_pacing_rate);
4210	rate64 = (rate != ~`0UL`) ? rate : ~`0ULL`;
4211	info->tcpi_pacing_rate = rate64;
4212
4213	rate = READ_ONCE(sk->sk_max_pacing_rate);
4214	rate64 = (rate != ~`0UL`) ? rate : ~`0ULL`;
4215	info->tcpi_max_pacing_rate = rate64;
4216
4217	info->tcpi_reordering = tp->reordering;
4218	info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
4219
4220	if (info->tcpi_state == TCP_LISTEN) {
4221	/ listeners aliased fields :*
4222	* tcpi_unacked -> Number of children ready for accept()
4223	* tcpi_sacked -> max backlog
4224	*/
4225	info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
4226	info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
4227	return;
4228	}
4229
4230	slow = lock_sock_fast(sk);
4231
4232	info->tcpi_ca_state = icsk->icsk_ca_state;
4233	info->tcpi_retransmits = icsk->icsk_retransmits;
4234	info->tcpi_probes = icsk->icsk_probes_out;
4235	info->tcpi_backoff = icsk->icsk_backoff;
4236
4237	if (tp->rx_opt.tstamp_ok)
4238	info->tcpi_options \|= TCPI_OPT_TIMESTAMPS;
4239	if (tcp_is_sack(tp))
4240	info->tcpi_options \|= TCPI_OPT_SACK;
4241	if (tp->rx_opt.wscale_ok) {
4242	info->tcpi_options \|= TCPI_OPT_WSCALE;
4243	info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
4244	info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
4245	}
4246
4247	if (tcp_ecn_mode_any(tp))
4248	info->tcpi_options \|= TCPI_OPT_ECN;
4249	if (tp->ecn_flags & TCP_ECN_SEEN)
4250	info->tcpi_options \|= TCPI_OPT_ECN_SEEN;
4251	if (tp->syn_data_acked)
4252	info->tcpi_options \|= TCPI_OPT_SYN_DATA;
4253	if (tp->tcp_usec_ts)
4254	info->tcpi_options \|= TCPI_OPT_USEC_TS;
4255	if (tp->syn_fastopen_child)
4256	info->tcpi_options \|= TCPI_OPT_TFO_CHILD;
4257
4258	info->tcpi_rto = jiffies_to_usecs(j: icsk->icsk_rto);
4259	info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
4260	tcp_delack_max(sk)));
4261	info->tcpi_snd_mss = tp->mss_cache;
4262	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
4263
4264	info->tcpi_unacked = tp->packets_out;
4265	info->tcpi_sacked = tp->sacked_out;
4266
4267	info->tcpi_lost = tp->lost_out;
4268	info->tcpi_retrans = tp->retrans_out;
4269
4270	now = tcp_jiffies32;
4271	info->tcpi_last_data_sent = jiffies_to_msecs(j: now - tp->lsndtime);
4272	info->tcpi_last_data_recv = jiffies_to_msecs(j: now - icsk->icsk_ack.lrcvtime);
4273	info->tcpi_last_ack_recv = jiffies_to_msecs(j: now - tp->rcv_tstamp);
4274
4275	info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
4276	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
4277	info->tcpi_rtt = tp->srtt_us >> `3`;
4278	info->tcpi_rttvar = tp->mdev_us >> `2`;
4279	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
4280	info->tcpi_advmss = tp->advmss;
4281
4282	info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> `3`;
4283	info->tcpi_rcv_space = tp->rcvq_space.space;
4284
4285	info->tcpi_total_retrans = tp->total_retrans;
4286
4287	info->tcpi_bytes_acked = tp->bytes_acked;
4288	info->tcpi_bytes_received = tp->bytes_received;
4289	info->tcpi_notsent_bytes = max_t(int, `0`, tp->write_seq - tp->snd_nxt);
4290	tcp_get_info_chrono_stats(tp, info);
4291
4292	info->tcpi_segs_out = tp->segs_out;
4293
4294	/ segs_in and data_segs_in can be updated from tcp_segs_in() from BH /
4295	info->tcpi_segs_in = READ_ONCE(tp->segs_in);
4296	info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
4297
4298	info->tcpi_min_rtt = tcp_min_rtt(tp);
4299	info->tcpi_data_segs_out = tp->data_segs_out;
4300
4301	info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? `1` : `0`;
4302	rate64 = tcp_compute_delivery_rate(tp);
4303	if (rate64)
4304	info->tcpi_delivery_rate = rate64;
4305	info->tcpi_delivered = tp->delivered;
4306	info->tcpi_delivered_ce = tp->delivered_ce;
4307	info->tcpi_bytes_sent = tp->bytes_sent;
4308	info->tcpi_bytes_retrans = tp->bytes_retrans;
4309	info->tcpi_dsack_dups = tp->dsack_dups;
4310	info->tcpi_reord_seen = tp->reord_seen;
4311	info->tcpi_rcv_ooopack = tp->rcv_ooopack;
4312	info->tcpi_snd_wnd = tp->snd_wnd;
4313	info->tcpi_rcv_wnd = tp->rcv_wnd;
4314	info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
4315	info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
4316
4317	info->tcpi_total_rto = tp->total_rto;
4318	info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
4319	info->tcpi_total_rto_time = tp->total_rto_time;
4320	if (tp->rto_stamp)
4321	info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
4322
4323	info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
4324	info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
4325	info->tcpi_received_ce = tp->received_ce;
4326	info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx];
4327	info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
4328	info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx];
4329	info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx];
4330	info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx];
4331	info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx];
4332
4333	unlock_sock_fast(sk, slow);
4334	}
4335	EXPORT_SYMBOL_GPL(tcp_get_info);
4336
4337	static size_t tcp_opt_stats_get_size(void)
4338	{
4339	return
4340	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_BUSY /
4341	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_RWND_LIMITED /
4342	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_SNDBUF_LIMITED /
4343	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_DATA_SEGS_OUT /
4344	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_TOTAL_RETRANS /
4345	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_PACING_RATE /
4346	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_DELIVERY_RATE /
4347	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_SND_CWND /
4348	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_REORDERING /
4349	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_MIN_RTT /
4350	nla_total_size(payload: sizeof(u8)) + / TCP_NLA_RECUR_RETRANS /
4351	nla_total_size(payload: sizeof(u8)) + / TCP_NLA_DELIVERY_RATE_APP_LMT /
4352	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_SNDQ_SIZE /
4353	nla_total_size(payload: sizeof(u8)) + / TCP_NLA_CA_STATE /
4354	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_SND_SSTHRESH /
4355	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_DELIVERED /
4356	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_DELIVERED_CE /
4357	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_BYTES_SENT /
4358	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_BYTES_RETRANS /
4359	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_DSACK_DUPS /
4360	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_REORD_SEEN /
4361	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_SRTT /
4362	nla_total_size(payload: sizeof(u16)) + / TCP_NLA_TIMEOUT_REHASH /
4363	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_BYTES_NOTSENT /
4364	nla_total_size_64bit(payload: sizeof(u64)) + / TCP_NLA_EDT /
4365	nla_total_size(payload: sizeof(u8)) + / TCP_NLA_TTL /
4366	nla_total_size(payload: sizeof(u32)) + / TCP_NLA_REHASH /
4367	`0`;
4368	}
4369
4370	/ Returns TTL or hop limit of an incoming packet from skb. /
4371	static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
4372	{
4373	if (skb->protocol == htons(ETH_P_IP))
4374	return ip_hdr(skb)->ttl;
4375	else if (skb->protocol == htons(ETH_P_IPV6))
4376	return ipv6_hdr(skb)->hop_limit;
4377	else
4378	return `0`;
4379	}
4380
4381	struct sk_buff tcp_get_timestamping_opt_stats(const* struct sock *sk,
4382	const struct sk_buff *orig_skb,
4383	const struct sk_buff *ack_skb)
4384	{
4385	const struct tcp_sock *tp = tcp_sk(sk);
4386	struct sk_buff *stats;
4387	struct tcp_info info;
4388	unsigned long rate;
4389	u64 rate64;
4390
4391	stats = alloc_skb(size: tcp_opt_stats_get_size(), GFP_ATOMIC);
4392	if (!stats)
4393	return NULL;
4394
4395	tcp_get_info_chrono_stats(tp, info: &info);
4396	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_BUSY,
4397	value: info.tcpi_busy_time, padattr: TCP_NLA_PAD);
4398	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_RWND_LIMITED,
4399	value: info.tcpi_rwnd_limited, padattr: TCP_NLA_PAD);
4400	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_SNDBUF_LIMITED,
4401	value: info.tcpi_sndbuf_limited, padattr: TCP_NLA_PAD);
4402	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_DATA_SEGS_OUT,
4403	value: tp->data_segs_out, padattr: TCP_NLA_PAD);
4404	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_TOTAL_RETRANS,
4405	value: tp->total_retrans, padattr: TCP_NLA_PAD);
4406
4407	rate = READ_ONCE(sk->sk_pacing_rate);
4408	rate64 = (rate != ~`0UL`) ? rate : ~`0ULL`;
4409	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_PACING_RATE, value: rate64, padattr: TCP_NLA_PAD);
4410
4411	rate64 = tcp_compute_delivery_rate(tp);
4412	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_DELIVERY_RATE, value: rate64, padattr: TCP_NLA_PAD);
4413
4414	nla_put_u32(skb: stats, attrtype: TCP_NLA_SND_CWND, value: tcp_snd_cwnd(tp));
4415	nla_put_u32(skb: stats, attrtype: TCP_NLA_REORDERING, value: tp->reordering);
4416	nla_put_u32(skb: stats, attrtype: TCP_NLA_MIN_RTT, value: tcp_min_rtt(tp));
4417
4418	nla_put_u8(skb: stats, attrtype: TCP_NLA_RECUR_RETRANS,
4419	READ_ONCE(inet_csk(sk)->icsk_retransmits));
4420	nla_put_u8(skb: stats, attrtype: TCP_NLA_DELIVERY_RATE_APP_LMT, value: !!tp->rate_app_limited);
4421	nla_put_u32(skb: stats, attrtype: TCP_NLA_SND_SSTHRESH, value: tp->snd_ssthresh);
4422	nla_put_u32(skb: stats, attrtype: TCP_NLA_DELIVERED, value: tp->delivered);
4423	nla_put_u32(skb: stats, attrtype: TCP_NLA_DELIVERED_CE, value: tp->delivered_ce);
4424
4425	nla_put_u32(skb: stats, attrtype: TCP_NLA_SNDQ_SIZE, value: tp->write_seq - tp->snd_una);
4426	nla_put_u8(skb: stats, attrtype: TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
4427
4428	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_BYTES_SENT, value: tp->bytes_sent,
4429	padattr: TCP_NLA_PAD);
4430	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_BYTES_RETRANS, value: tp->bytes_retrans,
4431	padattr: TCP_NLA_PAD);
4432	nla_put_u32(skb: stats, attrtype: TCP_NLA_DSACK_DUPS, value: tp->dsack_dups);
4433	nla_put_u32(skb: stats, attrtype: TCP_NLA_REORD_SEEN, value: tp->reord_seen);
4434	nla_put_u32(skb: stats, attrtype: TCP_NLA_SRTT, value: tp->srtt_us >> `3`);
4435	nla_put_u16(skb: stats, attrtype: TCP_NLA_TIMEOUT_REHASH, value: tp->timeout_rehash);
4436	nla_put_u32(skb: stats, attrtype: TCP_NLA_BYTES_NOTSENT,
4437	max_t(int, `0`, tp->write_seq - tp->snd_nxt));
4438	nla_put_u64_64bit(skb: stats, attrtype: TCP_NLA_EDT, value: orig_skb->skb_mstamp_ns,
4439	padattr: TCP_NLA_PAD);
4440	if (ack_skb)
4441	nla_put_u8(skb: stats, attrtype: TCP_NLA_TTL,
4442	value: tcp_skb_ttl_or_hop_limit(skb: ack_skb));
4443
4444	nla_put_u32(skb: stats, attrtype: TCP_NLA_REHASH, value: tp->plb_rehash + tp->timeout_rehash);
4445	return stats;
4446	}
4447
4448	int do_tcp_getsockopt(struct sock sk, int* level,
4449	int optname, sockptr_t optval, sockptr_t optlen)
4450	{
4451	struct inet_connection_sock *icsk = inet_csk(sk);
4452	struct tcp_sock *tp = tcp_sk(sk);
4453	struct net *net = sock_net(sk);
4454	int user_mss;
4455	int val, len;
4456
4457	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4458	return -EFAULT;
4459
4460	if (len < `0`)
4461	return -EINVAL;
4462
4463	len = min_t(unsigned int, len, sizeof(int));
4464
4465	switch (optname) {
4466	case TCP_MAXSEG:
4467	val = tp->mss_cache;
4468	user_mss = READ_ONCE(tp->rx_opt.user_mss);
4469	if (user_mss &&
4470	((`1` << sk->sk_state) & (TCPF_CLOSE \| TCPF_LISTEN)))
4471	val = user_mss;
4472	if (tp->repair)
4473	val = tp->rx_opt.mss_clamp;
4474	break;
4475	case TCP_NODELAY:
4476	val = !!(tp->nonagle&TCP_NAGLE_OFF);
4477	break;
4478	case TCP_CORK:
4479	val = !!(tp->nonagle&TCP_NAGLE_CORK);
4480	break;
4481	case TCP_KEEPIDLE:
4482	val = keepalive_time_when(tp) / HZ;
4483	break;
4484	case TCP_KEEPINTVL:
4485	val = keepalive_intvl_when(tp) / HZ;
4486	break;
4487	case TCP_KEEPCNT:
4488	val = keepalive_probes(tp);
4489	break;
4490	case TCP_SYNCNT:
4491	val = READ_ONCE(icsk->icsk_syn_retries) ? :
4492	READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
4493	break;
4494	case TCP_LINGER2:
4495	val = READ_ONCE(tp->linger2);
4496	if (val >= `0`)
4497	val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
4498	break;
4499	case TCP_DEFER_ACCEPT:
4500	val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
4501	val = retrans_to_secs(retrans: val, TCP_TIMEOUT_INIT / HZ,
4502	TCP_RTO_MAX / HZ);
4503	break;
4504	case TCP_WINDOW_CLAMP:
4505	val = READ_ONCE(tp->window_clamp);
4506	break;
4507	case TCP_INFO: {
4508	struct tcp_info info;
4509
4510	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4511	return -EFAULT;
4512
4513	tcp_get_info(sk, &info);
4514
4515	len = min_t(unsigned int, len, sizeof(info));
4516	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4517	return -EFAULT;
4518	if (copy_to_sockptr(dst: optval, src: &info, size: len))
4519	return -EFAULT;
4520	return `0`;
4521	}
4522	case TCP_CC_INFO: {
4523	const struct tcp_congestion_ops *ca_ops;
4524	union tcp_cc_info info;
4525	size_t sz = `0`;
4526	int attr;
4527
4528	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4529	return -EFAULT;
4530
4531	ca_ops = icsk->icsk_ca_ops;
4532	if (ca_ops && ca_ops->get_info)
4533	sz = ca_ops->get_info(sk, ~`0U`, &attr, &info);
4534
4535	len = min_t(unsigned int, len, sz);
4536	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4537	return -EFAULT;
4538	if (copy_to_sockptr(dst: optval, src: &info, size: len))
4539	return -EFAULT;
4540	return `0`;
4541	}
4542	case TCP_QUICKACK:
4543	val = !inet_csk_in_pingpong_mode(sk);
4544	break;
4545
4546	case TCP_CONGESTION:
4547	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4548	return -EFAULT;
4549	len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
4550	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4551	return -EFAULT;
4552	if (copy_to_sockptr(dst: optval, src: icsk->icsk_ca_ops->name, size: len))
4553	return -EFAULT;
4554	return `0`;
4555
4556	case TCP_ULP:
4557	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4558	return -EFAULT;
4559	len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
4560	if (!icsk->icsk_ulp_ops) {
4561	len = `0`;
4562	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4563	return -EFAULT;
4564	return `0`;
4565	}
4566	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4567	return -EFAULT;
4568	if (copy_to_sockptr(dst: optval, src: icsk->icsk_ulp_ops->name, size: len))
4569	return -EFAULT;
4570	return `0`;
4571
4572	case TCP_FASTOPEN_KEY: {
4573	u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
4574	unsigned int key_len;
4575
4576	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4577	return -EFAULT;
4578
4579	key_len = tcp_fastopen_get_cipher(net, icsk, key) *
4580	TCP_FASTOPEN_KEY_LENGTH;
4581	len = min_t(unsigned int, len, key_len);
4582	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4583	return -EFAULT;
4584	if (copy_to_sockptr(dst: optval, src: key, size: len))
4585	return -EFAULT;
4586	return `0`;
4587	}
4588	case TCP_THIN_LINEAR_TIMEOUTS:
4589	val = tp->thin_lto;
4590	break;
4591
4592	case TCP_THIN_DUPACK:
4593	val = `0`;
4594	break;
4595
4596	case TCP_REPAIR:
4597	val = tp->repair;
4598	break;
4599
4600	case TCP_REPAIR_QUEUE:
4601	if (tp->repair)
4602	val = tp->repair_queue;
4603	else
4604	return -EINVAL;
4605	break;
4606
4607	case TCP_REPAIR_WINDOW: {
4608	struct tcp_repair_window opt;
4609
4610	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4611	return -EFAULT;
4612
4613	if (len != sizeof(opt))
4614	return -EINVAL;
4615
4616	if (!tp->repair)
4617	return -EPERM;
4618
4619	opt.snd_wl1 = tp->snd_wl1;
4620	opt.snd_wnd = tp->snd_wnd;
4621	opt.max_window = tp->max_window;
4622	opt.rcv_wnd = tp->rcv_wnd;
4623	opt.rcv_wup = tp->rcv_wup;
4624
4625	if (copy_to_sockptr(dst: optval, src: &opt, size: len))
4626	return -EFAULT;
4627	return `0`;
4628	}
4629	case TCP_QUEUE_SEQ:
4630	if (tp->repair_queue == TCP_SEND_QUEUE)
4631	val = tp->write_seq;
4632	else if (tp->repair_queue == TCP_RECV_QUEUE)
4633	val = tp->rcv_nxt;
4634	else
4635	return -EINVAL;
4636	break;
4637
4638	case TCP_USER_TIMEOUT:
4639	val = READ_ONCE(icsk->icsk_user_timeout);
4640	break;
4641
4642	case TCP_FASTOPEN:
4643	val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
4644	break;
4645
4646	case TCP_FASTOPEN_CONNECT:
4647	val = tp->fastopen_connect;
4648	break;
4649
4650	case TCP_FASTOPEN_NO_COOKIE:
4651	val = tp->fastopen_no_cookie;
4652	break;
4653
4654	case TCP_TX_DELAY:
4655	val = READ_ONCE(tp->tcp_tx_delay);
4656	break;
4657
4658	case TCP_TIMESTAMP:
4659	val = tcp_clock_ts(usec_ts: tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
4660	if (tp->tcp_usec_ts)
4661	val \|= `1`;
4662	else
4663	val &= ~`1`;
4664	break;
4665	case TCP_NOTSENT_LOWAT:
4666	val = READ_ONCE(tp->notsent_lowat);
4667	break;
4668	case TCP_INQ:
4669	val = tp->recvmsg_inq;
4670	break;
4671	case TCP_SAVE_SYN:
4672	val = tp->save_syn;
4673	break;
4674	case TCP_SAVED_SYN: {
4675	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4676	return -EFAULT;
4677
4678	sockopt_lock_sock(sk);
4679	if (tp->saved_syn) {
4680	if (len < tcp_saved_syn_len(saved_syn: tp->saved_syn)) {
4681	len = tcp_saved_syn_len(saved_syn: tp->saved_syn);
4682	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int))) {
4683	sockopt_release_sock(sk);
4684	return -EFAULT;
4685	}
4686	sockopt_release_sock(sk);
4687	return -EINVAL;
4688	}
4689	len = tcp_saved_syn_len(saved_syn: tp->saved_syn);
4690	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int))) {
4691	sockopt_release_sock(sk);
4692	return -EFAULT;
4693	}
4694	if (copy_to_sockptr(dst: optval, src: tp->saved_syn->data, size: len)) {
4695	sockopt_release_sock(sk);
4696	return -EFAULT;
4697	}
4698	tcp_saved_syn_free(tp);
4699	sockopt_release_sock(sk);
4700	} else {
4701	sockopt_release_sock(sk);
4702	len = `0`;
4703	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4704	return -EFAULT;
4705	}
4706	return `0`;
4707	}
4708	#ifdef CONFIG_MMU
4709	case TCP_ZEROCOPY_RECEIVE: {
4710	struct scm_timestamping_internal tss;
4711	struct tcp_zerocopy_receive zc = {};
4712	int err;
4713
4714	if (copy_from_sockptr(dst: &len, src: optlen, size: sizeof(int)))
4715	return -EFAULT;
4716	if (len < `0` \|\|
4717	len < offsetofend(struct tcp_zerocopy_receive, length))
4718	return -EINVAL;
4719	if (unlikely(len > sizeof(zc))) {
4720	err = check_zeroed_sockptr(src: optval, offset: sizeof(zc),
4721	size: len - sizeof(zc));
4722	if (err < `1`)
4723	return err == `0` ? -EINVAL : err;
4724	len = sizeof(zc);
4725	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4726	return -EFAULT;
4727	}
4728	if (copy_from_sockptr(dst: &zc, src: optval, size: len))
4729	return -EFAULT;
4730	if (zc.reserved)
4731	return -EINVAL;
4732	if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
4733	return -EINVAL;
4734	sockopt_lock_sock(sk);
4735	err = tcp_zerocopy_receive(sk, zc: &zc, tss: &tss);
4736	err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
4737	&zc, &len, err);
4738	sockopt_release_sock(sk);
4739	if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
4740	goto zerocopy_rcv_cmsg;
4741	switch (len) {
4742	case offsetofend(struct tcp_zerocopy_receive, msg_flags):
4743	goto zerocopy_rcv_cmsg;
4744	case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
4745	case offsetofend(struct tcp_zerocopy_receive, msg_control):
4746	case offsetofend(struct tcp_zerocopy_receive, flags):
4747	case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
4748	case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
4749	case offsetofend(struct tcp_zerocopy_receive, err):
4750	goto zerocopy_rcv_sk_err;
4751	case offsetofend(struct tcp_zerocopy_receive, inq):
4752	goto zerocopy_rcv_inq;
4753	case offsetofend(struct tcp_zerocopy_receive, length):
4754	default:
4755	goto zerocopy_rcv_out;
4756	}
4757	zerocopy_rcv_cmsg:
4758	if (zc.msg_flags & TCP_CMSG_TS)
4759	tcp_zc_finalize_rx_tstamp(sk, zc: &zc, tss: &tss);
4760	else
4761	zc.msg_flags = `0`;
4762	zerocopy_rcv_sk_err:
4763	if (!err)
4764	zc.err = sock_error(sk);
4765	zerocopy_rcv_inq:
4766	zc.inq = tcp_inq_hint(sk);
4767	zerocopy_rcv_out:
4768	if (!err && copy_to_sockptr(dst: optval, src: &zc, size: len))
4769	err = -EFAULT;
4770	return err;
4771	}
4772	#endif
4773	case TCP_AO_REPAIR:
4774	if (!tcp_can_repair_sock(sk))
4775	return -EPERM;
4776	return tcp_ao_get_repair(sk, optval, optlen);
4777	case TCP_AO_GET_KEYS:
4778	case TCP_AO_INFO: {
4779	int err;
4780
4781	sockopt_lock_sock(sk);
4782	if (optname == TCP_AO_GET_KEYS)
4783	err = tcp_ao_get_mkts(sk, optval, optlen);
4784	else
4785	err = tcp_ao_get_sock_info(sk, optval, optlen);
4786	sockopt_release_sock(sk);
4787
4788	return err;
4789	}
4790	case TCP_IS_MPTCP:
4791	val = `0`;
4792	break;
4793	case TCP_RTO_MAX_MS:
4794	val = jiffies_to_msecs(j: tcp_rto_max(sk));
4795	break;
4796	case TCP_RTO_MIN_US:
4797	val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min));
4798	break;
4799	case TCP_DELACK_MAX_US:
4800	val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max));
4801	break;
4802	default:
4803	return -ENOPROTOOPT;
4804	}
4805
4806	if (copy_to_sockptr(dst: optlen, src: &len, size: sizeof(int)))
4807	return -EFAULT;
4808	if (copy_to_sockptr(dst: optval, src: &val, size: len))
4809	return -EFAULT;
4810	return `0`;
4811	}
4812
4813	bool tcp_bpf_bypass_getsockopt(int level, int optname)
4814	{
4815	/ TCP do_tcp_getsockopt has optimized getsockopt implementation*
4816	* to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
4817	*/
4818	if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
4819	return true;
4820
4821	return false;
4822	}
4823	EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt);
4824
4825	int tcp_getsockopt(struct sock sk, int* level, int optname, char __user *optval,
4826	int __user *optlen)
4827	{
4828	struct inet_connection_sock *icsk = inet_csk(sk);
4829
4830	if (level != SOL_TCP)
4831	/ Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() /
4832	return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname,
4833	optval, optlen);
4834	return do_tcp_getsockopt(sk, level, optname, optval: USER_SOCKPTR(p: optval),
4835	optlen: USER_SOCKPTR(p: optlen));
4836	}
4837	EXPORT_IPV6_MOD(tcp_getsockopt);
4838
4839	#ifdef CONFIG_TCP_MD5SIG
4840	void tcp_md5_hash_skb_data(struct md5_ctx ctx, const* struct sk_buff *skb,
4841	unsigned int header_len)
4842	{
4843	const unsigned int head_data_len = skb_headlen(skb) > header_len ?
4844	skb_headlen(skb) - header_len : `0`;
4845	const struct skb_shared_info *shi = skb_shinfo(skb);
4846	struct sk_buff *frag_iter;
4847	unsigned int i;
4848
4849	md5_update(ctx, data: (const u8 *)tcp_hdr(skb) + header_len, len: head_data_len);
4850
4851	for (i = `0`; i < shi->nr_frags; ++i) {
4852	const skb_frag_t *f = &shi->frags[i];
4853	u32 p_off, p_len, copied;
4854	const void *vaddr;
4855	struct page *p;
4856
4857	skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
4858	p, p_off, p_len, copied) {
4859	vaddr = kmap_local_page(page: p);
4860	md5_update(ctx, data: vaddr + p_off, len: p_len);
4861	kunmap_local(vaddr);
4862	}
4863	}
4864
4865	skb_walk_frags(skb, frag_iter)
4866	tcp_md5_hash_skb_data(ctx, skb: frag_iter, header_len: `0`);
4867	}
4868	EXPORT_IPV6_MOD(tcp_md5_hash_skb_data);
4869
4870	void tcp_md5_hash_key(struct md5_ctx *ctx,
4871	const struct tcp_md5sig_key *key)
4872	{
4873	u8 keylen = READ_ONCE(key->keylen); / paired with WRITE_ONCE() in tcp_md5_do_add /
4874
4875	/ We use data_race() because tcp_md5_do_add() might change*
4876	* key->key under us
4877	*/
4878	data_race(({ md5_update(ctx, key->key, keylen), `0`; }));
4879	}
4880	EXPORT_IPV6_MOD(tcp_md5_hash_key);
4881
4882	/ Called with rcu_read_lock() /
4883	static enum skb_drop_reason
4884	tcp_inbound_md5_hash(const struct sock sk, const* struct sk_buff *skb,
4885	const void saddr, const* void *daddr,
4886	int family, int l3index, const __u8 *hash_location)
4887	{
4888	/ This gets called for each TCP segment that has TCP-MD5 option.*
4889	* We have 2 drop cases:
4890	* o An MD5 signature is present, but we're not expecting one.
4891	* o The MD5 signature is wrong.
4892	*/
4893	const struct tcp_sock *tp = tcp_sk(sk);
4894	struct tcp_md5sig_key *key;
4895	u8 newhash[`16`];
4896
4897	key = tcp_md5_do_lookup(sk, l3index, addr: saddr, family);
4898	if (!key) {
4899	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
4900	trace_tcp_hash_md5_unexpected(sk, skb);
4901	return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
4902	}
4903
4904	/ Check the signature.*
4905	* To support dual stack listeners, we need to handle
4906	* IPv4-mapped case.
4907	*/
4908	if (family == AF_INET)
4909	tcp_v4_md5_hash_skb(md5_hash: newhash, key, NULL, skb);
4910	else
4911	tp->af_specific->calc_md5_hash(newhash, key, NULL, skb);
4912	if (memcmp(p: hash_location, q: newhash, size: `16`) != `0`) {
4913	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
4914	trace_tcp_hash_md5_mismatch(sk, skb);
4915	return SKB_DROP_REASON_TCP_MD5FAILURE;
4916	}
4917	return SKB_NOT_DROPPED_YET;
4918	}
4919	#else
4920	static inline enum skb_drop_reason
4921	tcp_inbound_md5_hash(const struct sock sk, const* struct sk_buff *skb,
4922	const void saddr, const* void *daddr,
4923	int family, int l3index, const __u8 *hash_location)
4924	{
4925	return SKB_NOT_DROPPED_YET;
4926	}
4927
4928	#endif
4929
4930	/ Called with rcu_read_lock() /
4931	enum skb_drop_reason
4932	tcp_inbound_hash(struct sock sk, const* struct request_sock *req,
4933	const struct sk_buff *skb,
4934	const void saddr, const* void *daddr,
4935	int family, int dif, int sdif)
4936	{
4937	const struct tcphdr *th = tcp_hdr(skb);
4938	const struct tcp_ao_hdr *aoh;
4939	const __u8 *md5_location;
4940	int l3index;
4941
4942	/ Invalid option or two times meet any of auth options /
4943	if (tcp_parse_auth_options(th, md5_hash: &md5_location, aoh: &aoh)) {
4944	trace_tcp_hash_bad_header(sk, skb);
4945	return SKB_DROP_REASON_TCP_AUTH_HDR;
4946	}
4947
4948	if (req) {
4949	if (tcp_rsk_used_ao(req) != !!aoh) {
4950	u8 keyid, rnext, maclen;
4951
4952	if (aoh) {
4953	keyid = aoh->keyid;
4954	rnext = aoh->rnext_keyid;
4955	maclen = tcp_ao_hdr_maclen(aoh);
4956	} else {
4957	keyid = rnext = maclen = `0`;
4958	}
4959
4960	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
4961	trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen);
4962	return SKB_DROP_REASON_TCP_AOFAILURE;
4963	}
4964	}
4965
4966	/ sdif set, means packet ingressed via a device*
4967	* in an L3 domain and dif is set to the l3mdev
4968	*/
4969	l3index = sdif ? dif : `0`;
4970
4971	/ Fast path: unsigned segments /
4972	if (likely(!md5_location && !aoh)) {
4973	/ Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid*
4974	* for the remote peer. On TCP-AO established connection
4975	* the last key is impossible to remove, so there's
4976	* always at least one current_key.
4977	*/
4978	if (tcp_ao_required(sk, saddr, family, l3index, stat_inc: true)) {
4979	trace_tcp_hash_ao_required(sk, skb);
4980	return SKB_DROP_REASON_TCP_AONOTFOUND;
4981	}
4982	if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
4983	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
4984	trace_tcp_hash_md5_required(sk, skb);
4985	return SKB_DROP_REASON_TCP_MD5NOTFOUND;
4986	}
4987	return SKB_NOT_DROPPED_YET;
4988	}
4989
4990	if (aoh)
4991	return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh);
4992
4993	return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
4994	l3index, hash_location: md5_location);
4995	}
4996	EXPORT_IPV6_MOD_GPL(tcp_inbound_hash);
4997
4998	void tcp_done(struct sock *sk)
4999	{
5000	struct request_sock *req;
5001
5002	/ We might be called with a new socket, after*
5003	* inet_csk_prepare_forced_close() has been called
5004	* so we can not use lockdep_sock_is_held(sk)
5005	*/
5006	req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, `1`);
5007
5008	if (sk->sk_state == TCP_SYN_SENT \|\| sk->sk_state == TCP_SYN_RECV)
5009	TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
5010
5011	tcp_set_state(sk, TCP_CLOSE);
5012	tcp_clear_xmit_timers(sk);
5013	if (req)
5014	reqsk_fastopen_remove(sk, req, reset: false);
5015
5016	WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
5017
5018	if (!sock_flag(sk, flag: SOCK_DEAD))
5019	sk->sk_state_change(sk);
5020	else
5021	inet_csk_destroy_sock(sk);
5022	}
5023	EXPORT_SYMBOL_GPL(tcp_done);
5024
5025	int tcp_abort(struct sock sk, int* err)
5026	{
5027	int state = inet_sk_state_load(sk);
5028
5029	if (state == TCP_NEW_SYN_RECV) {
5030	struct request_sock *req = inet_reqsk(sk);
5031
5032	local_bh_disable();
5033	inet_csk_reqsk_queue_drop(sk: req->rsk_listener, req);
5034	local_bh_enable();
5035	return `0`;
5036	}
5037	if (state == TCP_TIME_WAIT) {
5038	struct inet_timewait_sock *tw = inet_twsk(sk);
5039
5040	refcount_inc(r: &tw->tw_refcnt);
5041	local_bh_disable();
5042	inet_twsk_deschedule_put(tw);
5043	local_bh_enable();
5044	return `0`;
5045	}
5046
5047	/ BPF context ensures sock locking. /
5048	if (!has_current_bpf_ctx())
5049	/ Don't race with userspace socket closes such as tcp_close. /
5050	lock_sock(sk);
5051
5052	/ Avoid closing the same socket twice. /
5053	if (sk->sk_state == TCP_CLOSE) {
5054	if (!has_current_bpf_ctx())
5055	release_sock(sk);
5056	return -ENOENT;
5057	}
5058
5059	if (sk->sk_state == TCP_LISTEN) {
5060	tcp_set_state(sk, TCP_CLOSE);
5061	inet_csk_listen_stop(sk);
5062	}
5063
5064	/ Don't race with BH socket closes such as inet_csk_listen_stop. /
5065	local_bh_disable();
5066	bh_lock_sock(sk);
5067
5068	if (tcp_need_reset(state: sk->sk_state))
5069	tcp_send_active_reset(sk, GFP_ATOMIC,
5070	reason: SK_RST_REASON_TCP_STATE);
5071	tcp_done_with_error(sk, err);
5072
5073	bh_unlock_sock(sk);
5074	local_bh_enable();
5075	if (!has_current_bpf_ctx())
5076	release_sock(sk);
5077	return `0`;
5078	}
5079	EXPORT_SYMBOL_GPL(tcp_abort);
5080
5081	extern struct tcp_congestion_ops tcp_reno;
5082
5083	static __initdata unsigned long thash_entries;
5084	static int __init set_thash_entries(char *str)
5085	{
5086	ssize_t ret;
5087
5088	if (!str)
5089	return `0`;
5090
5091	ret = kstrtoul(s: str, base: `0`, res: &thash_entries);
5092	if (ret)
5093	return `0`;
5094
5095	return `1`;
5096	}
5097	__setup("thash_entries=", set_thash_entries);
5098
5099	static void __init tcp_init_mem(void)
5100	{
5101	unsigned long limit = nr_free_buffer_pages() / `16`;
5102
5103	limit = max(limit, `128UL`);
5104	sysctl_tcp_mem[`0`] = limit / `4` * `3`; / 4.68 % /
5105	sysctl_tcp_mem[`1`] = limit; / 6.25 % /
5106	sysctl_tcp_mem[`2`] = sysctl_tcp_mem[`0`] * `2`; / 9.37 % /
5107	}
5108
5109	static void __init tcp_struct_check(void)
5110	{
5111	/ TX read-mostly hotpath cache lines /
5112	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
5113	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
5114	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
5115	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
5116	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
5117	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
5118	#if IS_ENABLED(CONFIG_TLS_DEVICE)
5119	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked);
5120	#endif
5121
5122	/ TXRX read-mostly hotpath cache lines /
5123	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
5124	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd);
5125	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache);
5126	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd);
5127	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out);
5128	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
5129	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
5130	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio);
5131
5132	/ RX read-mostly hotpath cache lines /
5133	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
5134	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
5135	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
5136	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
5137	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out);
5138	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss);
5139	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data);
5140	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost);
5141	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
5142	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
5143	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
5144
5145	/ TX read-write hotpath cache lines /
5146	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
5147	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out);
5148	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent);
5149	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml);
5150	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start);
5151	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat);
5152	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq);
5153	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq);
5154	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
5155	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
5156	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
5157	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
5158	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
5159	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
5160	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
5161	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
5162
5163	/ TXRX read-write hotpath cache lines /
5164	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
5165	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache);
5166	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp);
5167	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
5168	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
5169	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
5170	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp);
5171	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us);
5172	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out);
5173	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
5174	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
5175	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
5176	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
5177	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
5178	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
5179	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
5180	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp);
5181	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
5182
5183	/ RX read-write hotpath cache lines /
5184	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
5185	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
5186	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in);
5187	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup);
5188	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out);
5189	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq);
5190	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
5191	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
5192	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
5193	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
5194	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
5195	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
5196	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
5197	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
5198	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
5199	}
5200
5201	void __init tcp_init(void)
5202	{
5203	int max_rshare, max_wshare, cnt;
5204	unsigned long limit;
5205	unsigned int i;
5206
5207	BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
5208	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
5209	sizeof_field(struct sk_buff, cb));
5210
5211	tcp_struct_check();
5212
5213	percpu_counter_init(&tcp_sockets_allocated, `0`, GFP_KERNEL);
5214
5215	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
5216	mod_timer(timer: &tcp_orphan_timer, expires: jiffies + TCP_ORPHAN_TIMER_PERIOD);
5217
5218	inet_hashinfo2_init(h: &tcp_hashinfo, name: "tcp_listen_portaddr_hash",
5219	numentries: thash_entries, scale: `21`, / one slot per 2 MB/
5220	low_limit: `0`, high_limit: `64` * `1024`);
5221	tcp_hashinfo.bind_bucket_cachep =
5222	kmem_cache_create("tcp_bind_bucket",
5223	sizeof(struct inet_bind_bucket), `0`,
5224	SLAB_HWCACHE_ALIGN \| SLAB_PANIC \|
5225	SLAB_ACCOUNT,
5226	NULL);
5227	tcp_hashinfo.bind2_bucket_cachep =
5228	kmem_cache_create("tcp_bind2_bucket",
5229	sizeof(struct inet_bind2_bucket), `0`,
5230	SLAB_HWCACHE_ALIGN \| SLAB_PANIC \|
5231	SLAB_ACCOUNT,
5232	NULL);
5233
5234	/ Size and allocate the main established and bind bucket*
5235	* hash tables.
5236	*
5237	* The methodology is similar to that of the buffer cache.
5238	*/
5239	tcp_hashinfo.ehash =
5240	alloc_large_system_hash(tablename: "TCP established",
5241	bucketsize: sizeof(struct inet_ehash_bucket),
5242	numentries: thash_entries,
5243	scale: `17`, / one slot per 128 KB of memory /
5244	flags: `0`,
5245	NULL,
5246	hash_mask: &tcp_hashinfo.ehash_mask,
5247	low_limit: `0`,
5248	high_limit: thash_entries ? `0` : `512` * `1024`);
5249	for (i = `0`; i <= tcp_hashinfo.ehash_mask; i++)
5250	INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
5251
5252	if (inet_ehash_locks_alloc(hashinfo: &tcp_hashinfo))
5253	panic(fmt: "TCP: failed to alloc ehash_locks");
5254	tcp_hashinfo.bhash =
5255	alloc_large_system_hash(tablename: "TCP bind",
5256	bucketsize: `2` * sizeof(struct inet_bind_hashbucket),
5257	numentries: tcp_hashinfo.ehash_mask + `1`,
5258	scale: `17`, / one slot per 128 KB of memory /
5259	flags: `0`,
5260	hash_shift: &tcp_hashinfo.bhash_size,
5261	NULL,
5262	low_limit: `0`,
5263	high_limit: `64` * `1024`);
5264	tcp_hashinfo.bhash_size = `1U` << tcp_hashinfo.bhash_size;
5265	tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
5266	for (i = `0`; i < tcp_hashinfo.bhash_size; i++) {
5267	spin_lock_init(&tcp_hashinfo.bhash[i].lock);
5268	INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
5269	spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
5270	INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
5271	}
5272
5273	tcp_hashinfo.pernet = false;
5274
5275	cnt = tcp_hashinfo.ehash_mask + `1`;
5276	sysctl_tcp_max_orphans = cnt / `2`;
5277
5278	tcp_init_mem();
5279	/ Set per-socket limits to no more than 1/128 the pressure threshold /
5280	limit = nr_free_buffer_pages() << (PAGE_SHIFT - `7`);
5281	max_wshare = min(`4UL``1024``1024`, limit);
5282	max_rshare = min(`32UL``1024``1024`, limit);
5283
5284	init_net.ipv4.sysctl_tcp_wmem[`0`] = PAGE_SIZE;
5285	init_net.ipv4.sysctl_tcp_wmem[`1`] = `16`*`1024`;
5286	init_net.ipv4.sysctl_tcp_wmem[`2`] = max(`64`*`1024`, max_wshare);
5287
5288	init_net.ipv4.sysctl_tcp_rmem[`0`] = PAGE_SIZE;
5289	init_net.ipv4.sysctl_tcp_rmem[`1`] = `131072`;
5290	init_net.ipv4.sysctl_tcp_rmem[`2`] = max(`131072`, max_rshare);
5291
5292	pr_info("Hash tables configured (established %u bind %u)\n",
5293	tcp_hashinfo.ehash_mask + `1`, tcp_hashinfo.bhash_size);
5294
5295	tcp_v4_init();
5296	tcp_metrics_init();
5297	BUG_ON(tcp_register_congestion_control(&tcp_reno) != `0`);
5298	tcp_tsq_work_init();
5299	mptcp_init();
5300	}
5301

source code of linux/net/ipv4/tcp.c