tcp_output.c source code [linux/net/ipv4/tcp_output.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* INET An implementation of the TCP/IP protocol suite for the LINUX
4	* operating system. INET is implemented using the BSD Socket
5	* interface as the means of communication with the user level.
6	*
7	* Implementation of the Transmission Control Protocol(TCP).
8	*
9	* Authors: Ross Biro
10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
13	* Florian La Roche, <flla@stud.uni-sb.de>
14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
17	* Matthew Dillon, <dillon@apollo.west.oic.com>
18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
19	* Jorge Cwik, <jorge@laser.satlink.net>
20	*/
21
22	/*
23	* Changes: Pedro Roque : Retransmit queue handled by TCP.
24	* : Fragmentation on mtu decrease
25	* : Segment collapse on retransmit
26	* : AF independence
27	*
28	* Linus Torvalds : send_delayed_ack
29	* David S. Miller : Charge memory using the right skb
30	* during syn/ack processing.
31	* David S. Miller : Output engine completely rewritten.
32	* Andrea Arcangeli: SYNACK carry ts_recent in tsecr.
33	* Cacophonix Gaul : draft-minshall-nagle-01
34	* J Hadi Salim : ECN support
35	*
36	*/
37
38	#define pr_fmt(fmt) "TCP: " fmt
39
40	#include <net/tcp.h>
41	#include <net/mptcp.h>
42
43	#include <linux/compiler.h>
44	#include <linux/gfp.h>
45	#include <linux/module.h>
46	#include <linux/static_key.h>
47
48	#include <trace/events/tcp.h>
49
50	/ Refresh clocks of a TCP socket,*
51	* ensuring monotically increasing values.
52	*/
53	void tcp_mstamp_refresh(struct tcp_sock *tp)
54	{
55	u64 val = tcp_clock_ns();
56
57	tp->tcp_clock_cache = val;
58	tp->tcp_mstamp = div_u64(dividend: val, NSEC_PER_USEC);
59	}
60
61	static bool tcp_write_xmit(struct sock sk, unsigned* int mss_now, int nonagle,
62	int push_one, gfp_t gfp);
63
64	/ Account for new data that has been sent to the network. /
65	static void tcp_event_new_data_sent(struct sock sk, struct* sk_buff *skb)
66	{
67	struct inet_connection_sock *icsk = inet_csk(sk);
68	struct tcp_sock *tp = tcp_sk(sk);
69	unsigned int prior_packets = tp->packets_out;
70
71	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
72
73	__skb_unlink(skb, list: &sk->sk_write_queue);
74	tcp_rbtree_insert(root: &sk->tcp_rtx_queue, skb);
75
76	if (tp->highest_sack == NULL)
77	tp->highest_sack = skb;
78
79	tp->packets_out += tcp_skb_pcount(skb);
80	if (!prior_packets \|\| icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
81	tcp_rearm_rto(sk);
82
83	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
84	tcp_skb_pcount(skb));
85	tcp_check_space(sk);
86	}
87
88	/ SND.NXT, if window was not shrunk or the amount of shrunk was less than one*
89	* window scaling factor due to loss of precision.
90	* If window has been shrunk, what should we make? It is not clear at all.
91	* Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
92	* Anything in between SND.UNA...SND.UNA+SND.WND also can be already
93	* invalid. OK, let's make this for now:
94	*/
95	static inline __u32 tcp_acceptable_seq(const struct sock *sk)
96	{
97	const struct tcp_sock *tp = tcp_sk(sk);
98
99	if (!before(seq1: tcp_wnd_end(tp), seq2: tp->snd_nxt) \|\|
100	(tp->rx_opt.wscale_ok &&
101	((tp->snd_nxt - tcp_wnd_end(tp)) < (`1` << tp->rx_opt.rcv_wscale))))
102	return tp->snd_nxt;
103	else
104	return tcp_wnd_end(tp);
105	}
106
107	/ Calculate mss to advertise in SYN segment.*
108	* RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
109	*
110	* 1. It is independent of path mtu.
111	* 2. Ideally, it is maximal possible segment size i.e. 65535-40.
112	* 3. For IPv4 it is reasonable to calculate it from maximal MTU of
113	* attached devices, because some buggy hosts are confused by
114	* large MSS.
115	* 4. We do not make 3, we advertise MSS, calculated from first
116	* hop device mtu, but allow to raise it to ip_rt_min_advmss.
117	* This may be overridden via information stored in routing table.
118	* 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
119	* probably even Jumbo".
120	*/
121	static __u16 tcp_advertise_mss(struct sock *sk)
122	{
123	struct tcp_sock *tp = tcp_sk(sk);
124	const struct dst_entry *dst = __sk_dst_get(sk);
125	int mss = tp->advmss;
126
127	if (dst) {
128	unsigned int metric = dst_metric_advmss(dst);
129
130	if (metric < mss) {
131	mss = metric;
132	tp->advmss = mss;
133	}
134	}
135
136	return (__u16)mss;
137	}
138
139	/ RFC2861. Reset CWND after idle period longer RTO to "restart window".*
140	* This is the first part of cwnd validation mechanism.
141	*/
142	void tcp_cwnd_restart(struct sock *sk, s32 delta)
143	{
144	struct tcp_sock *tp = tcp_sk(sk);
145	u32 restart_cwnd = tcp_init_cwnd(tp, dst: __sk_dst_get(sk));
146	u32 cwnd = tcp_snd_cwnd(tp);
147
148	tcp_ca_event(sk, event: CA_EVENT_CWND_RESTART);
149
150	tp->snd_ssthresh = tcp_current_ssthresh(sk);
151	restart_cwnd = min(restart_cwnd, cwnd);
152
153	while ((delta -= inet_csk(sk)->icsk_rto) > `0` && cwnd > restart_cwnd)
154	cwnd >>= `1`;
155	tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));
156	tp->snd_cwnd_stamp = tcp_jiffies32;
157	tp->snd_cwnd_used = `0`;
158	}
159
160	/ Congestion state accounting after a packet has been sent. /
161	static void tcp_event_data_sent(struct tcp_sock *tp,
162	struct sock *sk)
163	{
164	struct inet_connection_sock *icsk = inet_csk(sk);
165	const u32 now = tcp_jiffies32;
166
167	if (tcp_packets_in_flight(tp) == `0`)
168	tcp_ca_event(sk, event: CA_EVENT_TX_START);
169
170	tp->lsndtime = now;
171
172	/ If it is a reply for ato after last received*
173	* packet, increase pingpong count.
174	*/
175	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
176	inet_csk_inc_pingpong_cnt(sk);
177	}
178
179	/ Account for an ACK we sent. /
180	static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
181	{
182	struct tcp_sock *tp = tcp_sk(sk);
183
184	if (unlikely(tp->compressed_ack)) {
185	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
186	tp->compressed_ack);
187	tp->compressed_ack = `0`;
188	if (hrtimer_try_to_cancel(timer: &tp->compressed_ack_timer) == `1`)
189	__sock_put(sk);
190	}
191
192	if (unlikely(rcv_nxt != tp->rcv_nxt))
193	return; / Special ACK sent by DCTCP to reflect ECN /
194	tcp_dec_quickack_mode(sk);
195	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
196	}
197
198	/ Determine a window scaling and initial window to offer.*
199	* Based on the assumption that the given amount of space
200	* will be offered. Store the results in the tp structure.
201	* NOTE: for smooth operation initial space offering should
202	* be a multiple of mss if possible. We assume here that mss >= 1.
203	* This MUST be enforced by all callers.
204	*/
205	void tcp_select_initial_window(const struct sock sk, int* __space, __u32 mss,
206	__u32 rcv_wnd, __u32 window_clamp,
207	int wscale_ok, __u8 *rcv_wscale,
208	__u32 init_rcv_wnd)
209	{
210	unsigned int space = (__space < `0` ? `0` : __space);
211
212	/ If no clamp set the clamp to the max possible scaled window /
213	if (*window_clamp == `0`)
214	(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
215	space = min(*window_clamp, space);
216
217	/ Quantize space offering to a multiple of mss if possible. /
218	if (space > mss)
219	space = rounddown(space, mss);
220
221	/ NOTE: offering an initial window larger than 32767*
222	* will break some buggy TCP stacks. If the admin tells us
223	* it is likely we could be speaking with such a buggy stack
224	* we will truncate our initial window offering to 32K-1
225	* unless the remote has sent us a window scaling option,
226	* which we interpret as a sign the remote TCP is not
227	* misinterpreting the window field as a signed quantity.
228	*/
229	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
230	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
231	else
232	(*rcv_wnd) = min_t(u32, space, U16_MAX);
233
234	if (init_rcv_wnd)
235	rcv_wnd = min(rcv_wnd, init_rcv_wnd * mss);
236
237	*rcv_wscale = `0`;
238	if (wscale_ok) {
239	/ Set window scaling on max possible window /
240	space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`2`]));
241	space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
242	space = min_t(u32, space, *window_clamp);
243	rcv_wscale = clamp_t(int*, ilog2(space) - `15`,
244	`0`, TCP_MAX_WSCALE);
245	}
246	/ Set the clamp no higher than max representable value /
247	(window_clamp) = min_t(__u32, U16_MAX << (rcv_wscale), *window_clamp);
248	}
249	EXPORT_SYMBOL(tcp_select_initial_window);
250
251	/ Chose a new window to advertise, update state in tcp_sock for the*
252	* socket, and return result with RFC1323 scaling applied. The return
253	* value can be stuffed directly into th->window for an outgoing
254	* frame.
255	*/
256	static u16 tcp_select_window(struct sock *sk)
257	{
258	struct tcp_sock *tp = tcp_sk(sk);
259	struct net *net = sock_net(sk);
260	u32 old_win = tp->rcv_wnd;
261	u32 cur_win, new_win;
262
263	/ Make the window 0 if we failed to queue the data because we*
264	* are out of memory. The window is temporary, so we don't store
265	* it on the socket.
266	*/
267	if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM))
268	return `0`;
269
270	cur_win = tcp_receive_window(tp);
271	new_win = __tcp_select_window(sk);
272	if (new_win < cur_win) {
273	/ Danger Will Robinson!*
274	* Don't update rcv_wup/rcv_wnd here or else
275	* we will not be able to advertise a zero
276	* window in time. --DaveM
277	*
278	* Relax Will Robinson.
279	*/
280	if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) \|\| !tp->rx_opt.rcv_wscale) {
281	/ Never shrink the offered window /
282	if (new_win == `0`)
283	NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
284	new_win = ALIGN(cur_win, `1` << tp->rx_opt.rcv_wscale);
285	}
286	}
287
288	tp->rcv_wnd = new_win;
289	tp->rcv_wup = tp->rcv_nxt;
290
291	/ Make sure we do not exceed the maximum possible*
292	* scaled window.
293	*/
294	if (!tp->rx_opt.rcv_wscale &&
295	READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
296	new_win = min(new_win, MAX_TCP_WINDOW);
297	else
298	new_win = min(new_win, (`65535U` << tp->rx_opt.rcv_wscale));
299
300	/ RFC1323 scaling applied /
301	new_win >>= tp->rx_opt.rcv_wscale;
302
303	/ If we advertise zero window, disable fast path. /
304	if (new_win == `0`) {
305	tp->pred_flags = `0`;
306	if (old_win)
307	NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
308	} else if (old_win == `0`) {
309	NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
310	}
311
312	return new_win;
313	}
314
315	/ Packet ECN state for a SYN-ACK /
316	static void tcp_ecn_send_synack(struct sock sk, struct* sk_buff *skb)
317	{
318	const struct tcp_sock *tp = tcp_sk(sk);
319
320	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
321	if (!(tp->ecn_flags & TCP_ECN_OK))
322	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
323	else if (tcp_ca_needs_ecn(sk) \|\|
324	tcp_bpf_ca_needs_ecn(sk))
325	INET_ECN_xmit(sk);
326	}
327
328	/ Packet ECN state for a SYN. /
329	static void tcp_ecn_send_syn(struct sock sk, struct* sk_buff *skb)
330	{
331	struct tcp_sock *tp = tcp_sk(sk);
332	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
333	bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == `1` \|\|
334	tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn;
335
336	if (!use_ecn) {
337	const struct dst_entry *dst = __sk_dst_get(sk);
338
339	if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
340	use_ecn = true;
341	}
342
343	tp->ecn_flags = `0`;
344
345	if (use_ecn) {
346	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ECE \| TCPHDR_CWR;
347	tp->ecn_flags = TCP_ECN_OK;
348	if (tcp_ca_needs_ecn(sk) \|\| bpf_needs_ecn)
349	INET_ECN_xmit(sk);
350	}
351	}
352
353	static void tcp_ecn_clear_syn(struct sock sk, struct* sk_buff *skb)
354	{
355	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
356	/ tp->ecn_flags are cleared at a later point in time when*
357	* SYN ACK is ultimatively being received.
358	*/
359	TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE \| TCPHDR_CWR);
360	}
361
362	static void
363	tcp_ecn_make_synack(const struct request_sock req, struct* tcphdr *th)
364	{
365	if (inet_rsk(sk: req)->ecn_ok)
366	th->ece = `1`;
367	}
368
369	/ Set up ECN state for a packet on a ESTABLISHED socket that is about to*
370	* be sent.
371	*/
372	static void tcp_ecn_send(struct sock sk, struct* sk_buff *skb,
373	struct tcphdr th, int* tcp_header_len)
374	{
375	struct tcp_sock *tp = tcp_sk(sk);
376
377	if (tp->ecn_flags & TCP_ECN_OK) {
378	/ Not-retransmitted data segment: set ECT and inject CWR. /
379	if (skb->len != tcp_header_len &&
380	!before(TCP_SKB_CB(skb)->seq, seq2: tp->snd_nxt)) {
381	INET_ECN_xmit(sk);
382	if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
383	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
384	th->cwr = `1`;
385	skb_shinfo(skb)->gso_type \|= SKB_GSO_TCP_ECN;
386	}
387	} else if (!tcp_ca_needs_ecn(sk)) {
388	/ ACK or retransmitted segment: clear ECT\|CE /
389	INET_ECN_dontxmit(sk);
390	}
391	if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
392	th->ece = `1`;
393	}
394	}
395
396	/ Constructs common control bits of non-data skb. If SYN/FIN is present,*
397	* auto increment end seqno.
398	*/
399	static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
400	{
401	skb->ip_summed = CHECKSUM_PARTIAL;
402
403	TCP_SKB_CB(skb)->tcp_flags = flags;
404
405	tcp_skb_pcount_set(skb, segs: `1`);
406
407	TCP_SKB_CB(skb)->seq = seq;
408	if (flags & (TCPHDR_SYN \| TCPHDR_FIN))
409	seq++;
410	TCP_SKB_CB(skb)->end_seq = seq;
411	}
412
413	static inline bool tcp_urg_mode(const struct tcp_sock *tp)
414	{
415	return tp->snd_una != tp->snd_up;
416	}
417
/* Bit flags stored in tcp_out_options.options; each one marks a TCP
 * option that was selected for the outgoing segment.
 */
#define OPTION_SACK_ADVERTISE	BIT(0)
#define OPTION_TS		BIT(1)
#define OPTION_MD5		BIT(2)
#define OPTION_WSCALE		BIT(3)
#define OPTION_FAST_OPEN_COOKIE	BIT(8)
#define OPTION_SMC		BIT(9)
#define OPTION_MPTCP		BIT(10)
#define OPTION_AO		BIT(11)
426
427	static void smc_options_write(__be32 ptr, u16 options)
428	{
429	#if IS_ENABLED(CONFIG_SMC)
430	if (static_branch_unlikely(&tcp_have_smc)) {
431	if (unlikely(OPTION_SMC & *options)) {
432	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
433	(TCPOPT_NOP << `16`) \|
434	(TCPOPT_EXP << `8`) \|
435	(TCPOLEN_EXP_SMC_BASE));
436	*ptr++ = htonl(TCPOPT_SMC_MAGIC);
437	}
438	}
439	#endif
440	}
441
442	struct tcp_out_options {
443	u16 options; / bit field of OPTION_* /
444	u16 mss; / 0 to disable /
445	u8 ws; / window scale, 0 to disable /
446	u8 num_sack_blocks; / number of SACK blocks to include /
447	u8 hash_size; / bytes in hash_location /
448	u8 bpf_opt_len; / length of BPF hdr option /
449	__u8 hash_location; /* temporary pointer, overloaded /
450	__u32 tsval, tsecr; / need to include OPTION_TS /
451	struct tcp_fastopen_cookie fastopen_cookie; /* Fast open cookie /
452	struct mptcp_out_options mptcp;
453	};
454
455	static void mptcp_options_write(struct tcphdr th, __be32 ptr,
456	struct tcp_sock *tp,
457	struct tcp_out_options *opts)
458	{
459	#if IS_ENABLED(CONFIG_MPTCP)
460	if (unlikely(OPTION_MPTCP & opts->options))
461	mptcp_write_options(th, ptr, tp, opts: &opts->mptcp);
462	#endif
463	}
464
465	#ifdef CONFIG_CGROUP_BPF
466	static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
467	enum tcp_synack_type synack_type)
468	{
469	if (unlikely(!skb))
470	return BPF_WRITE_HDR_TCP_CURRENT_MSS;
471
472	if (unlikely(synack_type == TCP_SYNACK_COOKIE))
473	return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
474
475	return `0`;
476	}
477
478	/ req, syn_skb and synack_type are used when writing synack /
479	static void bpf_skops_hdr_opt_len(struct sock sk, struct* sk_buff *skb,
480	struct request_sock *req,
481	struct sk_buff *syn_skb,
482	enum tcp_synack_type synack_type,
483	struct tcp_out_options *opts,
484	unsigned int *remaining)
485	{
486	struct bpf_sock_ops_kern sock_ops;
487	int err;
488
489	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
490	BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) \|\|
491	!*remaining)
492	return;
493
494	/ remaining has already been aligned to 4 bytes, so remaining >= 4 /
495
496	/ init sock_ops /
497	memset(&sock_ops, `0`, offsetof(struct bpf_sock_ops_kern, temp));
498
499	sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
500
501	if (req) {
502	/ The listen "sk" cannot be passed here because*
503	* it is not locked. It would not make too much
504	* sense to do bpf_setsockopt(listen_sk) based
505	* on individual connection request also.
506	*
507	* Thus, "req" is passed here and the cgroup-bpf-progs
508	* of the listen "sk" will be run.
509	*
510	* "req" is also used here for fastopen even the "sk" here is
511	* a fullsock "child" sk. It is to keep the behavior
512	* consistent between fastopen and non-fastopen on
513	* the bpf programming side.
514	*/
515	sock_ops.sk = (struct sock *)req;
516	sock_ops.syn_skb = syn_skb;
517	} else {
518	sock_owned_by_me(sk);
519
520	sock_ops.is_fullsock = `1`;
521	sock_ops.sk = sk;
522	}
523
524	sock_ops.args[`0`] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
525	sock_ops.remaining_opt_len = *remaining;
526	/ tcp_current_mss() does not pass a skb /
527	if (skb)
528	bpf_skops_init_skb(skops: &sock_ops, skb, end_offset: `0`);
529
530	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
531
532	if (err \|\| sock_ops.remaining_opt_len == *remaining)
533	return;
534
535	opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
536	/ round up to 4 bytes /
537	opts->bpf_opt_len = (opts->bpf_opt_len + `3`) & ~`3`;
538
539	*remaining -= opts->bpf_opt_len;
540	}
541
542	static void bpf_skops_write_hdr_opt(struct sock sk, struct* sk_buff *skb,
543	struct request_sock *req,
544	struct sk_buff *syn_skb,
545	enum tcp_synack_type synack_type,
546	struct tcp_out_options *opts)
547	{
548	u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
549	struct bpf_sock_ops_kern sock_ops;
550	int err;
551
552	if (likely(!max_opt_len))
553	return;
554
555	memset(&sock_ops, `0`, offsetof(struct bpf_sock_ops_kern, temp));
556
557	sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
558
559	if (req) {
560	sock_ops.sk = (struct sock *)req;
561	sock_ops.syn_skb = syn_skb;
562	} else {
563	sock_owned_by_me(sk);
564
565	sock_ops.is_fullsock = `1`;
566	sock_ops.sk = sk;
567	}
568
569	sock_ops.args[`0`] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
570	sock_ops.remaining_opt_len = max_opt_len;
571	first_opt_off = tcp_hdrlen(skb) - max_opt_len;
572	bpf_skops_init_skb(skops: &sock_ops, skb, end_offset: first_opt_off);
573
574	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
575
576	if (err)
577	nr_written = `0`;
578	else
579	nr_written = max_opt_len - sock_ops.remaining_opt_len;
580
581	if (nr_written < max_opt_len)
582	memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
583	max_opt_len - nr_written);
584	}
585	#else
586	static void bpf_skops_hdr_opt_len(struct sock sk, struct* sk_buff *skb,
587	struct request_sock *req,
588	struct sk_buff *syn_skb,
589	enum tcp_synack_type synack_type,
590	struct tcp_out_options *opts,
591	unsigned int *remaining)
592	{
593	}
594
595	static void bpf_skops_write_hdr_opt(struct sock sk, struct* sk_buff *skb,
596	struct request_sock *req,
597	struct sk_buff *syn_skb,
598	enum tcp_synack_type synack_type,
599	struct tcp_out_options *opts)
600	{
601	}
602	#endif
603
604	static __be32 process_tcp_ao_options(struct* tcp_sock *tp,
605	const struct tcp_request_sock *tcprsk,
606	struct tcp_out_options *opts,
607	struct tcp_key key, __be32 ptr)
608	{
609	#ifdef CONFIG_TCP_AO
610	u8 maclen = tcp_ao_maclen(key: key->ao_key);
611
612	if (tcprsk) {
613	u8 aolen = maclen + sizeof(struct tcp_ao_hdr);
614
615	*ptr++ = htonl((TCPOPT_AO << `24`) \| (aolen << `16`) \|
616	(tcprsk->ao_keyid << `8`) \|
617	(tcprsk->ao_rcv_next));
618	} else {
619	struct tcp_ao_key *rnext_key;
620	struct tcp_ao_info *ao_info;
621
622	ao_info = rcu_dereference_check(tp->ao_info,
623	lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
624	rnext_key = READ_ONCE(ao_info->rnext_key);
625	if (WARN_ON_ONCE(!rnext_key))
626	return ptr;
627	*ptr++ = htonl((TCPOPT_AO << `24`) \|
628	(tcp_ao_len(key->ao_key) << `16`) \|
629	(key->ao_key->sndid << `8`) \|
630	(rnext_key->rcvid));
631	}
632	opts->hash_location = (__u8 *)ptr;
633	ptr += maclen / sizeof(*ptr);
634	if (unlikely(maclen % sizeof(*ptr))) {
635	memset(ptr, TCPOPT_NOP, sizeof(*ptr));
636	ptr++;
637	}
638	#endif
639	return ptr;
640	}
641
642	/ Write previously computed TCP options to the packet.*
643	*
644	* Beware: Something in the Internet is very sensitive to the ordering of
645	* TCP options, we learned this through the hard way, so be careful here.
646	* Luckily we can at least blame others for their non-compliance but from
647	* inter-operability perspective it seems that we're somewhat stuck with
648	* the ordering which we have been using if we want to keep working with
649	* those broken things (not that it currently hurts anybody as there isn't
650	* particular reason why the ordering would need to be changed).
651	*
652	* At least SACK_PERM as the first option is known to lead to a disaster
653	* (but it may well be that other scenarios fail similarly).
654	*/
655	static void tcp_options_write(struct tcphdr th, struct* tcp_sock *tp,
656	const struct tcp_request_sock *tcprsk,
657	struct tcp_out_options *opts,
658	struct tcp_key *key)
659	{
660	__be32 ptr = (__be32 )(th + `1`);
661	u16 options = opts->options; / mungable copy /
662
663	if (tcp_key_is_md5(key)) {
664	*ptr++ = htonl((TCPOPT_NOP << `24`) \| (TCPOPT_NOP << `16`) \|
665	(TCPOPT_MD5SIG << `8`) \| TCPOLEN_MD5SIG);
666	/ overload cookie hash location /
667	opts->hash_location = (__u8 *)ptr;
668	ptr += `4`;
669	} else if (tcp_key_is_ao(key)) {
670	ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr);
671	}
672	if (unlikely(opts->mss)) {
673	*ptr++ = htonl((TCPOPT_MSS << `24`) \|
674	(TCPOLEN_MSS << `16`) \|
675	opts->mss);
676	}
677
678	if (likely(OPTION_TS & options)) {
679	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
680	*ptr++ = htonl((TCPOPT_SACK_PERM << `24`) \|
681	(TCPOLEN_SACK_PERM << `16`) \|
682	(TCPOPT_TIMESTAMP << `8`) \|
683	TCPOLEN_TIMESTAMP);
684	options &= ~OPTION_SACK_ADVERTISE;
685	} else {
686	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
687	(TCPOPT_NOP << `16`) \|
688	(TCPOPT_TIMESTAMP << `8`) \|
689	TCPOLEN_TIMESTAMP);
690	}
691	*ptr++ = htonl(opts->tsval);
692	*ptr++ = htonl(opts->tsecr);
693	}
694
695	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
696	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
697	(TCPOPT_NOP << `16`) \|
698	(TCPOPT_SACK_PERM << `8`) \|
699	TCPOLEN_SACK_PERM);
700	}
701
702	if (unlikely(OPTION_WSCALE & options)) {
703	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
704	(TCPOPT_WINDOW << `16`) \|
705	(TCPOLEN_WINDOW << `8`) \|
706	opts->ws);
707	}
708
709	if (unlikely(opts->num_sack_blocks)) {
710	struct tcp_sack_block *sp = tp->rx_opt.dsack ?
711	tp->duplicate_sack : tp->selective_acks;
712	int this_sack;
713
714	*ptr++ = htonl((TCPOPT_NOP << `24`) \|
715	(TCPOPT_NOP << `16`) \|
716	(TCPOPT_SACK << `8`) \|
717	(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
718	TCPOLEN_SACK_PERBLOCK)));
719
720	for (this_sack = `0`; this_sack < opts->num_sack_blocks;
721	++this_sack) {
722	*ptr++ = htonl(sp[this_sack].start_seq);
723	*ptr++ = htonl(sp[this_sack].end_seq);
724	}
725
726	tp->rx_opt.dsack = `0`;
727	}
728
729	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
730	struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
731	u8 p = (u8 )ptr;
732	u32 len; / Fast Open option length /
733
734	if (foc->exp) {
735	len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
736	*ptr = htonl((TCPOPT_EXP << `24`) \| (len << `16`) \|
737	TCPOPT_FASTOPEN_MAGIC);
738	p += TCPOLEN_EXP_FASTOPEN_BASE;
739	} else {
740	len = TCPOLEN_FASTOPEN_BASE + foc->len;
741	*p++ = TCPOPT_FASTOPEN;
742	*p++ = len;
743	}
744
745	memcpy(p, foc->val, foc->len);
746	if ((len & `3`) == `2`) {
747	p[foc->len] = TCPOPT_NOP;
748	p[foc->len + `1`] = TCPOPT_NOP;
749	}
750	ptr += (len + `3`) >> `2`;
751	}
752
753	smc_options_write(ptr, options: &options);
754
755	mptcp_options_write(th, ptr, tp, opts);
756	}
757
758	static void smc_set_option(const struct tcp_sock *tp,
759	struct tcp_out_options *opts,
760	unsigned int *remaining)
761	{
762	#if IS_ENABLED(CONFIG_SMC)
763	if (static_branch_unlikely(&tcp_have_smc)) {
764	if (tp->syn_smc) {
765	if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
766	opts->options \|= OPTION_SMC;
767	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
768	}
769	}
770	}
771	#endif
772	}
773
774	static void smc_set_option_cond(const struct tcp_sock *tp,
775	const struct inet_request_sock *ireq,
776	struct tcp_out_options *opts,
777	unsigned int *remaining)
778	{
779	#if IS_ENABLED(CONFIG_SMC)
780	if (static_branch_unlikely(&tcp_have_smc)) {
781	if (tp->syn_smc && ireq->smc_ok) {
782	if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
783	opts->options \|= OPTION_SMC;
784	*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
785	}
786	}
787	}
788	#endif
789	}
790
791	static void mptcp_set_option_cond(const struct request_sock *req,
792	struct tcp_out_options *opts,
793	unsigned int *remaining)
794	{
795	if (rsk_is_mptcp(req)) {
796	unsigned int size;
797
798	if (mptcp_synack_options(req, size: &size, opts: &opts->mptcp)) {
799	if (*remaining >= size) {
800	opts->options \|= OPTION_MPTCP;
801	*remaining -= size;
802	}
803	}
804	}
805	}
806
807	/ Compute TCP options for SYN packets. This is not the final*
808	* network wire format yet.
809	*/
810	static unsigned int tcp_syn_options(struct sock sk, struct* sk_buff *skb,
811	struct tcp_out_options *opts,
812	struct tcp_key *key)
813	{
814	struct tcp_sock *tp = tcp_sk(sk);
815	unsigned int remaining = MAX_TCP_OPTION_SPACE;
816	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
817	bool timestamps;
818
819	/ Better than switch (key.type) as it has static branches /
820	if (tcp_key_is_md5(key)) {
821	timestamps = false;
822	opts->options \|= OPTION_MD5;
823	remaining -= TCPOLEN_MD5SIG_ALIGNED;
824	} else {
825	timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
826	if (tcp_key_is_ao(key)) {
827	opts->options \|= OPTION_AO;
828	remaining -= tcp_ao_len_aligned(key: key->ao_key);
829	}
830	}
831
832	/ We always get an MSS option. The option bytes which will be seen in*
833	* normal data packets should timestamps be used, must be in the MSS
834	* advertised. But we subtract them from tp->mss_cache so that
835	* calculations in tcp_sendmsg are simpler etc. So account for this
836	* fact here if necessary. If we don't do this correctly, as a
837	* receiver we won't recognize data packets as being full sized when we
838	* should, and thus we won't abide by the delayed ACK rules correctly.
839	* SACKs don't matter, we never delay an ACK when we have any of those
840	* going out. */
841	opts->mss = tcp_advertise_mss(sk);
842	remaining -= TCPOLEN_MSS_ALIGNED;
843
844	if (likely(timestamps)) {
845	opts->options \|= OPTION_TS;
846	opts->tsval = tcp_skb_timestamp_ts(usec_ts: tp->tcp_usec_ts, skb) + tp->tsoffset;
847	opts->tsecr = tp->rx_opt.ts_recent;
848	remaining -= TCPOLEN_TSTAMP_ALIGNED;
849	}
850	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
851	opts->ws = tp->rx_opt.rcv_wscale;
852	opts->options \|= OPTION_WSCALE;
853	remaining -= TCPOLEN_WSCALE_ALIGNED;
854	}
855	if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
856	opts->options \|= OPTION_SACK_ADVERTISE;
857	if (unlikely(!(OPTION_TS & opts->options)))
858	remaining -= TCPOLEN_SACKPERM_ALIGNED;
859	}
860
861	if (fastopen && fastopen->cookie.len >= `0`) {
862	u32 need = fastopen->cookie.len;
863
864	need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
865	TCPOLEN_FASTOPEN_BASE;
866	need = (need + `3`) & ~`3U`; / Align to 32 bits /
867	if (remaining >= need) {
868	opts->options \|= OPTION_FAST_OPEN_COOKIE;
869	opts->fastopen_cookie = &fastopen->cookie;
870	remaining -= need;
871	tp->syn_fastopen = `1`;
872	tp->syn_fastopen_exp = fastopen->cookie.exp ? `1` : `0`;
873	}
874	}
875
876	smc_set_option(tp, opts, remaining: &remaining);
877
878	if (sk_is_mptcp(sk)) {
879	unsigned int size;
880
881	if (mptcp_syn_options(sk, skb, size: &size, opts: &opts->mptcp)) {
882	opts->options \|= OPTION_MPTCP;
883	remaining -= size;
884	}
885	}
886
887	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, synack_type: `0`, opts, remaining: &remaining);
888
889	return MAX_TCP_OPTION_SPACE - remaining;
890	}
891
892	/ Set up TCP options for SYN-ACKs. /
893	static unsigned int tcp_synack_options(const struct sock *sk,
894	struct request_sock *req,
895	unsigned int mss, struct sk_buff *skb,
896	struct tcp_out_options *opts,
897	const struct tcp_key *key,
898	struct tcp_fastopen_cookie *foc,
899	enum tcp_synack_type synack_type,
900	struct sk_buff *syn_skb)
901	{
902	struct inet_request_sock *ireq = inet_rsk(sk: req);
903	unsigned int remaining = MAX_TCP_OPTION_SPACE;
904
905	if (tcp_key_is_md5(key)) {
906	opts->options \|= OPTION_MD5;
907	remaining -= TCPOLEN_MD5SIG_ALIGNED;
908
909	/ We can't fit any SACK blocks in a packet with MD5 + TS*
910	* options. There was discussion about disabling SACK
911	* rather than TS in order to fit in better with old,
912	* buggy kernels, but that was deemed to be unnecessary.
913	*/
914	if (synack_type != TCP_SYNACK_COOKIE)
915	ireq->tstamp_ok &= !ireq->sack_ok;
916	} else if (tcp_key_is_ao(key)) {
917	opts->options \|= OPTION_AO;
918	remaining -= tcp_ao_len_aligned(key: key->ao_key);
919	ireq->tstamp_ok &= !ireq->sack_ok;
920	}
921
922	/ We always send an MSS option. /
923	opts->mss = mss;
924	remaining -= TCPOLEN_MSS_ALIGNED;
925
926	if (likely(ireq->wscale_ok)) {
927	opts->ws = ireq->rcv_wscale;
928	opts->options \|= OPTION_WSCALE;
929	remaining -= TCPOLEN_WSCALE_ALIGNED;
930	}
931	if (likely(ireq->tstamp_ok)) {
932	opts->options \|= OPTION_TS;
933	opts->tsval = tcp_skb_timestamp_ts(usec_ts: tcp_rsk(req)->req_usec_ts, skb) +
934	tcp_rsk(req)->ts_off;
935	opts->tsecr = READ_ONCE(req->ts_recent);
936	remaining -= TCPOLEN_TSTAMP_ALIGNED;
937	}
938	if (likely(ireq->sack_ok)) {
939	opts->options \|= OPTION_SACK_ADVERTISE;
940	if (unlikely(!ireq->tstamp_ok))
941	remaining -= TCPOLEN_SACKPERM_ALIGNED;
942	}
943	if (foc != NULL && foc->len >= `0`) {
944	u32 need = foc->len;
945
946	need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
947	TCPOLEN_FASTOPEN_BASE;
948	need = (need + `3`) & ~`3U`; / Align to 32 bits /
949	if (remaining >= need) {
950	opts->options \|= OPTION_FAST_OPEN_COOKIE;
951	opts->fastopen_cookie = foc;
952	remaining -= need;
953	}
954	}
955
956	mptcp_set_option_cond(req, opts, remaining: &remaining);
957
958	smc_set_option_cond(tcp_sk(sk), ireq, opts, remaining: &remaining);
959
960	bpf_skops_hdr_opt_len(sk: (struct sock *)sk, skb, req, syn_skb,
961	synack_type, opts, remaining: &remaining);
962
963	return MAX_TCP_OPTION_SPACE - remaining;
964	}
965
966	/ Compute TCP options for ESTABLISHED sockets. This is not the*
967	* final wire format yet.
968	*/
969	static unsigned int tcp_established_options(struct sock sk, struct* sk_buff *skb,
970	struct tcp_out_options *opts,
971	struct tcp_key *key)
972	{
973	struct tcp_sock *tp = tcp_sk(sk);
974	unsigned int size = `0`;
975	unsigned int eff_sacks;
976
977	opts->options = `0`;
978
979	/ Better than switch (key.type) as it has static branches /
980	if (tcp_key_is_md5(key)) {
981	opts->options \|= OPTION_MD5;
982	size += TCPOLEN_MD5SIG_ALIGNED;
983	} else if (tcp_key_is_ao(key)) {
984	opts->options \|= OPTION_AO;
985	size += tcp_ao_len_aligned(key: key->ao_key);
986	}
987
988	if (likely(tp->rx_opt.tstamp_ok)) {
989	opts->options \|= OPTION_TS;
990	opts->tsval = skb ? tcp_skb_timestamp_ts(usec_ts: tp->tcp_usec_ts, skb) +
991	tp->tsoffset : `0`;
992	opts->tsecr = tp->rx_opt.ts_recent;
993	size += TCPOLEN_TSTAMP_ALIGNED;
994	}
995
996	/ MPTCP options have precedence over SACK for the limited TCP*
997	* option space because a MPTCP connection would be forced to
998	* fall back to regular TCP if a required multipath option is
999	* missing. SACK still gets a chance to use whatever space is
1000	* left.
1001	*/
1002	if (sk_is_mptcp(sk)) {
1003	unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
1004	unsigned int opt_size = `0`;
1005
1006	if (mptcp_established_options(sk, skb, size: &opt_size, remaining,
1007	opts: &opts->mptcp)) {
1008	opts->options \|= OPTION_MPTCP;
1009	size += opt_size;
1010	}
1011	}
1012
1013	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
1014	if (unlikely(eff_sacks)) {
1015	const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
1016	if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
1017	TCPOLEN_SACK_PERBLOCK))
1018	return size;
1019
1020	opts->num_sack_blocks =
1021	min_t(unsigned int, eff_sacks,
1022	(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
1023	TCPOLEN_SACK_PERBLOCK);
1024
1025	size += TCPOLEN_SACK_BASE_ALIGNED +
1026	opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
1027	}
1028
1029	if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
1030	BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
1031	unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
1032
1033	bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, synack_type: `0`, opts, remaining: &remaining);
1034
1035	size = MAX_TCP_OPTION_SPACE - remaining;
1036	}
1037
1038	return size;
1039	}
1040
1041
1042	/ TCP SMALL QUEUES (TSQ)*
1043	*
1044	* TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
1045	* to reduce RTT and bufferbloat.
1046	* We do this using a special skb destructor (tcp_wfree).
1047	*
1048	* Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
1049	* needs to be reallocated in a driver.
1050	* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
1051	*
1052	* Since transmit from skb destructor is forbidden, we use a tasklet
1053	* to process all sockets that eventually need to send more skbs.
1054	* We use one tasklet per cpu, with its own queue of sockets.
1055	*/
1056	struct tsq_tasklet {
1057	struct tasklet_struct tasklet;
1058	struct list_head head; / queue of tcp sockets /
1059	};
1060	static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
1061
1062	static void tcp_tsq_write(struct sock *sk)
1063	{
1064	if ((`1` << sk->sk_state) &
1065	(TCPF_ESTABLISHED \| TCPF_FIN_WAIT1 \| TCPF_CLOSING \|
1066	TCPF_CLOSE_WAIT \| TCPF_LAST_ACK)) {
1067	struct tcp_sock *tp = tcp_sk(sk);
1068
1069	if (tp->lost_out > tp->retrans_out &&
1070	tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
1071	tcp_mstamp_refresh(tp);
1072	tcp_xmit_retransmit_queue(sk);
1073	}
1074
1075	tcp_write_xmit(sk, mss_now: tcp_current_mss(sk), nonagle: tp->nonagle,
1076	push_one: `0`, GFP_ATOMIC);
1077	}
1078	}
1079
1080	static void tcp_tsq_handler(struct sock *sk)
1081	{
1082	bh_lock_sock(sk);
1083	if (!sock_owned_by_user(sk))
1084	tcp_tsq_write(sk);
1085	else if (!test_and_set_bit(nr: TCP_TSQ_DEFERRED, addr: &sk->sk_tsq_flags))
1086	sock_hold(sk);
1087	bh_unlock_sock(sk);
1088	}
1089	/*
1090	* One tasklet per cpu tries to send more skbs.
1091	* We run in tasklet context but need to disable irqs when
1092	* transferring tsq->head because tcp_wfree() might
1093	* interrupt us (non NAPI drivers)
1094	*/
1095	static void tcp_tasklet_func(struct tasklet_struct *t)
1096	{
1097	struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet);
1098	LIST_HEAD(list);
1099	unsigned long flags;
1100	struct list_head q, n;
1101	struct tcp_sock *tp;
1102	struct sock *sk;
1103
1104	local_irq_save(flags);
1105	list_splice_init(list: &tsq->head, head: &list);
1106	local_irq_restore(flags);
1107
1108	list_for_each_safe(q, n, &list) {
1109	tp = list_entry(q, struct tcp_sock, tsq_node);
1110	list_del(entry: &tp->tsq_node);
1111
1112	sk = (struct sock *)tp;
1113	smp_mb__before_atomic();
1114	clear_bit(nr: TSQ_QUEUED, addr: &sk->sk_tsq_flags);
1115
1116	tcp_tsq_handler(sk);
1117	sk_free(sk);
1118	}
1119	}
1120
/* Every sk_tsq_flags bit that tcp_release_cb() clears and acts upon. */
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
			  TCPF_WRITE_TIMER_DEFERRED |	\
			  TCPF_DELACK_TIMER_DEFERRED |	\
			  TCPF_MTU_REDUCED_DEFERRED |	\
			  TCPF_ACK_DEFERRED)
1126	/**
1127	* tcp_release_cb - tcp release_sock() callback
1128	* @sk: socket
1129	*
1130	* called from release_sock() to perform protocol dependent
1131	* actions before socket release.
1132	*/
1133	void tcp_release_cb(struct sock *sk)
1134	{
1135	unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
1136	unsigned long nflags;
1137
1138	/ perform an atomic operation only if at least one flag is set /
1139	do {
1140	if (!(flags & TCP_DEFERRED_ALL))
1141	return;
1142	nflags = flags & ~TCP_DEFERRED_ALL;
1143	} while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));
1144
1145	if (flags & TCPF_TSQ_DEFERRED) {
1146	tcp_tsq_write(sk);
1147	__sock_put(sk);
1148	}
1149
1150	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
1151	tcp_write_timer_handler(sk);
1152	__sock_put(sk);
1153	}
1154	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
1155	tcp_delack_timer_handler(sk);
1156	__sock_put(sk);
1157	}
1158	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
1159	inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
1160	__sock_put(sk);
1161	}
1162	if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
1163	tcp_send_ack(sk);
1164	}
1165	EXPORT_SYMBOL(tcp_release_cb);
1166
1167	void __init tcp_tasklet_init(void)
1168	{
1169	int i;
1170
1171	for_each_possible_cpu(i) {
1172	struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
1173
1174	INIT_LIST_HEAD(list: &tsq->head);
1175	tasklet_setup(t: &tsq->tasklet, callback: tcp_tasklet_func);
1176	}
1177	}
1178
1179	/*
1180	* Write buffer destructor automatically called from kfree_skb.
1181	* We can't xmit new skbs from this context, as we might already
1182	* hold qdisc lock.
1183	*/
1184	void tcp_wfree(struct sk_buff *skb)
1185	{
1186	struct sock *sk = skb->sk;
1187	struct tcp_sock *tp = tcp_sk(sk);
1188	unsigned long flags, nval, oval;
1189	struct tsq_tasklet *tsq;
1190	bool empty;
1191
1192	/ Keep one reference on sk_wmem_alloc.*
1193	* Will be released by sk_free() from here or tcp_tasklet_func()
1194	*/
1195	WARN_ON(refcount_sub_and_test(skb->truesize - `1`, &sk->sk_wmem_alloc));
1196
1197	/ If this softirq is serviced by ksoftirqd, we are likely under stress.*
1198	* Wait until our queues (qdisc + devices) are drained.
1199	* This gives :
1200	* - less callbacks to tcp_write_xmit(), reducing stress (batches)
1201	* - chance for incoming ACK (processed by another cpu maybe)
1202	* to migrate this flow (skb->ooo_okay will be eventually set)
1203	*/
1204	if (refcount_read(r: &sk->sk_wmem_alloc) >= SKB_TRUESIZE(`1`) && this_cpu_ksoftirqd() == current)
1205	goto out;
1206
1207	oval = smp_load_acquire(&sk->sk_tsq_flags);
1208	do {
1209	if (!(oval & TSQF_THROTTLED) \|\| (oval & TSQF_QUEUED))
1210	goto out;
1211
1212	nval = (oval & ~TSQF_THROTTLED) \| TSQF_QUEUED;
1213	} while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));
1214
1215	/ queue this socket to tasklet queue /
1216	local_irq_save(flags);
1217	tsq = this_cpu_ptr(&tsq_tasklet);
1218	empty = list_empty(head: &tsq->head);
1219	list_add(new: &tp->tsq_node, head: &tsq->head);
1220	if (empty)
1221	tasklet_schedule(t: &tsq->tasklet);
1222	local_irq_restore(flags);
1223	return;
1224	out:
1225	sk_free(sk);
1226	}
1227
1228	/ Note: Called under soft irq.*
1229	* We can call TCP stack right away, unless socket is owned by user.
1230	*/
1231	enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
1232	{
1233	struct tcp_sock tp = container_of(timer, struct* tcp_sock, pacing_timer);
1234	struct sock sk = (struct* sock *)tp;
1235
1236	tcp_tsq_handler(sk);
1237	sock_put(sk);
1238
1239	return HRTIMER_NORESTART;
1240	}
1241
1242	static void tcp_update_skb_after_send(struct sock sk, struct* sk_buff *skb,
1243	u64 prior_wstamp)
1244	{
1245	struct tcp_sock *tp = tcp_sk(sk);
1246
1247	if (sk->sk_pacing_status != SK_PACING_NONE) {
1248	unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
1249
1250	/ Original sch_fq does not pace first 10 MSS*
1251	* Note that tp->data_segs_out overflows after 2^32 packets,
1252	* this is a minor annoyance.
1253	*/
1254	if (rate != ~`0UL` && rate && tp->data_segs_out >= `10`) {
1255	u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
1256	u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
1257
1258	/ take into account OS jitter /
1259	len_ns -= min_t(u64, len_ns / `2`, credit);
1260	tp->tcp_wstamp_ns += len_ns;
1261	}
1262	}
1263	list_move_tail(list: &skb->tcp_tsorted_anchor, head: &tp->tsorted_sent_queue);
1264	}
1265
1266	INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock sk, struct* sk_buff skb, struct* flowi *fl));
1267	INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock sk, struct* sk_buff skb, struct* flowi *fl));
1268	INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock sk, struct* sk_buff *skb));
1269
1270	/ This routine actually transmits TCP packets queued in by*
1271	* tcp_do_sendmsg(). This is used by both the initial
1272	* transmission and possible later retransmissions.
1273	* All SKB's seen here are completely headerless. It is our
1274	* job to build the TCP header, and pass the packet down to
1275	* IP so it can do the same plus pass the packet off to the
1276	* device.
1277	*
1278	* We are working here with either a clone of the original
1279	* SKB, or a fresh unique copy made by the retransmit engine.
1280	*/
1281	static int __tcp_transmit_skb(struct sock sk, struct* sk_buff *skb,
1282	int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1283	{
1284	const struct inet_connection_sock *icsk = inet_csk(sk);
1285	struct inet_sock *inet;
1286	struct tcp_sock *tp;
1287	struct tcp_skb_cb *tcb;
1288	struct tcp_out_options opts;
1289	unsigned int tcp_options_size, tcp_header_size;
1290	struct sk_buff *oskb = NULL;
1291	struct tcp_key key;
1292	struct tcphdr *th;
1293	u64 prior_wstamp;
1294	int err;
1295
1296	BUG_ON(!skb \|\| !tcp_skb_pcount(skb));
1297	tp = tcp_sk(sk);
1298	prior_wstamp = tp->tcp_wstamp_ns;
1299	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
1300	skb_set_delivery_time(skb, kt: tp->tcp_wstamp_ns, mono: true);
1301	if (clone_it) {
1302	oskb = skb;
1303
1304	tcp_skb_tsorted_save(oskb) {
1305	if (unlikely(skb_cloned(oskb)))
1306	skb = pskb_copy(skb: oskb, gfp_mask);
1307	else
1308	skb = skb_clone(skb: oskb, priority: gfp_mask);
1309	} tcp_skb_tsorted_restore(oskb);
1310
1311	if (unlikely(!skb))
1312	return -ENOBUFS;
1313	/ retransmit skbs might have a non zero value in skb->dev*
1314	* because skb->dev is aliased with skb->rbnode.rb_left
1315	*/
1316	skb->dev = NULL;
1317	}
1318
1319	inet = inet_sk(sk);
1320	tcb = TCP_SKB_CB(skb);
1321	memset(&opts, `0`, sizeof(opts));
1322
1323	tcp_get_current_key(sk, out: &key);
1324	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
1325	tcp_options_size = tcp_syn_options(sk, skb, opts: &opts, key: &key);
1326	} else {
1327	tcp_options_size = tcp_established_options(sk, skb, opts: &opts, key: &key);
1328	/ Force a PSH flag on all (GSO) packets to expedite GRO flush*
1329	* at receiver : This slightly improve GRO performance.
1330	* Note that we do not force the PSH flag for non GSO packets,
1331	* because they might be sent under high congestion events,
1332	* and in this case it is better to delay the delivery of 1-MSS
1333	* packets and thus the corresponding ACK packet that would
1334	* release the following packet.
1335	*/
1336	if (tcp_skb_pcount(skb) > `1`)
1337	tcb->tcp_flags \|= TCPHDR_PSH;
1338	}
1339	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
1340
1341	/ We set skb->ooo_okay to one if this packet can select*
1342	* a different TX queue than prior packets of this flow,
1343	* to avoid self inflicted reorders.
1344	* The 'other' queue decision is based on current cpu number
1345	* if XPS is enabled, or sk->sk_txhash otherwise.
1346	* We can switch to another (and better) queue if:
1347	* 1) No packet with payload is in qdisc/device queues.
1348	* Delays in TX completion can defeat the test
1349	* even if packets were already sent.
1350	* 2) Or rtx queue is empty.
1351	* This mitigates above case if ACK packets for
1352	* all prior packets were already processed.
1353	*/
1354	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(`1`) \|\|
1355	tcp_rtx_queue_empty(sk);
1356
1357	/ If we had to use memory reserve to allocate this skb,*
1358	* this might cause drops if packet is looped back :
1359	* Other socket might not have SOCK_MEMALLOC.
1360	* Packets not looped back do not care about pfmemalloc.
1361	*/
1362	skb->pfmemalloc = `0`;
1363
1364	skb_push(skb, len: tcp_header_size);
1365	skb_reset_transport_header(skb);
1366
1367	skb_orphan(skb);
1368	skb->sk = sk;
1369	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
1370	refcount_add(i: skb->truesize, r: &sk->sk_wmem_alloc);
1371
1372	skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
1373
1374	/ Build TCP header and checksum it. /
1375	th = (struct tcphdr *)skb->data;
1376	th->source = inet->inet_sport;
1377	th->dest = inet->inet_dport;
1378	th->seq = htonl(tcb->seq);
1379	th->ack_seq = htonl(rcv_nxt);
1380	(((__be16 )th) + `6`) = htons(((tcp_header_size >> `2`) << `12`) \|
1381	tcb->tcp_flags);
1382
1383	th->check = `0`;
1384	th->urg_ptr = `0`;
1385
1386	/ The urg_mode check is necessary during a below snd_una win probe /
1387	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
1388	if (before(seq1: tp->snd_up, seq2: tcb->seq + `0x10000`)) {
1389	th->urg_ptr = htons(tp->snd_up - tcb->seq);
1390	th->urg = `1`;
1391	} else if (after(tcb->seq + `0xFFFF`, tp->snd_nxt)) {
1392	th->urg_ptr = htons(`0xFFFF`);
1393	th->urg = `1`;
1394	}
1395	}
1396
1397	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
1398	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
1399	th->window = htons(tcp_select_window(sk));
1400	tcp_ecn_send(sk, skb, th, tcp_header_len: tcp_header_size);
1401	} else {
1402	/ RFC1323: The window in SYN & SYN/ACK segments*
1403	* is never scaled.
1404	*/
1405	th->window = htons(min(tp->rcv_wnd, `65535U`));
1406	}
1407
1408	tcp_options_write(th, tp, NULL, opts: &opts, key: &key);
1409
1410	if (tcp_key_is_md5(key: &key)) {
1411	#ifdef CONFIG_TCP_MD5SIG
1412	/ Calculate the MD5 hash, as we have all we need now /
1413	sk_gso_disable(sk);
1414	tp->af_specific->calc_md5_hash(opts.hash_location,
1415	key.md5_key, sk, skb);
1416	#endif
1417	} else if (tcp_key_is_ao(key: &key)) {
1418	int err;
1419
1420	err = tcp_ao_transmit_skb(sk, skb, key: key.ao_key, th,
1421	hash_location: opts.hash_location);
1422	if (err) {
1423	kfree_skb_reason(skb, reason: SKB_DROP_REASON_NOT_SPECIFIED);
1424	return -ENOMEM;
1425	}
1426	}
1427
1428	/ BPF prog is the last one writing header option /
1429	bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, synack_type: `0`, opts: &opts);
1430
1431	INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
1432	tcp_v6_send_check, tcp_v4_send_check,
1433	sk, skb);
1434
1435	if (likely(tcb->tcp_flags & TCPHDR_ACK))
1436	tcp_event_ack_sent(sk, rcv_nxt);
1437
1438	if (skb->len != tcp_header_size) {
1439	tcp_event_data_sent(tp, sk);
1440	tp->data_segs_out += tcp_skb_pcount(skb);
1441	tp->bytes_sent += skb->len - tcp_header_size;
1442	}
1443
1444	if (after(tcb->end_seq, tp->snd_nxt) \|\| tcb->seq == tcb->end_seq)
1445	TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
1446	tcp_skb_pcount(skb));
1447
1448	tp->segs_out += tcp_skb_pcount(skb);
1449	skb_set_hash_from_sk(skb, sk);
1450	/ OK, its time to fill skb_shinfo(skb)->gso_{segs\|size} /
1451	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
1452	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
1453
1454	/ Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) /
1455
1456	/ Cleanup our debris for IP stacks /
1457	memset(skb->cb, `0`, max(sizeof(struct inet_skb_parm),
1458	sizeof(struct inet6_skb_parm)));
1459
1460	tcp_add_tx_delay(skb, tp);
1461
1462	err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
1463	inet6_csk_xmit, ip_queue_xmit,
1464	sk, skb, &inet->cork.fl);
1465
1466	if (unlikely(err > `0`)) {
1467	tcp_enter_cwr(sk);
1468	err = net_xmit_eval(err);
1469	}
1470	if (!err && oskb) {
1471	tcp_update_skb_after_send(sk, skb: oskb, prior_wstamp);
1472	tcp_rate_skb_sent(sk, skb: oskb);
1473	}
1474	return err;
1475	}
1476
1477	static int tcp_transmit_skb(struct sock sk, struct* sk_buff skb, int* clone_it,
1478	gfp_t gfp_mask)
1479	{
1480	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
1481	tcp_sk(sk)->rcv_nxt);
1482	}
1483
1484	/ This routine just queues the buffer for sending.*
1485	*
1486	* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
1487	* otherwise socket can stall.
1488	*/
1489	static void tcp_queue_skb(struct sock sk, struct* sk_buff *skb)
1490	{
1491	struct tcp_sock *tp = tcp_sk(sk);
1492
1493	/ Advance write_seq and place onto the write_queue. /
1494	WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
1495	__skb_header_release(skb);
1496	tcp_add_write_queue_tail(sk, skb);
1497	sk_wmem_queued_add(sk, val: skb->truesize);
1498	sk_mem_charge(sk, size: skb->truesize);
1499	}
1500
1501	/ Initialize TSO segments for a packet. /
1502	static void tcp_set_skb_tso_segs(struct sk_buff skb, unsigned* int mss_now)
1503	{
1504	if (skb->len <= mss_now) {
1505	/ Avoid the costly divide in the normal*
1506	* non-TSO case.
1507	*/
1508	tcp_skb_pcount_set(skb, segs: `1`);
1509	TCP_SKB_CB(skb)->tcp_gso_size = `0`;
1510	} else {
1511	tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
1512	TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1513	}
1514	}
1515
1516	/ Pcount in the middle of the write queue got changed, we need to do various*
1517	* tweaks to fix counters
1518	*/
1519	static void tcp_adjust_pcount(struct sock sk, const* struct sk_buff skb, int* decr)
1520	{
1521	struct tcp_sock *tp = tcp_sk(sk);
1522
1523	tp->packets_out -= decr;
1524
1525	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1526	tp->sacked_out -= decr;
1527	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
1528	tp->retrans_out -= decr;
1529	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
1530	tp->lost_out -= decr;
1531
1532	/ Reno case is special. Sigh... /
1533	if (tcp_is_reno(tp) && decr > `0`)
1534	tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
1535
1536	if (tp->lost_skb_hint &&
1537	before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
1538	(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
1539	tp->lost_cnt_hint -= decr;
1540
1541	tcp_verify_left_out(tp);
1542	}
1543
1544	static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
1545	{
1546	return TCP_SKB_CB(skb)->txstamp_ack \|\|
1547	(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
1548	}
1549
1550	static void tcp_fragment_tstamp(struct sk_buff skb, struct* sk_buff *skb2)
1551	{
1552	struct skb_shared_info *shinfo = skb_shinfo(skb);
1553
1554	if (unlikely(tcp_has_tx_tstamp(skb)) &&
1555	!before(seq1: shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
1556	struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
1557	u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
1558
1559	shinfo->tx_flags &= ~tsflags;
1560	shinfo2->tx_flags \|= tsflags;
1561	swap(shinfo->tskey, shinfo2->tskey);
1562	TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
1563	TCP_SKB_CB(skb)->txstamp_ack = `0`;
1564	}
1565	}
1566
1567	static void tcp_skb_fragment_eor(struct sk_buff skb, struct* sk_buff *skb2)
1568	{
1569	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
1570	TCP_SKB_CB(skb)->eor = `0`;
1571	}
1572
1573	/ Insert buff after skb on the write or rtx queue of sk. /
1574	static void tcp_insert_write_queue_after(struct sk_buff *skb,
1575	struct sk_buff *buff,
1576	struct sock *sk,
1577	enum tcp_queue tcp_queue)
1578	{
1579	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
1580	__skb_queue_after(list: &sk->sk_write_queue, prev: skb, newsk: buff);
1581	else
1582	tcp_rbtree_insert(root: &sk->tcp_rtx_queue, skb: buff);
1583	}
1584
1585	/ Function to create two new TCP segments. Shrinks the given segment*
1586	* to the specified size and appends a new segment with the rest of the
1587	* packet to the list. This won't be called frequently, I hope.
1588	* Remember, these are still headerless SKBs at this point.
1589	*/
1590	int tcp_fragment(struct sock sk, enum* tcp_queue tcp_queue,
1591	struct sk_buff *skb, u32 len,
1592	unsigned int mss_now, gfp_t gfp)
1593	{
1594	struct tcp_sock *tp = tcp_sk(sk);
1595	struct sk_buff *buff;
1596	int old_factor;
1597	long limit;
1598	int nlen;
1599	u8 flags;
1600
1601	if (WARN_ON(len > skb->len))
1602	return -EINVAL;
1603
1604	DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
1605
1606	/ tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.*
1607	* We need some allowance to not penalize applications setting small
1608	* SO_SNDBUF values.
1609	* Also allow first and last skb in retransmit queue to be split.
1610	*/
1611	limit = sk->sk_sndbuf + `2` * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
1612	if (unlikely((sk->sk_wmem_queued >> `1`) > limit &&
1613	tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
1614	skb != tcp_rtx_queue_head(sk) &&
1615	skb != tcp_rtx_queue_tail(sk))) {
1616	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
1617	return -ENOMEM;
1618	}
1619
1620	if (skb_unclone_keeptruesize(skb, pri: gfp))
1621	return -ENOMEM;
1622
1623	/ Get a new skb... force flag on. /
1624	buff = tcp_stream_alloc_skb(sk, gfp, force_schedule: true);
1625	if (!buff)
1626	return -ENOMEM; / We'll just try again later. /
1627	skb_copy_decrypted(to: buff, from: skb);
1628	mptcp_skb_ext_copy(to: buff, from: skb);
1629
1630	sk_wmem_queued_add(sk, val: buff->truesize);
1631	sk_mem_charge(sk, size: buff->truesize);
1632	nlen = skb->len - len;
1633	buff->truesize += nlen;
1634	skb->truesize -= nlen;
1635
1636	/ Correct the sequence numbers. /
1637	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
1638	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
1639	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
1640
1641	/ PSH and FIN should only be set in the second packet. /
1642	flags = TCP_SKB_CB(skb)->tcp_flags;
1643	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
1644	TCP_SKB_CB(buff)->tcp_flags = flags;
1645	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
1646	tcp_skb_fragment_eor(skb, skb2: buff);
1647
1648	skb_split(skb, skb1: buff, len);
1649
1650	skb_set_delivery_time(skb: buff, kt: skb->tstamp, mono: true);
1651	tcp_fragment_tstamp(skb, skb2: buff);
1652
1653	old_factor = tcp_skb_pcount(skb);
1654
1655	/ Fix up tso_factor for both original and new SKB. /
1656	tcp_set_skb_tso_segs(skb, mss_now);
1657	tcp_set_skb_tso_segs(skb: buff, mss_now);
1658
1659	/ Update delivered info for the new segment /
1660	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
1661
1662	/ If this packet has been sent out already, we must*
1663	* adjust the various packet counters.
1664	*/
1665	if (!before(seq1: tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
1666	int diff = old_factor - tcp_skb_pcount(skb) -
1667	tcp_skb_pcount(skb: buff);
1668
1669	if (diff)
1670	tcp_adjust_pcount(sk, skb, decr: diff);
1671	}
1672
1673	/ Link BUFF into the send queue. /
1674	__skb_header_release(skb: buff);
1675	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
1676	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
1677	list_add(new: &buff->tcp_tsorted_anchor, head: &skb->tcp_tsorted_anchor);
1678
1679	return `0`;
1680	}
1681
1682	/ This is similar to __pskb_pull_tail(). The difference is that pulled*
1683	* data is not copied, but immediately discarded.
1684	*/
1685	static int __pskb_trim_head(struct sk_buff skb, int* len)
1686	{
1687	struct skb_shared_info *shinfo;
1688	int i, k, eat;
1689
1690	DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
1691	eat = len;
1692	k = `0`;
1693	shinfo = skb_shinfo(skb);
1694	for (i = `0`; i < shinfo->nr_frags; i++) {
1695	int size = skb_frag_size(frag: &shinfo->frags[i]);
1696
1697	if (size <= eat) {
1698	skb_frag_unref(skb, f: i);
1699	eat -= size;
1700	} else {
1701	shinfo->frags[k] = shinfo->frags[i];
1702	if (eat) {
1703	skb_frag_off_add(frag: &shinfo->frags[k], delta: eat);
1704	skb_frag_size_sub(frag: &shinfo->frags[k], delta: eat);
1705	eat = `0`;
1706	}
1707	k++;
1708	}
1709	}
1710	shinfo->nr_frags = k;
1711
1712	skb->data_len -= len;
1713	skb->len = skb->data_len;
1714	return len;
1715	}
1716
1717	/ Remove acked data from a packet in the transmit queue. /
1718	int tcp_trim_head(struct sock sk, struct* sk_buff *skb, u32 len)
1719	{
1720	u32 delta_truesize;
1721
1722	if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
1723	return -ENOMEM;
1724
1725	delta_truesize = __pskb_trim_head(skb, len);
1726
1727	TCP_SKB_CB(skb)->seq += len;
1728
1729	skb->truesize -= delta_truesize;
1730	sk_wmem_queued_add(sk, val: -delta_truesize);
1731	if (!skb_zcopy_pure(skb))
1732	sk_mem_uncharge(sk, size: delta_truesize);
1733
1734	/ Any change of skb->len requires recalculation of tso factor. /
1735	if (tcp_skb_pcount(skb) > `1`)
1736	tcp_set_skb_tso_segs(skb, mss_now: tcp_skb_mss(skb));
1737
1738	return `0`;
1739	}
1740
1741	/ Calculate MSS not accounting any TCP options. /
1742	static inline int __tcp_mtu_to_mss(struct sock sk, int* pmtu)
1743	{
1744	const struct tcp_sock *tp = tcp_sk(sk);
1745	const struct inet_connection_sock *icsk = inet_csk(sk);
1746	int mss_now;
1747
1748	/ Calculate base mss without TCP options:*
1749	It is MMS_S - sizeof(tcphdr) of rfc1122
1750	*/
1751	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
1752
1753	/ Clamp it (mss_clamp does not include tcp options) /
1754	if (mss_now > tp->rx_opt.mss_clamp)
1755	mss_now = tp->rx_opt.mss_clamp;
1756
1757	/ Now subtract optional transport overhead /
1758	mss_now -= icsk->icsk_ext_hdr_len;
1759
1760	/ Then reserve room for full set of TCP options and 8 bytes of data /
1761	mss_now = max(mss_now,
1762	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
1763	return mss_now;
1764	}
1765
1766	/ Calculate MSS. Not accounting for SACKs here. /
1767	int tcp_mtu_to_mss(struct sock sk, int* pmtu)
1768	{
1769	/ Subtract TCP options size, not including SACKs /
1770	return __tcp_mtu_to_mss(sk, pmtu) -
1771	(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
1772	}
1773	EXPORT_SYMBOL(tcp_mtu_to_mss);
1774
1775	/ Inverse of above /
1776	int tcp_mss_to_mtu(struct sock sk, int* mss)
1777	{
1778	const struct tcp_sock *tp = tcp_sk(sk);
1779	const struct inet_connection_sock *icsk = inet_csk(sk);
1780
1781	return mss +
1782	tp->tcp_header_len +
1783	icsk->icsk_ext_hdr_len +
1784	icsk->icsk_af_ops->net_header_len;
1785	}
1786	EXPORT_SYMBOL(tcp_mss_to_mtu);
1787
1788	/ MTU probing init per socket /
1789	void tcp_mtup_init(struct sock *sk)
1790	{
1791	struct tcp_sock *tp = tcp_sk(sk);
1792	struct inet_connection_sock *icsk = inet_csk(sk);
1793	struct net *net = sock_net(sk);
1794
1795	icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > `1`;
1796	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
1797	icsk->icsk_af_ops->net_header_len;
1798	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
1799	icsk->icsk_mtup.probe_size = `0`;
1800	if (icsk->icsk_mtup.enabled)
1801	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
1802	}
1803	EXPORT_SYMBOL(tcp_mtup_init);
1804
1805	/ This function synchronize snd mss to current pmtu/exthdr set.*
1806
1807	tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
1808	for TCP options, but includes only bare TCP header.
1809
1810	tp->rx_opt.mss_clamp is mss negotiated at connection setup.
1811	It is minimum of user_mss and mss received with SYN.
1812	It also does not include TCP options.
1813
1814	inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
1815
1816	tp->mss_cache is current effective sending mss, including
1817	all tcp options except for SACKs. It is evaluated,
1818	taking into account current pmtu, but never exceeds
1819	tp->rx_opt.mss_clamp.
1820
1821	NOTE1. rfc1122 clearly states that advertised MSS
1822	DOES NOT include either tcp or ip options.
1823
1824	NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
1825	are READ ONLY outside this function. --ANK (980731)
1826	*/
1827	unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1828	{
1829	struct tcp_sock *tp = tcp_sk(sk);
1830	struct inet_connection_sock *icsk = inet_csk(sk);
1831	int mss_now;
1832
1833	if (icsk->icsk_mtup.search_high > pmtu)
1834	icsk->icsk_mtup.search_high = pmtu;
1835
1836	mss_now = tcp_mtu_to_mss(sk, pmtu);
1837	mss_now = tcp_bound_to_half_wnd(tp, pktsize: mss_now);
1838
1839	/ And store cached results /
1840	icsk->icsk_pmtu_cookie = pmtu;
1841	if (icsk->icsk_mtup.enabled)
1842	mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
1843	tp->mss_cache = mss_now;
1844
1845	return mss_now;
1846	}
1847	EXPORT_SYMBOL(tcp_sync_mss);
1848
1849	/ Compute the current effective MSS, taking SACKs and IP options,*
1850	* and even PMTU discovery events into account.
1851	*/
1852	unsigned int tcp_current_mss(struct sock *sk)
1853	{
1854	const struct tcp_sock *tp = tcp_sk(sk);
1855	const struct dst_entry *dst = __sk_dst_get(sk);
1856	u32 mss_now;
1857	unsigned int header_len;
1858	struct tcp_out_options opts;
1859	struct tcp_key key;
1860
1861	mss_now = tp->mss_cache;
1862
1863	if (dst) {
1864	u32 mtu = dst_mtu(dst);
1865	if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
1866	mss_now = tcp_sync_mss(sk, mtu);
1867	}
1868	tcp_get_current_key(sk, out: &key);
1869	header_len = tcp_established_options(sk, NULL, opts: &opts, key: &key) +
1870	sizeof(struct tcphdr);
1871	/ The mss_cache is sized based on tp->tcp_header_len, which assumes*
1872	* some common options. If this is an odd packet (because we have SACK
1873	* blocks etc) then our calculated header_len will be different, and
1874	* we have to adjust mss_now correspondingly */
1875	if (header_len != tp->tcp_header_len) {
1876	int delta = (int) header_len - tp->tcp_header_len;
1877	mss_now -= delta;
1878	}
1879
1880	return mss_now;
1881	}
1882
1883	/ RFC2861, slow part. Adjust cwnd, after it was not full during one rto.*
1884	* As additional protections, we do not touch cwnd in retransmission phases,
1885	* and if application hit its sndbuf limit recently.
1886	*/
1887	static void tcp_cwnd_application_limited(struct sock *sk)
1888	{
1889	struct tcp_sock *tp = tcp_sk(sk);
1890
1891	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1892	sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
1893	/ Limited by application or receiver window. /
1894	u32 init_win = tcp_init_cwnd(tp, dst: __sk_dst_get(sk));
1895	u32 win_used = max(tp->snd_cwnd_used, init_win);
1896	if (win_used < tcp_snd_cwnd(tp)) {
1897	tp->snd_ssthresh = tcp_current_ssthresh(sk);
1898	tcp_snd_cwnd_set(tp, val: (tcp_snd_cwnd(tp) + win_used) >> `1`);
1899	}
1900	tp->snd_cwnd_used = `0`;
1901	}
1902	tp->snd_cwnd_stamp = tcp_jiffies32;
1903	}
1904
1905	static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
1906	{
1907	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
1908	struct tcp_sock *tp = tcp_sk(sk);
1909
1910	/ Track the strongest available signal of the degree to which the cwnd*
1911	* is fully utilized. If cwnd-limited then remember that fact for the
1912	* current window. If not cwnd-limited then track the maximum number of
1913	* outstanding packets in the current window. (If cwnd-limited then we
1914	* chose to not update tp->max_packets_out to avoid an extra else
1915	* clause with no functional impact.)
1916	*/
1917	if (!before(seq1: tp->snd_una, seq2: tp->cwnd_usage_seq) \|\|
1918	is_cwnd_limited \|\|
1919	(!tp->is_cwnd_limited &&
1920	tp->packets_out > tp->max_packets_out)) {
1921	tp->is_cwnd_limited = is_cwnd_limited;
1922	tp->max_packets_out = tp->packets_out;
1923	tp->cwnd_usage_seq = tp->snd_nxt;
1924	}
1925
1926	if (tcp_is_cwnd_limited(sk)) {
1927	/ Network is feed fully. /
1928	tp->snd_cwnd_used = `0`;
1929	tp->snd_cwnd_stamp = tcp_jiffies32;
1930	} else {
1931	/ Network starves. /
1932	if (tp->packets_out > tp->snd_cwnd_used)
1933	tp->snd_cwnd_used = tp->packets_out;
1934
1935	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
1936	(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
1937	!ca_ops->cong_control)
1938	tcp_cwnd_application_limited(sk);
1939
1940	/ The following conditions together indicate the starvation*
1941	* is caused by insufficient sender buffer:
1942	* 1) just sent some data (see tcp_write_xmit)
1943	* 2) not cwnd limited (this else condition)
1944	* 3) no more data to send (tcp_write_queue_empty())
1945	* 4) application is hitting buffer limit (SOCK_NOSPACE)
1946	*/
1947	if (tcp_write_queue_empty(sk) && sk->sk_socket &&
1948	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
1949	(`1` << sk->sk_state) & (TCPF_ESTABLISHED \| TCPF_CLOSE_WAIT))
1950	tcp_chrono_start(sk, type: TCP_CHRONO_SNDBUF_LIMITED);
1951	}
1952	}
1953
1954	/ Minshall's variant of the Nagle send check. /
1955	static bool tcp_minshall_check(const struct tcp_sock *tp)
1956	{
1957	return after(tp->snd_sml, tp->snd_una) &&
1958	!after(tp->snd_sml, tp->snd_nxt);
1959	}
1960
1961	/ Update snd_sml if this skb is under mss*
1962	* Note that a TSO packet might end with a sub-mss segment
1963	* The test is really :
1964	* if ((skb->len % mss) != 0)
1965	* tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1966	* But we can avoid doing the divide again given we already have
1967	* skb_pcount = skb->len / mss_now
1968	*/
1969	static void tcp_minshall_update(struct tcp_sock tp, unsigned* int mss_now,
1970	const struct sk_buff *skb)
1971	{
1972	if (skb->len < tcp_skb_pcount(skb) * mss_now)
1973	tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
1974	}
1975
1976	/ Return false, if packet can be sent now without violation Nagle's rules:*
1977	* 1. It is full sized. (provided by caller in %partial bool)
1978	* 2. Or it contains FIN. (already checked by caller)
1979	* 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
1980	* 4. Or TCP_CORK is not set, and all sent packets are ACKed.
1981	* With Minshall's modification: all sent small packets are ACKed.
1982	*/
1983	static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
1984	int nonagle)
1985	{
1986	return partial &&
1987	((nonagle & TCP_NAGLE_CORK) \|\|
1988	(!nonagle && tp->packets_out && tcp_minshall_check(tp)));
1989	}
1990
1991	/ Return how many segs we'd like on a TSO packet,*
1992	* depending on current pacing rate, and how close the peer is.
1993	*
1994	* Rationale is:
1995	* - For close peers, we rather send bigger packets to reduce
1996	* cpu costs, because occasional losses will be repaired fast.
1997	* - For long distance/rtt flows, we would like to get ACK clocking
1998	* with 1 ACK per ms.
1999	*
2000	* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
2001	* in bigger TSO bursts. We we cut the RTT-based allowance in half
2002	* for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
2003	* is below 1500 bytes after 6 * ~500 usec = 3ms.
2004	*/
2005	static u32 tcp_tso_autosize(const struct sock sk, unsigned* int mss_now,
2006	int min_tso_segs)
2007	{
2008	unsigned long bytes;
2009	u32 r;
2010
2011	bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
2012
2013	r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
2014	if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
2015	bytes += sk->sk_gso_max_size >> r;
2016
2017	bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
2018
2019	return max_t(u32, bytes / mss_now, min_tso_segs);
2020	}
2021
2022	/ Return the number of segments we want in the skb we are transmitting.*
2023	* See if congestion control module wants to decide; otherwise, autosize.
2024	*/
2025	static u32 tcp_tso_segs(struct sock sk, unsigned* int mss_now)
2026	{
2027	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
2028	u32 min_tso, tso_segs;
2029
2030	min_tso = ca_ops->min_tso_segs ?
2031	ca_ops->min_tso_segs(sk) :
2032	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
2033
2034	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso_segs: min_tso);
2035	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
2036	}
2037
2038	/ Returns the portion of skb which can be sent right away /
2039	static unsigned int tcp_mss_split_point(const struct sock *sk,
2040	const struct sk_buff *skb,
2041	unsigned int mss_now,
2042	unsigned int max_segs,
2043	int nonagle)
2044	{
2045	const struct tcp_sock *tp = tcp_sk(sk);
2046	u32 partial, needed, window, max_len;
2047
2048	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2049	max_len = mss_now * max_segs;
2050
2051	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
2052	return max_len;
2053
2054	needed = min(skb->len, window);
2055
2056	if (max_len <= needed)
2057	return max_len;
2058
2059	partial = needed % mss_now;
2060	/ If last segment is not a full MSS, check if Nagle rules allow us*
2061	* to include this last segment in this skb.
2062	* Otherwise, we'll split the skb at last MSS boundary
2063	*/
2064	if (tcp_nagle_check(partial: partial != `0`, tp, nonagle))
2065	return needed - partial;
2066
2067	return needed;
2068	}
2069
2070	/ Can at least one segment of SKB be sent right now, according to the*
2071	* congestion window rules? If so, return how many segments are allowed.
2072	*/
2073	static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
2074	const struct sk_buff *skb)
2075	{
2076	u32 in_flight, cwnd, halfcwnd;
2077
2078	/ Don't be strict about the congestion window for the final FIN. /
2079	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
2080	tcp_skb_pcount(skb) == `1`)
2081	return `1`;
2082
2083	in_flight = tcp_packets_in_flight(tp);
2084	cwnd = tcp_snd_cwnd(tp);
2085	if (in_flight >= cwnd)
2086	return `0`;
2087
2088	/ For better scheduling, ensure we have at least*
2089	* 2 GSO packets in flight.
2090	*/
2091	halfcwnd = max(cwnd >> `1`, `1U`);
2092	return min(halfcwnd, cwnd - in_flight);
2093	}
2094
/* Initialize TSO state of a skb.
 * This must be invoked the first time we consider transmitting
 * SKB onto the wire.
 *
 * @skb:     skb whose segment count is needed
 * @mss_now: current MSS
 *
 * Returns the skb's segment count, recomputed via tcp_set_skb_tso_segs()
 * when it is zero or when a multi-segment skb carries a different MSS.
 */
static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
	int tso_segs = tcp_skb_pcount(skb);

	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
		tcp_set_skb_tso_segs(skb, mss_now);
		tso_segs = tcp_skb_pcount(skb);
	}
	return tso_segs;
}
2109
2110
2111	/ Return true if the Nagle test allows this packet to be*
2112	* sent now.
2113	*/
2114	static inline bool tcp_nagle_test(const struct tcp_sock tp, const* struct sk_buff *skb,
2115	unsigned int cur_mss, int nonagle)
2116	{
2117	/ Nagle rule does not apply to frames, which sit in the middle of the*
2118	* write_queue (they have no chances to get new data).
2119	*
2120	* This is implemented in the callers, where they modify the 'nonagle'
2121	* argument based upon the location of SKB in the send queue.
2122	*/
2123	if (nonagle & TCP_NAGLE_PUSH)
2124	return true;
2125
2126	/ Don't use the nagle rule for urgent data (or for the final FIN). /
2127	if (tcp_urg_mode(tp) \|\| (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
2128	return true;
2129
2130	if (!tcp_nagle_check(partial: skb->len < cur_mss, tp, nonagle))
2131	return true;
2132
2133	return false;
2134	}
2135
2136	/ Does at least the first segment of SKB fit into the send window? /
2137	static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
2138	const struct sk_buff *skb,
2139	unsigned int cur_mss)
2140	{
2141	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
2142
2143	if (skb->len > cur_mss)
2144	end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
2145
2146	return !after(end_seq, tcp_wnd_end(tp));
2147	}
2148
2149	/ Trim TSO SKB to LEN bytes, put the remaining data into a new packet*
2150	* which is put after SKB on the list. It is very much like
2151	* tcp_fragment() except that it may make several kinds of assumptions
2152	* in order to speed up the splitting operation. In particular, we
2153	* know that all the data is in scatter-gather pages, and that the
2154	* packet has never been sent out before (and thus is not cloned).
2155	*/
2156	static int tso_fragment(struct sock sk, struct* sk_buff skb, unsigned* int len,
2157	unsigned int mss_now, gfp_t gfp)
2158	{
2159	int nlen = skb->len - len;
2160	struct sk_buff *buff;
2161	u8 flags;
2162
2163	/ All of a TSO frame must be composed of paged data. /
2164	DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);
2165
2166	buff = tcp_stream_alloc_skb(sk, gfp, force_schedule: true);
2167	if (unlikely(!buff))
2168	return -ENOMEM;
2169	skb_copy_decrypted(to: buff, from: skb);
2170	mptcp_skb_ext_copy(to: buff, from: skb);
2171
2172	sk_wmem_queued_add(sk, val: buff->truesize);
2173	sk_mem_charge(sk, size: buff->truesize);
2174	buff->truesize += nlen;
2175	skb->truesize -= nlen;
2176
2177	/ Correct the sequence numbers. /
2178	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
2179	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
2180	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
2181
2182	/ PSH and FIN should only be set in the second packet. /
2183	flags = TCP_SKB_CB(skb)->tcp_flags;
2184	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN \| TCPHDR_PSH);
2185	TCP_SKB_CB(buff)->tcp_flags = flags;
2186
2187	tcp_skb_fragment_eor(skb, skb2: buff);
2188
2189	skb_split(skb, skb1: buff, len);
2190	tcp_fragment_tstamp(skb, skb2: buff);
2191
2192	/ Fix up tso_factor for both original and new SKB. /
2193	tcp_set_skb_tso_segs(skb, mss_now);
2194	tcp_set_skb_tso_segs(skb: buff, mss_now);
2195
2196	/ Link BUFF into the send queue. /
2197	__skb_header_release(skb: buff);
2198	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue: TCP_FRAG_IN_WRITE_QUEUE);
2199
2200	return `0`;
2201	}
2202
2203	/ Try to defer sending, if possible, in order to minimize the amount*
2204	* of TSO splitting we do. View it as a kind of TSO Nagle test.
2205	*
2206	* This algorithm is from John Heffner.
2207	*/
2208	static bool tcp_tso_should_defer(struct sock sk, struct* sk_buff *skb,
2209	bool *is_cwnd_limited,
2210	bool *is_rwnd_limited,
2211	u32 max_segs)
2212	{
2213	const struct inet_connection_sock *icsk = inet_csk(sk);
2214	u32 send_win, cong_win, limit, in_flight;
2215	struct tcp_sock *tp = tcp_sk(sk);
2216	struct sk_buff *head;
2217	int win_divisor;
2218	s64 delta;
2219
2220	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
2221	goto send_now;
2222
2223	/ Avoid bursty behavior by allowing defer*
2224	* only if the last write was recent (1 ms).
2225	* Note that tp->tcp_wstamp_ns can be in the future if we have
2226	* packets waiting in a qdisc or device for EDT delivery.
2227	*/
2228	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
2229	if (delta > `0`)
2230	goto send_now;
2231
2232	in_flight = tcp_packets_in_flight(tp);
2233
2234	BUG_ON(tcp_skb_pcount(skb) <= `1`);
2235	BUG_ON(tcp_snd_cwnd(tp) <= in_flight);
2236
2237	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2238
2239	/ From in_flight test above, we know that cwnd > in_flight. /
2240	cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;
2241
2242	limit = min(send_win, cong_win);
2243
2244	/ If a full-sized TSO skb can be sent, do it. /
2245	if (limit >= max_segs * tp->mss_cache)
2246	goto send_now;
2247
2248	/ Middle in queue won't get any more data, full sendable already? /
2249	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
2250	goto send_now;
2251
2252	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
2253	if (win_divisor) {
2254	u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);
2255
2256	/ If at least some fraction of a window is available,*
2257	* just use it.
2258	*/
2259	chunk /= win_divisor;
2260	if (limit >= chunk)
2261	goto send_now;
2262	} else {
2263	/ Different approach, try not to defer past a single*
2264	* ACK. Receiver should ACK every other full sized
2265	* frame, so if we have space for more than 3 frames
2266	* then send now.
2267	*/
2268	if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
2269	goto send_now;
2270	}
2271
2272	/ TODO : use tsorted_sent_queue ? /
2273	head = tcp_rtx_queue_head(sk);
2274	if (!head)
2275	goto send_now;
2276	delta = tp->tcp_clock_cache - head->tstamp;
2277	/ If next ACK is likely to come too late (half srtt), do not defer /
2278	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> `4`)) < `0`)
2279	goto send_now;
2280
2281	/ Ok, it looks like it is advisable to defer.*
2282	* Three cases are tracked :
2283	* 1) We are cwnd-limited
2284	* 2) We are rwnd-limited
2285	* 3) We are application limited.
2286	*/
2287	if (cong_win < send_win) {
2288	if (cong_win <= skb->len) {
2289	*is_cwnd_limited = true;
2290	return true;
2291	}
2292	} else {
2293	if (send_win <= skb->len) {
2294	*is_rwnd_limited = true;
2295	return true;
2296	}
2297	}
2298
2299	/ If this packet won't get more data, do not wait. /
2300	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) \|\|
2301	TCP_SKB_CB(skb)->eor)
2302	goto send_now;
2303
2304	return true;
2305
2306	send_now:
2307	return false;
2308	}
2309
2310	static inline void tcp_mtu_check_reprobe(struct sock *sk)
2311	{
2312	struct inet_connection_sock *icsk = inet_csk(sk);
2313	struct tcp_sock *tp = tcp_sk(sk);
2314	struct net *net = sock_net(sk);
2315	u32 interval;
2316	s32 delta;
2317
2318	interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
2319	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
2320	if (unlikely(delta >= interval * HZ)) {
2321	int mss = tcp_current_mss(sk);
2322
2323	/ Update current search range /
2324	icsk->icsk_mtup.probe_size = `0`;
2325	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
2326	sizeof(struct tcphdr) +
2327	icsk->icsk_af_ops->net_header_len;
2328	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
2329
2330	/ Update probe time stamp /
2331	icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
2332	}
2333	}
2334
2335	static bool tcp_can_coalesce_send_queue_head(struct sock sk, int* len)
2336	{
2337	struct sk_buff skb, next;
2338
2339	skb = tcp_send_head(sk);
2340	tcp_for_write_queue_from_safe(skb, next, sk) {
2341	if (len <= skb->len)
2342	break;
2343
2344	if (unlikely(TCP_SKB_CB(skb)->eor) \|\|
2345	tcp_has_tx_tstamp(skb) \|\|
2346	!skb_pure_zcopy_same(skb1: skb, skb2: next))
2347	return false;
2348
2349	len -= skb->len;
2350	}
2351
2352	return true;
2353	}
2354
2355	static int tcp_clone_payload(struct sock sk, struct* sk_buff *to,
2356	int probe_size)
2357	{
2358	skb_frag_t lastfrag = NULL, fragto = skb_shinfo(to)->frags;
2359	int i, todo, len = `0`, nr_frags = `0`;
2360	const struct sk_buff *skb;
2361
2362	if (!sk_wmem_schedule(sk, size: to->truesize + probe_size))
2363	return -ENOMEM;
2364
2365	skb_queue_walk(&sk->sk_write_queue, skb) {
2366	const skb_frag_t *fragfrom = skb_shinfo(skb)->frags;
2367
2368	if (skb_headlen(skb))
2369	return -EINVAL;
2370
2371	for (i = `0`; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) {
2372	if (len >= probe_size)
2373	goto commit;
2374	todo = min_t(int, skb_frag_size(fragfrom),
2375	probe_size - len);
2376	len += todo;
2377	if (lastfrag &&
2378	skb_frag_page(frag: fragfrom) == skb_frag_page(frag: lastfrag) &&
2379	skb_frag_off(frag: fragfrom) == skb_frag_off(frag: lastfrag) +
2380	skb_frag_size(frag: lastfrag)) {
2381	skb_frag_size_add(frag: lastfrag, delta: todo);
2382	continue;
2383	}
2384	if (unlikely(nr_frags == MAX_SKB_FRAGS))
2385	return -E2BIG;
2386	skb_frag_page_copy(fragto, fragfrom);
2387	skb_frag_off_copy(fragto, fragfrom);
2388	skb_frag_size_set(frag: fragto, size: todo);
2389	nr_frags++;
2390	lastfrag = fragto++;
2391	}
2392	}
2393	commit:
2394	WARN_ON_ONCE(len != probe_size);
2395	for (i = `0`; i < nr_frags; i++)
2396	skb_frag_ref(skb: to, f: i);
2397
2398	skb_shinfo(to)->nr_frags = nr_frags;
2399	to->truesize += probe_size;
2400	to->len += probe_size;
2401	to->data_len += probe_size;
2402	__skb_header_release(skb: to);
2403	return `0`;
2404	}
2405
2406	/ Create a new MTU probe if we are ready.*
2407	* MTU probe is regularly attempting to increase the path MTU by
2408	* deliberately sending larger packets. This discovers routing
2409	* changes resulting in larger path MTUs.
2410	*
2411	* Returns 0 if we should wait to probe (no cwnd available),
2412	* 1 if a probe was sent,
2413	* -1 otherwise
2414	*/
2415	static int tcp_mtu_probe(struct sock *sk)
2416	{
2417	struct inet_connection_sock *icsk = inet_csk(sk);
2418	struct tcp_sock *tp = tcp_sk(sk);
2419	struct sk_buff skb, nskb, *next;
2420	struct net *net = sock_net(sk);
2421	int probe_size;
2422	int size_needed;
2423	int copy, len;
2424	int mss_now;
2425	int interval;
2426
2427	/ Not currently probing/verifying,*
2428	* not in recovery,
2429	* have enough cwnd, and
2430	* not SACKing (the variable headers throw things off)
2431	*/
2432	if (likely(!icsk->icsk_mtup.enabled \|\|
2433	icsk->icsk_mtup.probe_size \|\|
2434	inet_csk(sk)->icsk_ca_state != TCP_CA_Open \|\|
2435	tcp_snd_cwnd(tp) < `11` \|\|
2436	tp->rx_opt.num_sacks \|\| tp->rx_opt.dsack))
2437	return -`1`;
2438
2439	/ Use binary search for probe_size between tcp_mss_base,*
2440	* and current mss_clamp. if (search_high - search_low)
2441	* smaller than a threshold, backoff from probing.
2442	*/
2443	mss_now = tcp_current_mss(sk);
2444	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
2445	icsk->icsk_mtup.search_low) >> `1`);
2446	size_needed = probe_size + (tp->reordering + `1`) * tp->mss_cache;
2447	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
2448	/ When misfortune happens, we are reprobing actively,*
2449	* and then reprobe timer has expired. We stick with current
2450	* probing process by not resetting search range to its orignal.
2451	*/
2452	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) \|\|
2453	interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
2454	/ Check whether enough time has elaplased for*
2455	* another round of probing.
2456	*/
2457	tcp_mtu_check_reprobe(sk);
2458	return -`1`;
2459	}
2460
2461	/ Have enough data in the send queue to probe? /
2462	if (tp->write_seq - tp->snd_nxt < size_needed)
2463	return -`1`;
2464
2465	if (tp->snd_wnd < size_needed)
2466	return -`1`;
2467	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
2468	return `0`;
2469
2470	/ Do we need to wait to drain cwnd? With none in flight, don't stall /
2471	if (tcp_packets_in_flight(tp) + `2` > tcp_snd_cwnd(tp)) {
2472	if (!tcp_packets_in_flight(tp))
2473	return -`1`;
2474	else
2475	return `0`;
2476	}
2477
2478	if (!tcp_can_coalesce_send_queue_head(sk, len: probe_size))
2479	return -`1`;
2480
2481	/ We're allowed to probe. Build it now. /
2482	nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, force_schedule: false);
2483	if (!nskb)
2484	return -`1`;
2485
2486	/ build the payload, and be prepared to abort if this fails. /
2487	if (tcp_clone_payload(sk, to: nskb, probe_size)) {
2488	tcp_skb_tsorted_anchor_cleanup(skb: nskb);
2489	consume_skb(skb: nskb);
2490	return -`1`;
2491	}
2492	sk_wmem_queued_add(sk, val: nskb->truesize);
2493	sk_mem_charge(sk, size: nskb->truesize);
2494
2495	skb = tcp_send_head(sk);
2496	skb_copy_decrypted(to: nskb, from: skb);
2497	mptcp_skb_ext_copy(to: nskb, from: skb);
2498
2499	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
2500	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
2501	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
2502
2503	tcp_insert_write_queue_before(new: nskb, skb, sk);
2504	tcp_highest_sack_replace(sk, old: skb, new: nskb);
2505
2506	len = `0`;
2507	tcp_for_write_queue_from_safe(skb, next, sk) {
2508	copy = min_t(int, skb->len, probe_size - len);
2509
2510	if (skb->len <= copy) {
2511	/ We've eaten all the data from this skb.*
2512	* Throw it away. */
2513	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
2514	/ If this is the last SKB we copy and eor is set*
2515	* we need to propagate it to the new skb.
2516	*/
2517	TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
2518	tcp_skb_collapse_tstamp(skb: nskb, next_skb: skb);
2519	tcp_unlink_write_queue(skb, sk);
2520	tcp_wmem_free_skb(sk, skb);
2521	} else {
2522	TCP_SKB_CB(nskb)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags &
2523	~(TCPHDR_FIN\|TCPHDR_PSH);
2524	__pskb_trim_head(skb, len: copy);
2525	tcp_set_skb_tso_segs(skb, mss_now);
2526	TCP_SKB_CB(skb)->seq += copy;
2527	}
2528
2529	len += copy;
2530
2531	if (len >= probe_size)
2532	break;
2533	}
2534	tcp_init_tso_segs(skb: nskb, mss_now: nskb->len);
2535
2536	/ We're ready to send. If this fails, the probe will*
2537	* be resegmented into mss-sized pieces by tcp_write_xmit().
2538	*/
2539	if (!tcp_transmit_skb(sk, skb: nskb, clone_it: `1`, GFP_ATOMIC)) {
2540	/ Decrement cwnd here because we are sending*
2541	* effectively two packets. */
2542	tcp_snd_cwnd_set(tp, val: tcp_snd_cwnd(tp) - `1`);
2543	tcp_event_new_data_sent(sk, skb: nskb);
2544
2545	icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
2546	tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
2547	tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
2548
2549	return `1`;
2550	}
2551
2552	return -`1`;
2553	}
2554
2555	static bool tcp_pacing_check(struct sock *sk)
2556	{
2557	struct tcp_sock *tp = tcp_sk(sk);
2558
2559	if (!tcp_needs_internal_pacing(sk))
2560	return false;
2561
2562	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
2563	return false;
2564
2565	if (!hrtimer_is_queued(timer: &tp->pacing_timer)) {
2566	hrtimer_start(timer: &tp->pacing_timer,
2567	tim: ns_to_ktime(ns: tp->tcp_wstamp_ns),
2568	mode: HRTIMER_MODE_ABS_PINNED_SOFT);
2569	sock_hold(sk);
2570	}
2571	return true;
2572	}
2573
2574	static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
2575	{
2576	const struct rb_node *node = sk->tcp_rtx_queue.rb_node;
2577
2578	/ No skb in the rtx queue. /
2579	if (!node)
2580	return true;
2581
2582	/ Only one skb in rtx queue. /
2583	return !node->rb_left && !node->rb_right;
2584	}
2585
2586	/ TCP Small Queues :*
2587	* Control number of packets in qdisc/devices to two packets / or ~1 ms.
2588	* (These limits are doubled for retransmits)
2589	* This allows for :
2590	* - better RTT estimation and ACK scheduling
2591	* - faster recovery
2592	* - high rates
2593	* Alas, some drivers / subsystems require a fair amount
2594	* of queued bytes to ensure line rate.
2595	* One example is wifi aggregation (802.11 AMPDU)
2596	*/
2597	static bool tcp_small_queue_check(struct sock sk, const* struct sk_buff *skb,
2598	unsigned int factor)
2599	{
2600	unsigned long limit;
2601
2602	limit = max_t(unsigned long,
2603	`2` * skb->truesize,
2604	READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
2605	if (sk->sk_pacing_status == SK_PACING_NONE)
2606	limit = min_t(unsigned long, limit,
2607	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
2608	limit <<= factor;
2609
2610	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
2611	tcp_sk(sk)->tcp_tx_delay) {
2612	u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
2613	tcp_sk(sk)->tcp_tx_delay;
2614
2615	/ TSQ is based on skb truesize sum (sk_wmem_alloc), so we*
2616	* approximate our needs assuming an ~100% skb->truesize overhead.
2617	* USEC_PER_SEC is approximated by 2^20.
2618	* do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
2619	*/
2620	extra_bytes >>= (`20` - `1`);
2621	limit += extra_bytes;
2622	}
2623	if (refcount_read(r: &sk->sk_wmem_alloc) > limit) {
2624	/ Always send skb if rtx queue is empty or has one skb.*
2625	* No need to wait for TX completion to call us back,
2626	* after softirq/tasklet schedule.
2627	* This helps when TX completions are delayed too much.
2628	*/
2629	if (tcp_rtx_queue_empty_or_single_skb(sk))
2630	return false;
2631
2632	set_bit(nr: TSQ_THROTTLED, addr: &sk->sk_tsq_flags);
2633	/ It is possible TX completion already happened*
2634	* before we set TSQ_THROTTLED, so we must
2635	* test again the condition.
2636	*/
2637	smp_mb__after_atomic();
2638	if (refcount_read(r: &sk->sk_wmem_alloc) > limit)
2639	return true;
2640	}
2641	return false;
2642	}
2643
2644	static void tcp_chrono_set(struct tcp_sock tp, const* enum tcp_chrono new)
2645	{
2646	const u32 now = tcp_jiffies32;
2647	enum tcp_chrono old = tp->chrono_type;
2648
2649	if (old > TCP_CHRONO_UNSPEC)
2650	tp->chrono_stat[old - `1`] += now - tp->chrono_start;
2651	tp->chrono_start = now;
2652	tp->chrono_type = new;
2653	}
2654
2655	void tcp_chrono_start(struct sock sk, const* enum tcp_chrono type)
2656	{
2657	struct tcp_sock *tp = tcp_sk(sk);
2658
2659	/ If there are multiple conditions worthy of tracking in a*
2660	* chronograph then the highest priority enum takes precedence
2661	* over the other conditions. So that if something "more interesting"
2662	* starts happening, stop the previous chrono and start a new one.
2663	*/
2664	if (type > tp->chrono_type)
2665	tcp_chrono_set(tp, new: type);
2666	}
2667
2668	void tcp_chrono_stop(struct sock sk, const* enum tcp_chrono type)
2669	{
2670	struct tcp_sock *tp = tcp_sk(sk);
2671
2672
2673	/ There are multiple conditions worthy of tracking in a*
2674	* chronograph, so that the highest priority enum takes
2675	* precedence over the other conditions (see tcp_chrono_start).
2676	* If a condition stops, we only stop chrono tracking if
2677	* it's the "most interesting" or current chrono we are
2678	* tracking and starts busy chrono if we have pending data.
2679	*/
2680	if (tcp_rtx_and_write_queues_empty(sk))
2681	tcp_chrono_set(tp, new: TCP_CHRONO_UNSPEC);
2682	else if (type == tp->chrono_type)
2683	tcp_chrono_set(tp, new: TCP_CHRONO_BUSY);
2684	}
2685
2686	/ This routine writes packets to the network. It advances the*
2687	* send_head. This happens as incoming acks open up the remote
2688	* window for us.
2689	*
2690	* LARGESEND note: !tcp_urg_mode is overkill, only frames between
2691	* snd_up-64k-mss .. snd_up cannot be large. However, taking into
2692	* account rare use of URG, this is not a big flaw.
2693	*
2694	* Send at most one packet when push_one > 0. Temporarily ignore
2695	* cwnd limit to force at most one packet out when push_one == 2.
2696
2697	* Returns true, if no segments are in flight and we have queued segments,
2698	* but cannot send anything now because of SWS or another problem.
2699	*/
2700	static bool tcp_write_xmit(struct sock sk, unsigned* int mss_now, int nonagle,
2701	int push_one, gfp_t gfp)
2702	{
2703	struct tcp_sock *tp = tcp_sk(sk);
2704	struct sk_buff *skb;
2705	unsigned int tso_segs, sent_pkts;
2706	int cwnd_quota;
2707	int result;
2708	bool is_cwnd_limited = false, is_rwnd_limited = false;
2709	u32 max_segs;
2710
2711	sent_pkts = `0`;
2712
2713	tcp_mstamp_refresh(tp);
2714	if (!push_one) {
2715	/ Do MTU probing. /
2716	result = tcp_mtu_probe(sk);
2717	if (!result) {
2718	return false;
2719	} else if (result > `0`) {
2720	sent_pkts = `1`;
2721	}
2722	}
2723
2724	max_segs = tcp_tso_segs(sk, mss_now);
2725	while ((skb = tcp_send_head(sk))) {
2726	unsigned int limit;
2727
2728	if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
2729	/ "skb_mstamp_ns" is used as a start point for the retransmit timer /
2730	tp->tcp_wstamp_ns = tp->tcp_clock_cache;
2731	skb_set_delivery_time(skb, kt: tp->tcp_wstamp_ns, mono: true);
2732	list_move_tail(list: &skb->tcp_tsorted_anchor, head: &tp->tsorted_sent_queue);
2733	tcp_init_tso_segs(skb, mss_now);
2734	goto repair; / Skip network transmission /
2735	}
2736
2737	if (tcp_pacing_check(sk))
2738	break;
2739
2740	tso_segs = tcp_init_tso_segs(skb, mss_now);
2741	BUG_ON(!tso_segs);
2742
2743	cwnd_quota = tcp_cwnd_test(tp, skb);
2744	if (!cwnd_quota) {
2745	if (push_one == `2`)
2746	/ Force out a loss probe pkt. /
2747	cwnd_quota = `1`;
2748	else
2749	break;
2750	}
2751
2752	if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
2753	is_rwnd_limited = true;
2754	break;
2755	}
2756
2757	if (tso_segs == `1`) {
2758	if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
2759	(tcp_skb_is_last(sk, skb) ?
2760	nonagle : TCP_NAGLE_PUSH))))
2761	break;
2762	} else {
2763	if (!push_one &&
2764	tcp_tso_should_defer(sk, skb, is_cwnd_limited: &is_cwnd_limited,
2765	is_rwnd_limited: &is_rwnd_limited, max_segs))
2766	break;
2767	}
2768
2769	limit = mss_now;
2770	if (tso_segs > `1` && !tcp_urg_mode(tp))
2771	limit = tcp_mss_split_point(sk, skb, mss_now,
2772	min_t(unsigned int,
2773	cwnd_quota,
2774	max_segs),
2775	nonagle);
2776
2777	if (skb->len > limit &&
2778	unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
2779	break;
2780
2781	if (tcp_small_queue_check(sk, skb, factor: `0`))
2782	break;
2783
2784	/ Argh, we hit an empty skb(), presumably a thread*
2785	* is sleeping in sendmsg()/sk_stream_wait_memory().
2786	* We do not want to send a pure-ack packet and have
2787	* a strange looking rtx queue with empty packet(s).
2788	*/
2789	if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
2790	break;
2791
2792	if (unlikely(tcp_transmit_skb(sk, skb, `1`, gfp)))
2793	break;
2794
2795	repair:
2796	/ Advance the send_head. This one is sent out.*
2797	* This call will increment packets_out.
2798	*/
2799	tcp_event_new_data_sent(sk, skb);
2800
2801	tcp_minshall_update(tp, mss_now, skb);
2802	sent_pkts += tcp_skb_pcount(skb);
2803
2804	if (push_one)
2805	break;
2806	}
2807
2808	if (is_rwnd_limited)
2809	tcp_chrono_start(sk, type: TCP_CHRONO_RWND_LIMITED);
2810	else
2811	tcp_chrono_stop(sk, type: TCP_CHRONO_RWND_LIMITED);
2812
2813	is_cwnd_limited \|= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
2814	if (likely(sent_pkts \|\| is_cwnd_limited))
2815	tcp_cwnd_validate(sk, is_cwnd_limited);
2816
2817	if (likely(sent_pkts)) {
2818	if (tcp_in_cwnd_reduction(sk))
2819	tp->prr_out += sent_pkts;
2820
2821	/ Send one loss probe per tail loss episode. /
2822	if (push_one != `2`)
2823	tcp_schedule_loss_probe(sk, advancing_rto: false);
2824	return false;
2825	}
2826	return !tp->packets_out && !tcp_write_queue_empty(sk);
2827	}
2828
2829	bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
2830	{
2831	struct inet_connection_sock *icsk = inet_csk(sk);
2832	struct tcp_sock *tp = tcp_sk(sk);
2833	u32 timeout, timeout_us, rto_delta_us;
2834	int early_retrans;
2835
2836	/ Don't do any loss probe on a Fast Open connection before 3WHS*
2837	* finishes.
2838	*/
2839	if (rcu_access_pointer(tp->fastopen_rsk))
2840	return false;
2841
2842	early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
2843	/ Schedule a loss probe in 2RTT for SACK capable connections
2844	* not in loss recovery, that are either limited by cwnd or application.
2845	*/
2846	if ((early_retrans != `3` && early_retrans != `4`) \|\|
2847	!tp->packets_out \|\| !tcp_is_sack(tp) \|\|
2848	(icsk->icsk_ca_state != TCP_CA_Open &&
2849	icsk->icsk_ca_state != TCP_CA_CWR))
2850	return false;
2851
2852	/ Probe timeout is 2rtt. Add minimum RTO to account
2853	* for delayed ack when there's one outstanding packet. If no RTT
2854	* sample is available then probe after TCP_TIMEOUT_INIT.
2855	*/
2856	if (tp->srtt_us) {
2857	timeout_us = tp->srtt_us >> `2`;
2858	if (tp->packets_out == `1`)
2859	timeout_us += tcp_rto_min_us(sk);
2860	else
2861	timeout_us += TCP_TIMEOUT_MIN_US;
2862	timeout = usecs_to_jiffies(u: timeout_us);
2863	} else {
2864	timeout = TCP_TIMEOUT_INIT;
2865	}
2866
2867	/ If the RTO formula yields an earlier time, then use that time. /
2868	rto_delta_us = advancing_rto ?
2869	jiffies_to_usecs(j: inet_csk(sk)->icsk_rto) :
2870	tcp_rto_delta_us(sk); / How far in future is RTO? /
2871	if (rto_delta_us > `0`)
2872	timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
2873
2874	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, when: timeout, TCP_RTO_MAX);
2875	return true;
2876	}
2877
2878	/ Thanks to skb fast clones, we can detect if a prior transmit of*
2879	* a packet is still in a qdisc or driver queue.
2880	* In this case, there is very little point doing a retransmit !
2881	*/
2882	static bool skb_still_in_host_queue(struct sock *sk,
2883	const struct sk_buff *skb)
2884	{
2885	if (unlikely(skb_fclone_busy(sk, skb))) {
2886	set_bit(nr: TSQ_THROTTLED, addr: &sk->sk_tsq_flags);
2887	smp_mb__after_atomic();
2888	if (skb_fclone_busy(sk, skb)) {
2889	NET_INC_STATS(sock_net(sk),
2890	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
2891	return true;
2892	}
2893	}
2894	return false;
2895	}
2896
2897	/ When probe timeout (PTO) fires, try send a new segment if possible, else*
2898	* retransmit the last segment.
2899	*/
2900	void tcp_send_loss_probe(struct sock *sk)
2901	{
2902	struct tcp_sock *tp = tcp_sk(sk);
2903	struct sk_buff *skb;
2904	int pcount;
2905	int mss = tcp_current_mss(sk);
2906
2907	/ At most one outstanding TLP /
2908	if (tp->tlp_high_seq)
2909	goto rearm_timer;
2910
2911	tp->tlp_retrans = `0`;
2912	skb = tcp_send_head(sk);
2913	if (skb && tcp_snd_wnd_test(tp, skb, cur_mss: mss)) {
2914	pcount = tp->packets_out;
2915	tcp_write_xmit(sk, mss_now: mss, TCP_NAGLE_OFF, push_one: `2`, GFP_ATOMIC);
2916	if (tp->packets_out > pcount)
2917	goto probe_sent;
2918	goto rearm_timer;
2919	}
2920	skb = skb_rb_last(&sk->tcp_rtx_queue);
2921	if (unlikely(!skb)) {
2922	WARN_ONCE(tp->packets_out,
2923	"invalid inflight: %u state %u cwnd %u mss %d\n",
2924	tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss);
2925	inet_csk(sk)->icsk_pending = `0`;
2926	return;
2927	}
2928
2929	if (skb_still_in_host_queue(sk, skb))
2930	goto rearm_timer;
2931
2932	pcount = tcp_skb_pcount(skb);
2933	if (WARN_ON(!pcount))
2934	goto rearm_timer;
2935
2936	if ((pcount > `1`) && (skb->len > (pcount - `1`) * mss)) {
2937	if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
2938	(pcount - `1`) * mss, mss,
2939	GFP_ATOMIC)))
2940	goto rearm_timer;
2941	skb = skb_rb_next(skb);
2942	}
2943
2944	if (WARN_ON(!skb \|\| !tcp_skb_pcount(skb)))
2945	goto rearm_timer;
2946
2947	if (__tcp_retransmit_skb(sk, skb, segs: `1`))
2948	goto rearm_timer;
2949
2950	tp->tlp_retrans = `1`;
2951
2952	probe_sent:
2953	/ Record snd_nxt for loss detection. /
2954	tp->tlp_high_seq = tp->snd_nxt;
2955
2956	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
2957	/ Reset s.t. tcp_rearm_rto will restart timer from now /
2958	inet_csk(sk)->icsk_pending = `0`;
2959	rearm_timer:
2960	tcp_rearm_rto(sk);
2961	}
2962
2963	/ Push out any pending frames which were held back due to*
2964	* TCP_CORK or attempt at coalescing tiny packets.
2965	* The socket must be locked by the caller.
2966	*/
2967	void __tcp_push_pending_frames(struct sock sk, unsigned* int cur_mss,
2968	int nonagle)
2969	{
2970	/ If we are closed, the bytes will have to remain here.*
2971	* In time closedown will finish, we empty the write queue and
2972	* all will be happy.
2973	*/
2974	if (unlikely(sk->sk_state == TCP_CLOSE))
2975	return;
2976
2977	if (tcp_write_xmit(sk, mss_now: cur_mss, nonagle, push_one: `0`,
2978	gfp: sk_gfp_mask(sk, GFP_ATOMIC)))
2979	tcp_check_probe_timer(sk);
2980	}
2981
2982	/ Send _single_ skb sitting at the send head. This function requires*
2983	* true push pending frames to setup probe timer etc.
2984	*/
2985	void tcp_push_one(struct sock sk, unsigned* int mss_now)
2986	{
2987	struct sk_buff *skb = tcp_send_head(sk);
2988
2989	BUG_ON(!skb \|\| skb->len < mss_now);
2990
2991	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, push_one: `1`, gfp: sk->sk_allocation);
2992	}
2993
2994	/ This function returns the amount that we can raise the*
2995	* usable window based on the following constraints
2996	*
2997	* 1. The window can never be shrunk once it is offered (RFC 793)
2998	* 2. We limit memory per socket
2999	*
3000	* RFC 1122:
3001	* "the suggested [SWS] avoidance algorithm for the receiver is to keep
3002	* RECV.NEXT + RCV.WIN fixed until:
3003	* RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
3004	*
3005	* i.e. don't raise the right edge of the window until you can raise
3006	* it at least MSS bytes.
3007	*
3008	* Unfortunately, the recommended algorithm breaks header prediction,
3009	* since header prediction assumes th->window stays fixed.
3010	*
3011	* Strictly speaking, keeping th->window fixed violates the receiver
3012	* side SWS prevention criteria. The problem is that under this rule
3013	* a stream of single byte packets will cause the right side of the
3014	* window to always advance by a single byte.
3015	*
3016	* Of course, if the sender implements sender side SWS prevention
3017	* then this will not be a problem.
3018	*
3019	* BSD seems to make the following compromise:
3020	*
3021	* If the free space is less than the 1/4 of the maximum
3022	* space available and the free space is less than 1/2 mss,
3023	* then set the window to 0.
3024	* [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
3025	* Otherwise, just prevent the window from shrinking
3026	* and from being larger than the largest representable value.
3027	*
3028	* This prevents incremental opening of the window in the regime
3029	* where TCP is limited by the speed of the reader side taking
3030	* data out of the TCP receive queue. It does nothing about
3031	* those cases where the window is constrained on the sender side
3032	* because the pipeline is full.
3033	*
3034	* BSD also seems to "accidentally" limit itself to windows that are a
3035	* multiple of MSS, at least until the free space gets quite small.
3036	* This would appear to be a side effect of the mbuf implementation.
3037	* Combining these two algorithms results in the observed behavior
3038	* of having a fixed window size at almost all times.
3039	*
3040	* Below we obtain similar behavior by forcing the offered window to
3041	* a multiple of the mss when it is feasible to do so.
3042	*
3043	* Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
3044	* Regular options like TIMESTAMP are taken into account.
3045	*/
3046	u32 __tcp_select_window(struct sock *sk)
3047	{
3048	struct inet_connection_sock *icsk = inet_csk(sk);
3049	struct tcp_sock *tp = tcp_sk(sk);
3050	struct net *net = sock_net(sk);
3051	/ MSS for the peer's data. Previous versions used mss_clamp*
3052	* here. I don't know if the value based on our guesses
3053	* of peer's MSS is better for the performance. It's more correct
3054	* but may be worse for the performance because of rcv_mss
3055	* fluctuations. --SAW 1998/11/1
3056	*/
3057	int mss = icsk->icsk_ack.rcv_mss;
3058	int free_space = tcp_space(sk);
3059	int allowed_space = tcp_full_space(sk);
3060	int full_space, window;
3061
3062	if (sk_is_mptcp(sk))
3063	mptcp_space(ssk: sk, space: &free_space, full_space: &allowed_space);
3064
3065	full_space = min_t(int, tp->window_clamp, allowed_space);
3066
3067	if (unlikely(mss > full_space)) {
3068	mss = full_space;
3069	if (mss <= `0`)
3070	return `0`;
3071	}
3072
3073	/ Only allow window shrink if the sysctl is enabled and we have*
3074	* a non-zero scaling factor in effect.
3075	*/
3076	if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
3077	goto shrink_window_allowed;
3078
3079	/ do not allow window to shrink /
3080
3081	if (free_space < (full_space >> `1`)) {
3082	icsk->icsk_ack.quick = `0`;
3083
3084	if (tcp_under_memory_pressure(sk))
3085	tcp_adjust_rcv_ssthresh(sk);
3086
3087	/ free_space might become our new window, make sure we don't*
3088	* increase it due to wscale.
3089	*/
3090	free_space = round_down(free_space, `1` << tp->rx_opt.rcv_wscale);
3091
3092	/ if free space is less than mss estimate, or is below 1/16th*
3093	* of the maximum allowed, try to move to zero-window, else
3094	* tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
3095	* new incoming data is dropped due to memory limits.
3096	* With large window, mss test triggers way too late in order
3097	* to announce zero window in time before rmem limit kicks in.
3098	*/
3099	if (free_space < (allowed_space >> `4`) \|\| free_space < mss)
3100	return `0`;
3101	}
3102
3103	if (free_space > tp->rcv_ssthresh)
3104	free_space = tp->rcv_ssthresh;
3105
3106	/ Don't do rounding if we are using window scaling, since the*
3107	* scaled window will not line up with the MSS boundary anyway.
3108	*/
3109	if (tp->rx_opt.rcv_wscale) {
3110	window = free_space;
3111
3112	/ Advertise enough space so that it won't get scaled away.*
3113	* Import case: prevent zero window announcement if
3114	* 1<<rcv_wscale > mss.
3115	*/
3116	window = ALIGN(window, (`1` << tp->rx_opt.rcv_wscale));
3117	} else {
3118	window = tp->rcv_wnd;
3119	/ Get the largest window that is a nice multiple of mss.*
3120	* Window clamp already applied above.
3121	* If our current window offering is within 1 mss of the
3122	* free space we just keep it. This prevents the divide
3123	* and multiply from happening most of the time.
3124	* We also don't do any window rounding when the free space
3125	* is too small.
3126	*/
3127	if (window <= free_space - mss \|\| window > free_space)
3128	window = rounddown(free_space, mss);
3129	else if (mss == full_space &&
3130	free_space > window + (full_space >> `1`))
3131	window = free_space;
3132	}
3133
3134	return window;
3135
3136	shrink_window_allowed:
3137	/ new window should always be an exact multiple of scaling factor /
3138	free_space = round_down(free_space, `1` << tp->rx_opt.rcv_wscale);
3139
3140	if (free_space < (full_space >> `1`)) {
3141	icsk->icsk_ack.quick = `0`;
3142
3143	if (tcp_under_memory_pressure(sk))
3144	tcp_adjust_rcv_ssthresh(sk);
3145
3146	/ if free space is too low, return a zero window /
3147	if (free_space < (allowed_space >> `4`) \|\| free_space < mss \|\|
3148	free_space < (`1` << tp->rx_opt.rcv_wscale))
3149	return `0`;
3150	}
3151
3152	if (free_space > tp->rcv_ssthresh) {
3153	free_space = tp->rcv_ssthresh;
3154	/ new window should always be an exact multiple of scaling factor*
3155	*
3156	* For this case, we ALIGN "up" (increase free_space) because
3157	* we know free_space is not zero here, it has been reduced from
3158	* the memory-based limit, and rcv_ssthresh is not a hard limit
3159	* (unlike sk_rcvbuf).
3160	*/
3161	free_space = ALIGN(free_space, (`1` << tp->rx_opt.rcv_wscale));
3162	}
3163
3164	return free_space;
3165	}
3166
3167	void tcp_skb_collapse_tstamp(struct sk_buff *skb,
3168	const struct sk_buff *next_skb)
3169	{
3170	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
3171	const struct skb_shared_info *next_shinfo =
3172	skb_shinfo(next_skb);
3173	struct skb_shared_info *shinfo = skb_shinfo(skb);
3174
3175	shinfo->tx_flags \|= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
3176	shinfo->tskey = next_shinfo->tskey;
3177	TCP_SKB_CB(skb)->txstamp_ack \|=
3178	TCP_SKB_CB(next_skb)->txstamp_ack;
3179	}
3180	}
3181
3182	/ Collapses two adjacent SKB's during retransmission. /
3183	static bool tcp_collapse_retrans(struct sock sk, struct* sk_buff *skb)
3184	{
3185	struct tcp_sock *tp = tcp_sk(sk);
3186	struct sk_buff *next_skb = skb_rb_next(skb);
3187	int next_skb_size;
3188
3189	next_skb_size = next_skb->len;
3190
3191	BUG_ON(tcp_skb_pcount(skb) != `1` \|\| tcp_skb_pcount(next_skb) != `1`);
3192
3193	if (next_skb_size && !tcp_skb_shift(to: skb, from: next_skb, pcount: `1`, shiftlen: next_skb_size))
3194	return false;
3195
3196	tcp_highest_sack_replace(sk, old: next_skb, new: skb);
3197
3198	/ Update sequence range on original skb. /
3199	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
3200
3201	/ Merge over control information. This moves PSH/FIN etc. over /
3202	TCP_SKB_CB(skb)->tcp_flags \|= TCP_SKB_CB(next_skb)->tcp_flags;
3203
3204	/ All done, get rid of second SKB and account for it so*
3205	* packet counting does not break.
3206	*/
3207	TCP_SKB_CB(skb)->sacked \|= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
3208	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
3209
3210	/ changed transmit queue under us so clear hints /
3211	tcp_clear_retrans_hints_partial(tp);
3212	if (next_skb == tp->retransmit_skb_hint)
3213	tp->retransmit_skb_hint = skb;
3214
3215	tcp_adjust_pcount(sk, skb: next_skb, decr: tcp_skb_pcount(skb: next_skb));
3216
3217	tcp_skb_collapse_tstamp(skb, next_skb);
3218
3219	tcp_rtx_queue_unlink_and_free(skb: next_skb, sk);
3220	return true;
3221	}
3222
3223	/ Check if coalescing SKBs is legal. /
3224	static bool tcp_can_collapse(const struct sock sk, const* struct sk_buff *skb)
3225	{
3226	if (tcp_skb_pcount(skb) > `1`)
3227	return false;
3228	if (skb_cloned(skb))
3229	return false;
3230	/ Some heuristics for collapsing over SACK'd could be invented /
3231	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3232	return false;
3233
3234	return true;
3235	}
3236
3237	/ Collapse packets in the retransmit queue to make to create*
3238	* less packets on the wire. This is only done on retransmission.
3239	*/
3240	static void tcp_retrans_try_collapse(struct sock sk, struct* sk_buff *to,
3241	int space)
3242	{
3243	struct tcp_sock *tp = tcp_sk(sk);
3244	struct sk_buff skb = to, tmp;
3245	bool first = true;
3246
3247	if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
3248	return;
3249	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3250	return;
3251
3252	skb_rbtree_walk_from_safe(skb, tmp) {
3253	if (!tcp_can_collapse(sk, skb))
3254	break;
3255
3256	if (!tcp_skb_can_collapse(to, from: skb))
3257	break;
3258
3259	space -= skb->len;
3260
3261	if (first) {
3262	first = false;
3263	continue;
3264	}
3265
3266	if (space < `0`)
3267	break;
3268
3269	if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
3270	break;
3271
3272	if (!tcp_collapse_retrans(sk, skb: to))
3273	break;
3274	}
3275	}
3276
3277	/ This retransmits one SKB. Policy decisions and retransmit queue*
3278	* state updates are done by the caller. Returns non-zero if an
3279	* error occurred which prevented the send.
3280	*/
3281	int __tcp_retransmit_skb(struct sock sk, struct* sk_buff skb, int* segs)
3282	{
3283	struct inet_connection_sock *icsk = inet_csk(sk);
3284	struct tcp_sock *tp = tcp_sk(sk);
3285	unsigned int cur_mss;
3286	int diff, len, err;
3287	int avail_wnd;
3288
3289	/ Inconclusive MTU probe /
3290	if (icsk->icsk_mtup.probe_size)
3291	icsk->icsk_mtup.probe_size = `0`;
3292
3293	if (skb_still_in_host_queue(sk, skb))
3294	return -EBUSY;
3295
3296	start:
3297	if (before(TCP_SKB_CB(skb)->seq, seq2: tp->snd_una)) {
3298	if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3299	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
3300	TCP_SKB_CB(skb)->seq++;
3301	goto start;
3302	}
3303	if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
3304	WARN_ON_ONCE(`1`);
3305	return -EINVAL;
3306	}
3307	if (tcp_trim_head(sk, skb, len: tp->snd_una - TCP_SKB_CB(skb)->seq))
3308	return -ENOMEM;
3309	}
3310
3311	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
3312	return -EHOSTUNREACH; / Routing failure or similar. /
3313
3314	cur_mss = tcp_current_mss(sk);
3315	avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
3316
3317	/ If receiver has shrunk his window, and skb is out of*
3318	* new window, do not retransmit it. The exception is the
3319	* case, when window is shrunk to zero. In this case
3320	* our retransmit of one segment serves as a zero window probe.
3321	*/
3322	if (avail_wnd <= `0`) {
3323	if (TCP_SKB_CB(skb)->seq != tp->snd_una)
3324	return -EAGAIN;
3325	avail_wnd = cur_mss;
3326	}
3327
3328	len = cur_mss * segs;
3329	if (len > avail_wnd) {
3330	len = rounddown(avail_wnd, cur_mss);
3331	if (!len)
3332	len = avail_wnd;
3333	}
3334	if (skb->len > len) {
3335	if (tcp_fragment(sk, tcp_queue: TCP_FRAG_IN_RTX_QUEUE, skb, len,
3336	mss_now: cur_mss, GFP_ATOMIC))
3337	return -ENOMEM; / We'll try again later. /
3338	} else {
3339	if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
3340	return -ENOMEM;
3341
3342	diff = tcp_skb_pcount(skb);
3343	tcp_set_skb_tso_segs(skb, mss_now: cur_mss);
3344	diff -= tcp_skb_pcount(skb);
3345	if (diff)
3346	tcp_adjust_pcount(sk, skb, decr: diff);
3347	avail_wnd = min_t(int, avail_wnd, cur_mss);
3348	if (skb->len < avail_wnd)
3349	tcp_retrans_try_collapse(sk, to: skb, space: avail_wnd);
3350	}
3351
3352	/ RFC3168, section 6.1.1.1. ECN fallback /
3353	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
3354	tcp_ecn_clear_syn(sk, skb);
3355
3356	/ Update global and local TCP statistics. /
3357	segs = tcp_skb_pcount(skb);
3358	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
3359	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
3360	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
3361	tp->total_retrans += segs;
3362	tp->bytes_retrans += skb->len;
3363
3364	/ make sure skb->data is aligned on arches that require it*
3365	* and check if ack-trimming & collapsing extended the headroom
3366	* beyond what csum_start can cover.
3367	*/
3368	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & `3`)) \|\|
3369	skb_headroom(skb) >= `0xFFFF`)) {
3370	struct sk_buff *nskb;
3371
3372	tcp_skb_tsorted_save(skb) {
3373	nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
3374	if (nskb) {
3375	nskb->dev = NULL;
3376	err = tcp_transmit_skb(sk, skb: nskb, clone_it: `0`, GFP_ATOMIC);
3377	} else {
3378	err = -ENOBUFS;
3379	}
3380	} tcp_skb_tsorted_restore(skb);
3381
3382	if (!err) {
3383	tcp_update_skb_after_send(sk, skb, prior_wstamp: tp->tcp_wstamp_ns);
3384	tcp_rate_skb_sent(sk, skb);
3385	}
3386	} else {
3387	err = tcp_transmit_skb(sk, skb, clone_it: `1`, GFP_ATOMIC);
3388	}
3389
3390	/ To avoid taking spuriously low RTT samples based on a timestamp*
3391	* for a transmit that never happened, always mark EVER_RETRANS
3392	*/
3393	TCP_SKB_CB(skb)->sacked \|= TCPCB_EVER_RETRANS;
3394
3395	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
3396	tcp_call_bpf_3arg(sk, op: BPF_SOCK_OPS_RETRANS_CB,
3397	TCP_SKB_CB(skb)->seq, arg2: segs, arg3: err);
3398
3399	if (likely(!err)) {
3400	trace_tcp_retransmit_skb(sk, skb);
3401	} else if (err != -EBUSY) {
3402	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
3403	}
3404	return err;
3405	}
3406
3407	int tcp_retransmit_skb(struct sock sk, struct* sk_buff skb, int* segs)
3408	{
3409	struct tcp_sock *tp = tcp_sk(sk);
3410	int err = __tcp_retransmit_skb(sk, skb, segs);
3411
3412	if (err == `0`) {
3413	#if FASTRETRANS_DEBUG > 0
3414	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
3415	net_dbg_ratelimited("retrans_out leaked\n");
3416	}
3417	#endif
3418	TCP_SKB_CB(skb)->sacked \|= TCPCB_RETRANS;
3419	tp->retrans_out += tcp_skb_pcount(skb);
3420	}
3421
3422	/ Save stamp of the first (attempted) retransmit. /
3423	if (!tp->retrans_stamp)
3424	tp->retrans_stamp = tcp_skb_timestamp_ts(usec_ts: tp->tcp_usec_ts, skb);
3425
3426	if (tp->undo_retrans < `0`)
3427	tp->undo_retrans = `0`;
3428	tp->undo_retrans += tcp_skb_pcount(skb);
3429	return err;
3430	}
3431
3432	/ This gets called after a retransmit timeout, and the initially*
3433	* retransmitted data is acknowledged. It tries to continue
3434	* resending the rest of the retransmit queue, until either
3435	* we've sent it all or the congestion window limit is reached.
3436	*/
3437	void tcp_xmit_retransmit_queue(struct sock *sk)
3438	{
3439	const struct inet_connection_sock *icsk = inet_csk(sk);
3440	struct sk_buff skb, rtx_head, *hole = NULL;
3441	struct tcp_sock *tp = tcp_sk(sk);
3442	bool rearm_timer = false;
3443	u32 max_segs;
3444	int mib_idx;
3445
3446	if (!tp->packets_out)
3447	return;
3448
3449	rtx_head = tcp_rtx_queue_head(sk);
3450	skb = tp->retransmit_skb_hint ?: rtx_head;
3451	max_segs = tcp_tso_segs(sk, mss_now: tcp_current_mss(sk));
3452	skb_rbtree_walk_from(skb) {
3453	__u8 sacked;
3454	int segs;
3455
3456	if (tcp_pacing_check(sk))
3457	break;
3458
3459	/ we could do better than to assign each time /
3460	if (!hole)
3461	tp->retransmit_skb_hint = skb;
3462
3463	segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
3464	if (segs <= `0`)
3465	break;
3466	sacked = TCP_SKB_CB(skb)->sacked;
3467	/ In case tcp_shift_skb_data() have aggregated large skbs,*
3468	* we need to make sure not sending too bigs TSO packets
3469	*/
3470	segs = min_t(int, segs, max_segs);
3471
3472	if (tp->retrans_out >= tp->lost_out) {
3473	break;
3474	} else if (!(sacked & TCPCB_LOST)) {
3475	if (!hole && !(sacked & (TCPCB_SACKED_RETRANS\|TCPCB_SACKED_ACKED)))
3476	hole = skb;
3477	continue;
3478
3479	} else {
3480	if (icsk->icsk_ca_state != TCP_CA_Loss)
3481	mib_idx = LINUX_MIB_TCPFASTRETRANS;
3482	else
3483	mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
3484	}
3485
3486	if (sacked & (TCPCB_SACKED_ACKED\|TCPCB_SACKED_RETRANS))
3487	continue;
3488
3489	if (tcp_small_queue_check(sk, skb, factor: `1`))
3490	break;
3491
3492	if (tcp_retransmit_skb(sk, skb, segs))
3493	break;
3494
3495	NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
3496
3497	if (tcp_in_cwnd_reduction(sk))
3498	tp->prr_out += tcp_skb_pcount(skb);
3499
3500	if (skb == rtx_head &&
3501	icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
3502	rearm_timer = true;
3503
3504	}
3505	if (rearm_timer)
3506	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
3507	when: inet_csk(sk)->icsk_rto,
3508	TCP_RTO_MAX);
3509	}
3510
3511	/ We allow to exceed memory limits for FIN packets to expedite*
3512	* connection tear down and (memory) recovery.
3513	* Otherwise tcp_send_fin() could be tempted to either delay FIN
3514	* or even be forced to close flow without any FIN.
3515	* In general, we want to allow one skb per socket to avoid hangs
3516	* with edge trigger epoll()
3517	*/
3518	void sk_forced_mem_schedule(struct sock sk, int* size)
3519	{
3520	int delta, amt;
3521
3522	delta = size - sk->sk_forward_alloc;
3523	if (delta <= `0`)
3524	return;
3525	amt = sk_mem_pages(amt: delta);
3526	sk_forward_alloc_add(sk, val: amt << PAGE_SHIFT);
3527	sk_memory_allocated_add(sk, amt);
3528
3529	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3530	mem_cgroup_charge_skmem(memcg: sk->sk_memcg, nr_pages: amt,
3531	gfp_mask: gfp_memcg_charge() \| __GFP_NOFAIL);
3532	}
3533
3534	/ Send a FIN. The caller locks the socket for us.*
3535	* We should try to send a FIN packet really hard, but eventually give up.
3536	*/
3537	void tcp_send_fin(struct sock *sk)
3538	{
3539	struct sk_buff skb, tskb, *tail = tcp_write_queue_tail(sk);
3540	struct tcp_sock *tp = tcp_sk(sk);
3541
3542	/ Optimization, tack on the FIN if we have one skb in write queue and*
3543	* this skb was not yet sent, or we are under memory pressure.
3544	* Note: in the latter case, FIN packet will be sent after a timeout,
3545	* as TCP stack thinks it has already been transmitted.
3546	*/
3547	tskb = tail;
3548	if (!tskb && tcp_under_memory_pressure(sk))
3549	tskb = skb_rb_last(&sk->tcp_rtx_queue);
3550
3551	if (tskb) {
3552	TCP_SKB_CB(tskb)->tcp_flags \|= TCPHDR_FIN;
3553	TCP_SKB_CB(tskb)->end_seq++;
3554	tp->write_seq++;
3555	if (!tail) {
3556	/ This means tskb was already sent.*
3557	* Pretend we included the FIN on previous transmit.
3558	* We need to set tp->snd_nxt to the value it would have
3559	* if FIN had been sent. This is because retransmit path
3560	* does not change tp->snd_nxt.
3561	*/
3562	WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + `1`);
3563	return;
3564	}
3565	} else {
3566	skb = alloc_skb_fclone(MAX_TCP_HEADER, priority: sk->sk_allocation);
3567	if (unlikely(!skb))
3568	return;
3569
3570	INIT_LIST_HEAD(list: &skb->tcp_tsorted_anchor);
3571	skb_reserve(skb, MAX_TCP_HEADER);
3572	sk_forced_mem_schedule(sk, size: skb->truesize);
3573	/ FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). /
3574	tcp_init_nondata_skb(skb, seq: tp->write_seq,
3575	TCPHDR_ACK \| TCPHDR_FIN);
3576	tcp_queue_skb(sk, skb);
3577	}
3578	__tcp_push_pending_frames(sk, cur_mss: tcp_current_mss(sk), TCP_NAGLE_OFF);
3579	}
3580
3581	/ We get here when a process closes a file descriptor (either due to*
3582	* an explicit close() or as a byproduct of exit()'ing) and there
3583	* was unread data in the receive queue. This behavior is recommended
3584	* by RFC 2525, section 2.17. -DaveM
3585	*/
3586	void tcp_send_active_reset(struct sock *sk, gfp_t priority)
3587	{
3588	struct sk_buff *skb;
3589
3590	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
3591
3592	/ NOTE: No TCP options attached and we never retransmit this. /
3593	skb = alloc_skb(MAX_TCP_HEADER, priority);
3594	if (!skb) {
3595	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3596	return;
3597	}
3598
3599	/ Reserve space for headers and prepare control bits. /
3600	skb_reserve(skb, MAX_TCP_HEADER);
3601	tcp_init_nondata_skb(skb, seq: tcp_acceptable_seq(sk),
3602	TCPHDR_ACK \| TCPHDR_RST);
3603	tcp_mstamp_refresh(tcp_sk(sk));
3604	/ Send it off. /
3605	if (tcp_transmit_skb(sk, skb, clone_it: `0`, gfp_mask: priority))
3606	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
3607
3608	/ skb of trace_tcp_send_reset() keeps the skb that caused RST,*
3609	* skb here is different to the troublesome skb, so use NULL
3610	*/
3611	trace_tcp_send_reset(sk, NULL);
3612	}
3613
3614	/ Send a crossed SYN-ACK during socket establishment.*
3615	* WARNING: This routine must only be called when we have already sent
3616	* a SYN packet that crossed the incoming SYN that caused this routine
3617	* to get called. If this assumption fails then the initial rcv_wnd
3618	* and rcv_wscale values will not be correct.
3619	*/
3620	int tcp_send_synack(struct sock *sk)
3621	{
3622	struct sk_buff *skb;
3623
3624	skb = tcp_rtx_queue_head(sk);
3625	if (!skb \|\| !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
3626	pr_err("%s: wrong queue state\n", __func__);
3627	return -EFAULT;
3628	}
3629	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
3630	if (skb_cloned(skb)) {
3631	struct sk_buff *nskb;
3632
3633	tcp_skb_tsorted_save(skb) {
3634	nskb = skb_copy(skb, GFP_ATOMIC);
3635	} tcp_skb_tsorted_restore(skb);
3636	if (!nskb)
3637	return -ENOMEM;
3638	INIT_LIST_HEAD(list: &nskb->tcp_tsorted_anchor);
3639	tcp_highest_sack_replace(sk, old: skb, new: nskb);
3640	tcp_rtx_queue_unlink_and_free(skb, sk);
3641	__skb_header_release(skb: nskb);
3642	tcp_rbtree_insert(root: &sk->tcp_rtx_queue, skb: nskb);
3643	sk_wmem_queued_add(sk, val: nskb->truesize);
3644	sk_mem_charge(sk, size: nskb->truesize);
3645	skb = nskb;
3646	}
3647
3648	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_ACK;
3649	tcp_ecn_send_synack(sk, skb);
3650	}
3651	return tcp_transmit_skb(sk, skb, clone_it: `1`, GFP_ATOMIC);
3652	}
3653
3654	/**
3655	* tcp_make_synack - Allocate one skb and build a SYNACK packet.
3656	* @sk: listener socket
3657	* @dst: dst entry attached to the SYNACK. It is consumed and caller
3658	* should not use it again.
3659	* @req: request_sock pointer
3660	* @foc: cookie for tcp fast open
3661	* @synack_type: Type of synack to prepare
3662	* @syn_skb: SYN packet just received. It could be NULL for rtx case.
3663	*/
3664	struct sk_buff tcp_make_synack(const* struct sock sk, struct* dst_entry *dst,
3665	struct request_sock *req,
3666	struct tcp_fastopen_cookie *foc,
3667	enum tcp_synack_type synack_type,
3668	struct sk_buff *syn_skb)
3669	{
3670	struct inet_request_sock *ireq = inet_rsk(sk: req);
3671	const struct tcp_sock *tp = tcp_sk(sk);
3672	struct tcp_out_options opts;
3673	struct tcp_key key = {};
3674	struct sk_buff *skb;
3675	int tcp_header_size;
3676	struct tcphdr *th;
3677	int mss;
3678	u64 now;
3679
3680	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
3681	if (unlikely(!skb)) {
3682	dst_release(dst);
3683	return NULL;
3684	}
3685	/ Reserve space for headers. /
3686	skb_reserve(skb, MAX_TCP_HEADER);
3687
3688	switch (synack_type) {
3689	case TCP_SYNACK_NORMAL:
3690	skb_set_owner_w(skb, sk: req_to_sk(req));
3691	break;
3692	case TCP_SYNACK_COOKIE:
3693	/ Under synflood, we do not attach skb to a socket,*
3694	* to avoid false sharing.
3695	*/
3696	break;
3697	case TCP_SYNACK_FASTOPEN:
3698	/ sk is a const pointer, because we want to express multiple*
3699	* cpu might call us concurrently.
3700	* sk->sk_wmem_alloc in an atomic, we can promote to rw.
3701	*/
3702	skb_set_owner_w(skb, sk: (struct sock *)sk);
3703	break;
3704	}
3705	skb_dst_set(skb, dst);
3706
3707	mss = tcp_mss_clamp(tp, mss: dst_metric_advmss(dst));
3708
3709	memset(&opts, `0`, sizeof(opts));
3710	now = tcp_clock_ns();
3711	#ifdef CONFIG_SYN_COOKIES
3712	if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
3713	skb_set_delivery_time(skb, kt: cookie_init_timestamp(req, now),
3714	mono: true);
3715	else
3716	#endif
3717	{
3718	skb_set_delivery_time(skb, kt: now, mono: true);
3719	if (!tcp_rsk(req)->snt_synack) / Timestamp first SYNACK /
3720	tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
3721	}
3722
3723	#if defined(CONFIG_TCP_MD5SIG) \|\| defined(CONFIG_TCP_AO)
3724	rcu_read_lock();
3725	#endif
3726	if (tcp_rsk_used_ao(req)) {
3727	#ifdef CONFIG_TCP_AO
3728	struct tcp_ao_key *ao_key = NULL;
3729	u8 keyid = tcp_rsk(req)->ao_keyid;
3730
3731	ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
3732	keyid, -`1`);
3733	/ If there is no matching key - avoid sending anything,*
3734	* especially usigned segments. It could try harder and lookup
3735	* for another peer-matching key, but the peer has requested
3736	* ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
3737	*/
3738	if (unlikely(!ao_key)) {
3739	rcu_read_unlock();
3740	kfree_skb(skb);
3741	net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n",
3742	keyid);
3743	return NULL;
3744	}
3745	key.ao_key = ao_key;
3746	key.type = TCP_KEY_AO;
3747	#endif
3748	} else {
3749	#ifdef CONFIG_TCP_MD5SIG
3750	key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk,
3751	req_to_sk(req));
3752	if (key.md5_key)
3753	key.type = TCP_KEY_MD5;
3754	#endif
3755	}
3756	skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), type: PKT_HASH_TYPE_L4);
3757	/ bpf program will be interested in the tcp_flags /
3758	TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN \| TCPHDR_ACK;
3759	tcp_header_size = tcp_synack_options(sk, req, mss, skb, opts: &opts,
3760	key: &key, foc, synack_type, syn_skb)
3761	+ sizeof(*th);
3762
3763	skb_push(skb, len: tcp_header_size);
3764	skb_reset_transport_header(skb);
3765
3766	th = (struct tcphdr *)skb->data;
3767	memset(th, `0`, sizeof(struct tcphdr));
3768	th->syn = `1`;
3769	th->ack = `1`;
3770	tcp_ecn_make_synack(req, th);
3771	th->source = htons(ireq->ir_num);
3772	th->dest = ireq->ir_rmt_port;
3773	skb->mark = ireq->ir_mark;
3774	skb->ip_summed = CHECKSUM_PARTIAL;
3775	th->seq = htonl(tcp_rsk(req)->snt_isn);
3776	/ XXX data is queued and acked as is. No buffer/window check /
3777	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
3778
3779	/ RFC1323: The window in SYN & SYN/ACK segments is never scaled. /
3780	th->window = htons(min(req->rsk_rcv_wnd, `65535U`));
3781	tcp_options_write(th, NULL, tcprsk: tcp_rsk(req), opts: &opts, key: &key);
3782	th->doff = (tcp_header_size >> `2`);
3783	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
3784
3785	/ Okay, we have all we need - do the md5 hash if needed /
3786	if (tcp_key_is_md5(key: &key)) {
3787	#ifdef CONFIG_TCP_MD5SIG
3788	tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
3789	key.md5_key, req_to_sk(req), skb);
3790	#endif
3791	} else if (tcp_key_is_ao(key: &key)) {
3792	#ifdef CONFIG_TCP_AO
3793	tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location,
3794	key.ao_key, req, skb,
3795	opts.hash_location - (u8 *)th, `0`);
3796	#endif
3797	}
3798	#if defined(CONFIG_TCP_MD5SIG) \|\| defined(CONFIG_TCP_AO)
3799	rcu_read_unlock();
3800	#endif
3801
3802	bpf_skops_write_hdr_opt(sk: (struct sock *)sk, skb, req, syn_skb,
3803	synack_type, opts: &opts);
3804
3805	skb_set_delivery_time(skb, kt: now, mono: true);
3806	tcp_add_tx_delay(skb, tp);
3807
3808	return skb;
3809	}
3810	EXPORT_SYMBOL(tcp_make_synack);
3811
3812	static void tcp_ca_dst_init(struct sock sk, const* struct dst_entry *dst)
3813	{
3814	struct inet_connection_sock *icsk = inet_csk(sk);
3815	const struct tcp_congestion_ops *ca;
3816	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
3817
3818	if (ca_key == TCP_CA_UNSPEC)
3819	return;
3820
3821	rcu_read_lock();
3822	ca = tcp_ca_find_key(key: ca_key);
3823	if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
3824	bpf_module_put(data: icsk->icsk_ca_ops, owner: icsk->icsk_ca_ops->owner);
3825	icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
3826	icsk->icsk_ca_ops = ca;
3827	}
3828	rcu_read_unlock();
3829	}
3830
3831	/ Do all connect socket setups that can be done AF independent. /
3832	static void tcp_connect_init(struct sock *sk)
3833	{
3834	const struct dst_entry *dst = __sk_dst_get(sk);
3835	struct tcp_sock *tp = tcp_sk(sk);
3836	__u8 rcv_wscale;
3837	u32 rcv_wnd;
3838
3839	/ We'll fix this up when we get a response from the other end.*
3840	* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
3841	*/
3842	tp->tcp_header_len = sizeof(struct tcphdr);
3843	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
3844	tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
3845
3846	tcp_ao_connect_init(sk);
3847
3848	/ If user gave his TCP_MAXSEG, record it to clamp /
3849	if (tp->rx_opt.user_mss)
3850	tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
3851	tp->max_window = `0`;
3852	tcp_mtup_init(sk);
3853	tcp_sync_mss(sk, dst_mtu(dst));
3854
3855	tcp_ca_dst_init(sk, dst);
3856
3857	if (!tp->window_clamp)
3858	tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
3859	tp->advmss = tcp_mss_clamp(tp, mss: dst_metric_advmss(dst));
3860
3861	tcp_initialize_rcv_mss(sk);
3862
3863	/ limit the window selection if the user enforce a smaller rx buffer /
3864	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
3865	(tp->window_clamp > tcp_full_space(sk) \|\| tp->window_clamp == `0`))
3866	tp->window_clamp = tcp_full_space(sk);
3867
3868	rcv_wnd = tcp_rwnd_init_bpf(sk);
3869	if (rcv_wnd == `0`)
3870	rcv_wnd = dst_metric(dst, RTAX_INITRWND);
3871
3872	tcp_select_initial_window(sk, tcp_full_space(sk),
3873	tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : `0`),
3874	&tp->rcv_wnd,
3875	&tp->window_clamp,
3876	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
3877	&rcv_wscale,
3878	rcv_wnd);
3879
3880	tp->rx_opt.rcv_wscale = rcv_wscale;
3881	tp->rcv_ssthresh = tp->rcv_wnd;
3882
3883	WRITE_ONCE(sk->sk_err, `0`);
3884	sock_reset_flag(sk, flag: SOCK_DONE);
3885	tp->snd_wnd = `0`;
3886	tcp_init_wl(tp, seq: `0`);
3887	tcp_write_queue_purge(sk);
3888	tp->snd_una = tp->write_seq;
3889	tp->snd_sml = tp->write_seq;
3890	tp->snd_up = tp->write_seq;
3891	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
3892
3893	if (likely(!tp->repair))
3894	tp->rcv_nxt = `0`;
3895	else
3896	tp->rcv_tstamp = tcp_jiffies32;
3897	tp->rcv_wup = tp->rcv_nxt;
3898	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
3899
3900	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
3901	inet_csk(sk)->icsk_retransmits = `0`;
3902	tcp_clear_retrans(tp);
3903	}
3904
3905	static void tcp_connect_queue_skb(struct sock sk, struct* sk_buff *skb)
3906	{
3907	struct tcp_sock *tp = tcp_sk(sk);
3908	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
3909
3910	tcb->end_seq += skb->len;
3911	__skb_header_release(skb);
3912	sk_wmem_queued_add(sk, val: skb->truesize);
3913	sk_mem_charge(sk, size: skb->truesize);
3914	WRITE_ONCE(tp->write_seq, tcb->end_seq);
3915	tp->packets_out += tcp_skb_pcount(skb);
3916	}
3917
3918	/ Build and send a SYN with data and (cached) Fast Open cookie. However,*
3919	* queue a data-only packet after the regular SYN, such that regular SYNs
3920	* are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
3921	* only the SYN sequence, the data are retransmitted in the first ACK.
3922	* If cookie is not cached or other error occurs, falls back to send a
3923	* regular SYN with Fast Open cookie request option.
3924	*/
3925	static int tcp_send_syn_data(struct sock sk, struct* sk_buff *syn)
3926	{
3927	struct inet_connection_sock *icsk = inet_csk(sk);
3928	struct tcp_sock *tp = tcp_sk(sk);
3929	struct tcp_fastopen_request *fo = tp->fastopen_req;
3930	struct page_frag *pfrag = sk_page_frag(sk);
3931	struct sk_buff *syn_data;
3932	int space, err = `0`;
3933
3934	tp->rx_opt.mss_clamp = tp->advmss; / If MSS is not cached /
3935	if (!tcp_fastopen_cookie_check(sk, mss: &tp->rx_opt.mss_clamp, cookie: &fo->cookie))
3936	goto fallback;
3937
3938	/ MSS for SYN-data is based on cached MSS and bounded by PMTU and*
3939	* user-MSS. Reserve maximum option space for middleboxes that add
3940	* private TCP options. The cost is reduced data space in SYN :(
3941	*/
3942	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, mss: tp->rx_opt.mss_clamp);
3943	/ Sync mss_cache after updating the mss_clamp /
3944	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
3945
3946	space = __tcp_mtu_to_mss(sk, pmtu: icsk->icsk_pmtu_cookie) -
3947	MAX_TCP_OPTION_SPACE;
3948
3949	space = min_t(size_t, space, fo->size);
3950
3951	if (space &&
3952	!skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE),
3953	pfrag, prio: sk->sk_allocation))
3954	goto fallback;
3955	syn_data = tcp_stream_alloc_skb(sk, gfp: sk->sk_allocation, force_schedule: false);
3956	if (!syn_data)
3957	goto fallback;
3958	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
3959	if (space) {
3960	space = min_t(size_t, space, pfrag->size - pfrag->offset);
3961	space = tcp_wmem_schedule(sk, copy: space);
3962	}
3963	if (space) {
3964	space = copy_page_from_iter(page: pfrag->page, offset: pfrag->offset,
3965	bytes: space, i: &fo->data->msg_iter);
3966	if (unlikely(!space)) {
3967	tcp_skb_tsorted_anchor_cleanup(skb: syn_data);
3968	kfree_skb(skb: syn_data);
3969	goto fallback;
3970	}
3971	skb_fill_page_desc(skb: syn_data, i: `0`, page: pfrag->page,
3972	off: pfrag->offset, size: space);
3973	page_ref_inc(page: pfrag->page);
3974	pfrag->offset += space;
3975	skb_len_add(skb: syn_data, delta: space);
3976	skb_zcopy_set(skb: syn_data, uarg: fo->uarg, NULL);
3977	}
3978	/ No more data pending in inet_wait_for_connect() /
3979	if (space == fo->size)
3980	fo->data = NULL;
3981	fo->copied = space;
3982
3983	tcp_connect_queue_skb(sk, skb: syn_data);
3984	if (syn_data->len)
3985	tcp_chrono_start(sk, type: TCP_CHRONO_BUSY);
3986
3987	err = tcp_transmit_skb(sk, skb: syn_data, clone_it: `1`, gfp_mask: sk->sk_allocation);
3988
3989	skb_set_delivery_time(skb: syn, kt: syn_data->skb_mstamp_ns, mono: true);
3990
3991	/ Now full SYN+DATA was cloned and sent (or not),*
3992	* remove the SYN from the original skb (syn_data)
3993	* we keep in write queue in case of a retransmit, as we
3994	* also have the SYN packet (with no data) in the same queue.
3995	*/
3996	TCP_SKB_CB(syn_data)->seq++;
3997	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK \| TCPHDR_PSH;
3998	if (!err) {
3999	tp->syn_data = (fo->copied > `0`);
4000	tcp_rbtree_insert(root: &sk->tcp_rtx_queue, skb: syn_data);
4001	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
4002	goto done;
4003	}
4004
4005	/ data was not sent, put it in write_queue /
4006	__skb_queue_tail(list: &sk->sk_write_queue, newsk: syn_data);
4007	tp->packets_out -= tcp_skb_pcount(skb: syn_data);
4008
4009	fallback:
4010	/ Send a regular SYN with Fast Open cookie request option /
4011	if (fo->cookie.len > `0`)
4012	fo->cookie.len = `0`;
4013	err = tcp_transmit_skb(sk, skb: syn, clone_it: `1`, gfp_mask: sk->sk_allocation);
4014	if (err)
4015	tp->syn_fastopen = `0`;
4016	done:
4017	fo->cookie.len = -`1`; / Exclude Fast Open option for SYN retries /
4018	return err;
4019	}
4020
/* Build a SYN and send it off.
 *
 * Returns 0 on success (including the repair-mode shortcut), or:
 *   -EKEYREJECTED  conflicting or missing MD5 / TCP-AO key for the peer,
 *   -EHOSTUNREACH  rebuild_header() failed,
 *   -ENOBUFS       the SYN skb could not be allocated,
 *   -ECONNREFUSED  reported by the transmit path.
 * Any other transmit error is not returned: the retransmit timer is
 * armed and 0 is returned.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);

#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
	/* Has to be checked late, after setting daddr/saddr/ops.
	 * Return error if the peer has both a md5 and a tcp-ao key
	 * configured as this is ambiguous.
	 */
	if (unlikely(rcu_dereference_protected(tp->md5sig_info,
					       lockdep_sock_is_held(sk)))) {
		bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1);
		bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk);
		struct tcp_ao_info *ao_info;

		ao_info = rcu_dereference_check(tp->ao_info,
						lockdep_sock_is_held(sk));
		if (ao_info) {
			/* This is an extra check: tcp_ao_required() in
			 * tcp_v{4,6}_parse_md5_keys() should prevent adding
			 * md5 keys on ao_required socket.
			 */
			needs_ao |= ao_info->ao_required;
			WARN_ON_ONCE(ao_info->ao_required && needs_md5);
		}
		if (needs_md5 && needs_ao)
			return -EKEYREJECTED;

		/* If we have a matching md5 key and no matching tcp-ao key
		 * then free up ao_info if allocated.
		 */
		if (needs_md5) {
			tcp_ao_destroy_sock(sk, false);
		} else if (needs_ao) {
			tcp_clear_md5_list(sk);
			kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
						  lockdep_sock_is_held(sk)));
		}
	}
#endif
#ifdef CONFIG_TCP_AO
	if (unlikely(rcu_dereference_protected(tp->ao_info,
					       lockdep_sock_is_held(sk)))) {
		/* Don't allow connecting if ao is configured but no
		 * matching key is found.
		 */
		if (!tp->af_specific->ao_lookup(sk, sk, -1, -1))
			return -EKEYREJECTED;
	}
#endif

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	tcp_connect_init(sk);

	/* Repair mode: no SYN is sent, the connection is finished directly. */
	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tcp_mstamp_refresh(tp);
	tp->retrans_stamp = tcp_time_stamp_ts(tp);
	tcp_connect_queue_skb(sk, buff);
	tcp_ecn_send_syn(sk, buff);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
	tp->pushed_seq = tp->write_seq;
	/* If something is still waiting in the write queue, snd_nxt and
	 * pushed_seq must stop at the start of that skb.
	 */
	buff = tcp_send_head(sk);
	if (unlikely(buff)) {
		WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
		tp->pushed_seq	= TCP_SKB_CB(buff)->seq;
	}
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
EXPORT_SYMBOL(tcp_connect);
4122
4123	u32 tcp_delack_max(const struct sock *sk)
4124	{
4125	const struct dst_entry *dst = __sk_dst_get(sk);
4126	u32 delack_max = inet_csk(sk)->icsk_delack_max;
4127
4128	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) {
4129	u32 rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
4130	u32 delack_from_rto_min = max_t(int, `1`, rto_min - `1`);
4131
4132	delack_max = min_t(u32, delack_max, delack_from_rto_min);
4133	}
4134	return delack_max;
4135	}
4136
4137	/ Send out a delayed ack, the caller does the policy checking*
4138	* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
4139	* for details.
4140	*/
4141	void tcp_send_delayed_ack(struct sock *sk)
4142	{
4143	struct inet_connection_sock *icsk = inet_csk(sk);
4144	int ato = icsk->icsk_ack.ato;
4145	unsigned long timeout;
4146
4147	if (ato > TCP_DELACK_MIN) {
4148	const struct tcp_sock *tp = tcp_sk(sk);
4149	int max_ato = HZ / `2`;
4150
4151	if (inet_csk_in_pingpong_mode(sk) \|\|
4152	(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
4153	max_ato = TCP_DELACK_MAX;
4154
4155	/ Slow path, intersegment interval is "high". /
4156
4157	/ If some rtt estimate is known, use it to bound delayed ack.*
4158	* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
4159	* directly.
4160	*/
4161	if (tp->srtt_us) {
4162	int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> `3`),
4163	TCP_DELACK_MIN);
4164
4165	if (rtt < max_ato)
4166	max_ato = rtt;
4167	}
4168
4169	ato = min(ato, max_ato);
4170	}
4171
4172	ato = min_t(u32, ato, tcp_delack_max(sk));
4173
4174	/ Stay within the limit we were given /
4175	timeout = jiffies + ato;
4176
4177	/ Use new timeout only if there wasn't a older one earlier. /
4178	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
4179	/ If delack timer is about to expire, send ACK now. /
4180	if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> `2`))) {
4181	tcp_send_ack(sk);
4182	return;
4183	}
4184
4185	if (!time_before(timeout, icsk->icsk_ack.timeout))
4186	timeout = icsk->icsk_ack.timeout;
4187	}
4188	icsk->icsk_ack.pending \|= ICSK_ACK_SCHED \| ICSK_ACK_TIMER;
4189	icsk->icsk_ack.timeout = timeout;
4190	sk_reset_timer(sk, timer: &icsk->icsk_delack_timer, expires: timeout);
4191	}
4192
4193	/ This routine sends an ack and also updates the window. /
4194	void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
4195	{
4196	struct sk_buff *buff;
4197
4198	/ If we have been reset, we may not send again. /
4199	if (sk->sk_state == TCP_CLOSE)
4200	return;
4201
4202	/ We are not putting this on the write queue, so*
4203	* tcp_transmit_skb() will set the ownership to this
4204	* sock.
4205	*/
4206	buff = alloc_skb(MAX_TCP_HEADER,
4207	priority: sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
4208	if (unlikely(!buff)) {
4209	struct inet_connection_sock *icsk = inet_csk(sk);
4210	unsigned long delay;
4211
4212	delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
4213	if (delay < TCP_RTO_MAX)
4214	icsk->icsk_ack.retry++;
4215	inet_csk_schedule_ack(sk);
4216	icsk->icsk_ack.ato = TCP_ATO_MIN;
4217	inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, when: delay, TCP_RTO_MAX);
4218	return;
4219	}
4220
4221	/ Reserve space for headers and prepare control bits. /
4222	skb_reserve(skb: buff, MAX_TCP_HEADER);
4223	tcp_init_nondata_skb(skb: buff, seq: tcp_acceptable_seq(sk), TCPHDR_ACK);
4224
4225	/ We do not want pure acks influencing TCP Small Queues or fq/pacing*
4226	* too much.
4227	* SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
4228	*/
4229	skb_set_tcp_pure_ack(skb: buff);
4230
4231	/ Send it off, this clears delayed acks for us. /
4232	__tcp_transmit_skb(sk, skb: buff, clone_it: `0`, gfp_mask: (__force gfp_t)`0`, rcv_nxt);
4233	}
4234	EXPORT_SYMBOL_GPL(__tcp_send_ack);
4235
4236	void tcp_send_ack(struct sock *sk)
4237	{
4238	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
4239	}
4240
4241	/ This routine sends a packet with an out of date sequence*
4242	* number. It assumes the other end will try to ack it.
4243	*
4244	* Question: what should we make while urgent mode?
4245	* 4.4BSD forces sending single byte of data. We cannot send
4246	* out of window data, because we have SND.NXT==SND.MAX...
4247	*
4248	* Current solution: to send TWO zero-length segments in urgent mode:
4249	* one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
4250	* out-of-date with SND.UNA-1 to probe window.
4251	*/
4252	static int tcp_xmit_probe_skb(struct sock sk, int* urgent, int mib)
4253	{
4254	struct tcp_sock *tp = tcp_sk(sk);
4255	struct sk_buff *skb;
4256
4257	/ We don't queue it, tcp_transmit_skb() sets ownership. /
4258	skb = alloc_skb(MAX_TCP_HEADER,
4259	priority: sk_gfp_mask(sk, GFP_ATOMIC \| __GFP_NOWARN));
4260	if (!skb)
4261	return -`1`;
4262
4263	/ Reserve space for headers and set control bits. /
4264	skb_reserve(skb, MAX_TCP_HEADER);
4265	/ Use a previous sequence. This should cause the other*
4266	* end to send an ack. Don't queue or clone SKB, just
4267	* send it.
4268	*/
4269	tcp_init_nondata_skb(skb, seq: tp->snd_una - !urgent, TCPHDR_ACK);
4270	NET_INC_STATS(sock_net(sk), mib);
4271	return tcp_transmit_skb(sk, skb, clone_it: `0`, gfp_mask: (__force gfp_t)`0`);
4272	}
4273
4274	/ Called from setsockopt( ... TCP_REPAIR ) /
4275	void tcp_send_window_probe(struct sock *sk)
4276	{
4277	if (sk->sk_state == TCP_ESTABLISHED) {
4278	tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - `1`;
4279	tcp_mstamp_refresh(tcp_sk(sk));
4280	tcp_xmit_probe_skb(sk, urgent: `0`, mib: LINUX_MIB_TCPWINPROBE);
4281	}
4282	}
4283
4284	/ Initiate keepalive or window probe from timer. /
4285	int tcp_write_wakeup(struct sock sk, int* mib)
4286	{
4287	struct tcp_sock *tp = tcp_sk(sk);
4288	struct sk_buff *skb;
4289
4290	if (sk->sk_state == TCP_CLOSE)
4291	return -`1`;
4292
4293	skb = tcp_send_head(sk);
4294	if (skb && before(TCP_SKB_CB(skb)->seq, seq2: tcp_wnd_end(tp))) {
4295	int err;
4296	unsigned int mss = tcp_current_mss(sk);
4297	unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
4298
4299	if (before(seq1: tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
4300	tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
4301
4302	/ We are probing the opening of a window*
4303	* but the window size is != 0
4304	* must have been a result SWS avoidance ( sender )
4305	*/
4306	if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq \|\|
4307	skb->len > mss) {
4308	seg_size = min(seg_size, mss);
4309	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
4310	if (tcp_fragment(sk, tcp_queue: TCP_FRAG_IN_WRITE_QUEUE,
4311	skb, len: seg_size, mss_now: mss, GFP_ATOMIC))
4312	return -`1`;
4313	} else if (!tcp_skb_pcount(skb))
4314	tcp_set_skb_tso_segs(skb, mss_now: mss);
4315
4316	TCP_SKB_CB(skb)->tcp_flags \|= TCPHDR_PSH;
4317	err = tcp_transmit_skb(sk, skb, clone_it: `1`, GFP_ATOMIC);
4318	if (!err)
4319	tcp_event_new_data_sent(sk, skb);
4320	return err;
4321	} else {
4322	if (between(seq1: tp->snd_up, seq2: tp->snd_una + `1`, seq3: tp->snd_una + `0xFFFF`))
4323	tcp_xmit_probe_skb(sk, urgent: `1`, mib);
4324	return tcp_xmit_probe_skb(sk, urgent: `0`, mib);
4325	}
4326	}
4327
4328	/ A window probe timeout has occurred. If window is not closed send*
4329	* a partial packet else a zero probe.
4330	*/
4331	void tcp_send_probe0(struct sock *sk)
4332	{
4333	struct inet_connection_sock *icsk = inet_csk(sk);
4334	struct tcp_sock *tp = tcp_sk(sk);
4335	struct net *net = sock_net(sk);
4336	unsigned long timeout;
4337	int err;
4338
4339	err = tcp_write_wakeup(sk, mib: LINUX_MIB_TCPWINPROBE);
4340
4341	if (tp->packets_out \|\| tcp_write_queue_empty(sk)) {
4342	/ Cancel probe timer, if it is not required. /
4343	icsk->icsk_probes_out = `0`;
4344	icsk->icsk_backoff = `0`;
4345	icsk->icsk_probes_tstamp = `0`;
4346	return;
4347	}
4348
4349	icsk->icsk_probes_out++;
4350	if (err <= `0`) {
4351	if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
4352	icsk->icsk_backoff++;
4353	timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
4354	} else {
4355	/ If packet was not sent due to local congestion,*
4356	* Let senders fight for local resources conservatively.
4357	*/
4358	timeout = TCP_RESOURCE_PROBE_INTERVAL;
4359	}
4360
4361	timeout = tcp_clamp_probe0_to_user_timeout(sk, when: timeout);
4362	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when: timeout, TCP_RTO_MAX);
4363	}
4364
4365	int tcp_rtx_synack(const struct sock sk, struct* request_sock *req)
4366	{
4367	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
4368	struct flowi fl;
4369	int res;
4370
4371	/ Paired with WRITE_ONCE() in sock_setsockopt() /
4372	if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
4373	WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
4374	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
4375	NULL);
4376	if (!res) {
4377	TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
4378	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
4379	if (unlikely(tcp_passive_fastopen(sk))) {
4380	/ sk has const attribute because listeners are lockless.*
4381	* However in this case, we are dealing with a passive fastopen
4382	* socket thus we can change total_retrans value.
4383	*/
4384	tcp_sk_rw(sk)->total_retrans++;
4385	}
4386	trace_tcp_retransmit_synack(sk, req);
4387	}
4388	return res;
4389	}
4390	EXPORT_SYMBOL(tcp_rtx_synack);
4391

/* source code of linux/net/ipv4/tcp_output.c */