tcp_input.c source code [linux/net/ipv4/tcp_input.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* INET An implementation of the TCP/IP protocol suite for the LINUX
4	* operating system. INET is implemented using the BSD Socket
5	* interface as the means of communication with the user level.
6	*
7	* Implementation of the Transmission Control Protocol(TCP).
8	*
9	* Authors: Ross Biro
10	* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11	* Mark Evans, <evansmp@uhura.aston.ac.uk>
12	* Corey Minyard <wf-rch!minyard@relay.EU.net>
13	* Florian La Roche, <flla@stud.uni-sb.de>
14	* Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
15	* Linus Torvalds, <torvalds@cs.helsinki.fi>
16	* Alan Cox, <gw4pts@gw4pts.ampr.org>
17	* Matthew Dillon, <dillon@apollo.west.oic.com>
18	* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
19	* Jorge Cwik, <jorge@laser.satlink.net>
20	*/
21
22	/*
23	* Changes:
24	* Pedro Roque : Fast Retransmit/Recovery.
25	* Two receive queues.
26	* Retransmit queue handled by TCP.
27	* Better retransmit timer handling.
28	* New congestion avoidance.
29	* Header prediction.
30	* Variable renaming.
31	*
32	* Eric : Fast Retransmit.
33	* Randy Scott : MSS option defines.
34	* Eric Schenk : Fixes to slow start algorithm.
35	* Eric Schenk : Yet another double ACK bug.
36	* Eric Schenk : Delayed ACK bug fixes.
37	* Eric Schenk : Floyd style fast retrans war avoidance.
38	* David S. Miller : Don't allow zero congestion window.
39	* Eric Schenk : Fix retransmitter so that it sends
40	* next packet on ack of previous packet.
41	* Andi Kleen : Moved open_request checking here
42	* and process RSTs for open_requests.
43	* Andi Kleen : Better prune_queue, and other fixes.
44	* Andrey Savochkin: Fix RTT measurements in the presence of
45	* timestamps.
46	* Andrey Savochkin: Check sequence numbers correctly when
47	* removing SACKs due to in sequence incoming
48	* data segments.
49	* Andi Kleen: Make sure we never ack data there is not
50	* enough room for. Also make this condition
51	* a fatal error if it might still happen.
52	* Andi Kleen: Add tcp_measure_rcv_mss to make
53	* connections with MSS<min(MTU,ann. MSS)
54	* work without delayed acks.
55	* Andi Kleen: Process packets with PSH set in the
56	* fast path.
57	* J Hadi Salim: ECN support
58	* Andrei Gurtov,
59	* Pasi Sarolahti,
60	* Panu Kuhlberg: Experimental audit of TCP (re)transmission
61	* engine. Lots of bugs are found.
62	* Pasi Sarolahti: F-RTO for dealing with spurious RTOs
63	*/
64
65	#define pr_fmt(fmt) "TCP: " fmt
66
67	#include <linux/mm.h>
68	#include <linux/slab.h>
69	#include <linux/module.h>
70	#include <linux/sysctl.h>
71	#include <linux/kernel.h>
72	#include <linux/prefetch.h>
73	#include <net/dst.h>
74	#include <net/tcp.h>
75	#include <net/proto_memory.h>
76	#include <net/inet_common.h>
77	#include <linux/ipsec.h>
78	#include <linux/unaligned.h>
79	#include <linux/errqueue.h>
80	#include <trace/events/tcp.h>
81	#include <linux/jump_label_ratelimit.h>
82	#include <net/busy_poll.h>
83	#include <net/mptcp.h>
84
85	int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
86
/* Bit flags describing an incoming segment/ACK; see the per-flag notes. */
#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */
#define FLAG_DSACK_TLP		0x20000 /* DSACK for tail loss probe */
#define FLAG_TS_PROGRESS	0x40000 /* Positive timestamp delta */

/* Composite masks built from the single-bit flags above. */
#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

/* TCP header flag-word masks. */
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */
118
119	#if IS_ENABLED(CONFIG_TLS_DEVICE)
120	static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
121
122	void clean_acked_data_enable(struct tcp_sock *tp,
123	void (cad)(struct* sock *sk, u32 ack_seq))
124	{
125	tp->tcp_clean_acked = cad;
126	static_branch_deferred_inc(&clean_acked_data_enabled);
127	}
128	EXPORT_SYMBOL_GPL(clean_acked_data_enable);
129
130	void clean_acked_data_disable(struct tcp_sock *tp)
131	{
132	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
133	tp->tcp_clean_acked = NULL;
134	}
135	EXPORT_SYMBOL_GPL(clean_acked_data_disable);
136
137	void clean_acked_data_flush(void)
138	{
139	static_key_deferred_flush(&clean_acked_data_enabled);
140	}
141	EXPORT_SYMBOL_GPL(clean_acked_data_flush);
142	#endif
143
#ifdef CONFIG_CGROUP_BPF
/* Run the BPF_SOCK_OPS_PARSE_HDR_OPT_CB sock_ops program for @skb.
 *
 * Nothing is done unless the socket asked to parse all header options, or
 * asked to parse unknown options and one was seen (rx_opt.saw_unknown).
 * Sockets in SYN_RECV, SYN_SENT and LISTEN state are skipped.
 */
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
	bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
		BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
				       BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
	bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
						    BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
	struct bpf_sock_ops_kern sock_ops;

	if (likely(!unknown_opt && !parse_all_opt))
		return;

	/* The skb will be handled in the
	 * bpf_skops_established() or
	 * bpf_skops_write_hdr_opt().
	 */
	switch (sk->sk_state) {
	case TCP_SYN_RECV:
	case TCP_SYN_SENT:
	case TCP_LISTEN:
		return;
	}

	sock_owned_by_me(sk);

	/* Only the part of sock_ops that precedes 'temp' is zeroed. */
	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
	sock_ops.is_fullsock = 1;
	sock_ops.is_locked_tcp_sock = 1;
	sock_ops.sk = sk;
	bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}

/* Run the sock_ops program for operation @bpf_op on @sk.
 * @skb may be NULL, in which case no skb is attached to the context.
 */
static void bpf_skops_established(struct sock *sk, int bpf_op,
				  struct sk_buff *skb)
{
	struct bpf_sock_ops_kern sock_ops;

	sock_owned_by_me(sk);

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = bpf_op;
	sock_ops.is_fullsock = 1;
	sock_ops.is_locked_tcp_sock = 1;
	sock_ops.sk = sk;
	/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
	if (skb)
		bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));

	BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
}
#else
/* No-op stubs used when CONFIG_CGROUP_BPF is not set. */
static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
{
}

static void bpf_skops_established(struct sock *sk, int bpf_op,
				  struct sk_buff *skb)
{
}
#endif
208
209	static __cold void tcp_gro_dev_warn(const struct sock sk, const* struct sk_buff *skb,
210	unsigned int len)
211	{
212	struct net_device *dev;
213
214	rcu_read_lock();
215	dev = dev_get_by_index_rcu(net: sock_net(sk), ifindex: skb->skb_iif);
216	if (!dev \|\| len >= READ_ONCE(dev->mtu))
217	pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
218	dev ? dev->name : "Unknown driver");
219	rcu_read_unlock();
220	}
221
222	/ Adapt the MSS value used to make delayed ack decision to the*
223	* real world.
224	*/
225	static void tcp_measure_rcv_mss(struct sock sk, const* struct sk_buff *skb)
226	{
227	struct inet_connection_sock *icsk = inet_csk(sk);
228	const unsigned int lss = icsk->icsk_ack.last_seg_size;
229	unsigned int len;
230
231	icsk->icsk_ack.last_seg_size = `0`;
232
233	/ skb->len may jitter because of SACKs, even if peer*
234	* sends good full-sized frames.
235	*/
236	len = skb_shinfo(skb)->gso_size ? : skb->len;
237	if (len >= icsk->icsk_ack.rcv_mss) {
238	/ Note: divides are still a bit expensive.*
239	* For the moment, only adjust scaling_ratio
240	* when we update icsk_ack.rcv_mss.
241	*/
242	if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
243	u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
244	u8 old_ratio = tcp_sk(sk)->scaling_ratio;
245
246	do_div(val, skb->truesize);
247	tcp_sk(sk)->scaling_ratio = val ? val : `1`;
248
249	if (old_ratio != tcp_sk(sk)->scaling_ratio) {
250	struct tcp_sock *tp = tcp_sk(sk);
251
252	val = tcp_win_from_space(sk, space: sk->sk_rcvbuf);
253	tcp_set_window_clamp(sk, val);
254
255	if (tp->window_clamp < tp->rcvq_space.space)
256	tp->rcvq_space.space = tp->window_clamp;
257	}
258	}
259	icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
260	tcp_sk(sk)->advmss);
261	/ Account for possibly-removed options /
262	DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE,
263	tcp_gro_dev_warn, sk, skb, len);
264	/ If the skb has a len of exactly 1MSS and has the PSH bit
265	* set then it is likely the end of an application write. So
266	* more data may not be arriving soon, and yet the data sender
267	* may be waiting for an ACK if cwnd-bound or using TX zero
268	* copy. So we set ICSK_ACK_PUSHED here so that
269	* tcp_cleanup_rbuf() will send an ACK immediately if the app
270	* reads all of the data and is not ping-pong. If len > MSS
271	* then this logic does not matter (and does not hurt) because
272	* tcp_cleanup_rbuf() will always ACK immediately if the app
273	* reads data and there is more than an MSS of unACKed data.
274	*/
275	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
276	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
277	} else {
278	/ Otherwise, we make more careful check taking into account,*
279	* that SACKs block is variable.
280	*
281	* "len" is invariant segment length, including TCP header.
282	*/
283	len += skb->data - skb_transport_header(skb);
284	if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) \|\|
285	/ If PSH is not set, packet should be*
286	* full sized, provided peer TCP is not badly broken.
287	* This observation (if it is correct 8)) allows
288	* to handle super-low mtu links fairly.
289	*/
290	(len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
291	!(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
292	/ Subtract also invariant (if peer is RFC compliant),*
293	* tcp header plus fixed timestamp option length.
294	* Resulting "len" is MSS free of SACK jitter.
295	*/
296	len -= tcp_sk(sk)->tcp_header_len;
297	icsk->icsk_ack.last_seg_size = len;
298	if (len == lss) {
299	icsk->icsk_ack.rcv_mss = len;
300	return;
301	}
302	}
303	if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
304	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED2;
305	icsk->icsk_ack.pending \|= ICSK_ACK_PUSHED;
306	}
307	}
308
309	static void tcp_incr_quickack(struct sock sk, unsigned* int max_quickacks)
310	{
311	struct inet_connection_sock *icsk = inet_csk(sk);
312	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (`2` * icsk->icsk_ack.rcv_mss);
313
314	if (quickacks == `0`)
315	quickacks = `2`;
316	quickacks = min(quickacks, max_quickacks);
317	if (quickacks > icsk->icsk_ack.quick)
318	icsk->icsk_ack.quick = quickacks;
319	}
320
321	static void tcp_enter_quickack_mode(struct sock sk, unsigned* int max_quickacks)
322	{
323	struct inet_connection_sock *icsk = inet_csk(sk);
324
325	tcp_incr_quickack(sk, max_quickacks);
326	inet_csk_exit_pingpong_mode(sk);
327	icsk->icsk_ack.ato = TCP_ATO_MIN;
328	}
329
330	/ Send ACKs quickly, if "quick" count is not exhausted*
331	* and the session is not interactive.
332	*/
333
334	static bool tcp_in_quickack_mode(struct sock *sk)
335	{
336	const struct inet_connection_sock *icsk = inet_csk(sk);
337
338	return icsk->icsk_ack.dst_quick_ack \|\|
339	(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
340	}
341
342	static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
343	{
344	if (tcp_ecn_mode_rfc3168(tp))
345	tp->ecn_flags \|= TCP_ECN_QUEUE_CWR;
346	}
347
348	static void tcp_ecn_accept_cwr(struct sock sk, const* struct sk_buff *skb)
349	{
350	if (tcp_hdr(skb)->cwr) {
351	tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
352
353	/ If the sender is telling us it has entered CWR, then its*
354	* cwnd may be very low (even just 1 packet), so we should ACK
355	* immediately.
356	*/
357	if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
358	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
359	}
360	}
361
362	static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
363	{
364	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
365	}
366
367	static void tcp_data_ecn_check(struct sock sk, const* struct sk_buff *skb)
368	{
369	struct tcp_sock *tp = tcp_sk(sk);
370
371	if (tcp_ecn_disabled(tp))
372	return;
373
374	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
375	case INET_ECN_NOT_ECT:
376	/ Funny extension: if ECT is not set on a segment,*
377	* and we already seen ECT on a previous segment,
378	* it is probably a retransmit.
379	*/
380	if (tp->ecn_flags & TCP_ECN_SEEN)
381	tcp_enter_quickack_mode(sk, max_quickacks: `2`);
382	break;
383	case INET_ECN_CE:
384	if (tcp_ca_needs_ecn(sk))
385	tcp_ca_event(sk, event: CA_EVENT_ECN_IS_CE);
386
387	if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
388	/ Better not delay acks, sender can have a very low cwnd /
389	tcp_enter_quickack_mode(sk, max_quickacks: `2`);
390	tp->ecn_flags \|= TCP_ECN_DEMAND_CWR;
391	}
392	tp->ecn_flags \|= TCP_ECN_SEEN;
393	break;
394	default:
395	if (tcp_ca_needs_ecn(sk))
396	tcp_ca_event(sk, event: CA_EVENT_ECN_NO_CE);
397	tp->ecn_flags \|= TCP_ECN_SEEN;
398	break;
399	}
400	}
401
402	static void tcp_ecn_rcv_synack(struct tcp_sock tp, const* struct tcphdr *th)
403	{
404	if (tcp_ecn_mode_rfc3168(tp) && (!th->ece \|\| th->cwr))
405	tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
406	}
407
408	static void tcp_ecn_rcv_syn(struct tcp_sock tp, const* struct tcphdr *th)
409	{
410	if (tcp_ecn_mode_rfc3168(tp) && (!th->ece \|\| !th->cwr))
411	tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
412	}
413
414	static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock tp, const* struct tcphdr *th)
415	{
416	if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
417	return true;
418	return false;
419	}
420
421	static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
422	{
423	tp->delivered_ce += ecn_count;
424	}
425
426	/ Updates the delivered and delivered_ce counts /
427	static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
428	bool ece_ack)
429	{
430	tp->delivered += delivered;
431	if (ece_ack)
432	tcp_count_delivered_ce(tp, ecn_count: delivered);
433	}
434
435	/ Buffer size and advertised window tuning.*
436	*
437	* 1. Tuning sk->sk_sndbuf, when connection enters established state.
438	*/
439
440	static void tcp_sndbuf_expand(struct sock *sk)
441	{
442	const struct tcp_sock *tp = tcp_sk(sk);
443	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
444	int sndmem, per_mss;
445	u32 nr_segs;
446
447	/ Worst case is non GSO/TSO : each frame consumes one skb*
448	* and skb->head is kmalloced using power of two area of memory
449	*/
450	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
451	MAX_TCP_HEADER +
452	SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
453
454	per_mss = roundup_pow_of_two(per_mss) +
455	SKB_DATA_ALIGN(sizeof(struct sk_buff));
456
457	nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp));
458	nr_segs = max_t(u32, nr_segs, tp->reordering + `1`);
459
460	/ Fast Recovery (RFC 5681 3.2) :*
461	* Cubic needs 1.7 factor, rounded to 2 to include
462	* extra cushion (application might react slowly to EPOLLOUT)
463	*/
464	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : `2`;
465	sndmem = nr_segs per_mss;
466
467	if (sk->sk_sndbuf < sndmem)
468	WRITE_ONCE(sk->sk_sndbuf,
469	min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[`2`])));
470	}
471
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split into two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case tcp_full_space() - window_clamp
 * is reserved for "application" buffer. The smaller window_clamp is,
 * the smoother our behaviour from viewpoint of network, but the lower
 * throughput and the higher sensitivity of the connection to losses. 8)
 *
 * rcv_ssthresh is a stricter window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */
496
497	/ Slow part of check#2. /
498	static int __tcp_grow_window(const struct sock sk, const* struct sk_buff *skb,
499	unsigned int skbtruesize)
500	{
501	const struct tcp_sock *tp = tcp_sk(sk);
502	/ Optimize this! /
503	int truesize = tcp_win_from_space(sk, space: skbtruesize) >> `1`;
504	int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[`2`])) >> `1`;
505
506	while (tp->rcv_ssthresh <= window) {
507	if (truesize <= skb->len)
508	return `2` * inet_csk(sk)->icsk_ack.rcv_mss;
509
510	truesize >>= `1`;
511	window >>= `1`;
512	}
513	return `0`;
514	}
515
516	/ Even if skb appears to have a bad len/truesize ratio, TCP coalescing*
517	* can play nice with us, as sk_buff and skb->head might be either
518	* freed or shared with up to MAX_SKB_FRAGS segments.
519	* Only give a boost to drivers using page frag(s) to hold the frame(s),
520	* and if no payload was pulled in skb->head before reaching us.
521	*/
522	static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
523	{
524	u32 truesize = skb->truesize;
525
526	if (adjust && !skb_headlen(skb)) {
527	truesize -= SKB_TRUESIZE(skb_end_offset(skb));
528	/ paranoid check, some drivers might be buggy /
529	if (unlikely((int)truesize < (int)skb->len))
530	truesize = skb->truesize;
531	}
532	return truesize;
533	}
534
535	static void tcp_grow_window(struct sock sk, const* struct sk_buff *skb,
536	bool adjust)
537	{
538	struct tcp_sock *tp = tcp_sk(sk);
539	int room;
540
541	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
542
543	if (room <= `0`)
544	return;
545
546	/ Check #1 /
547	if (!tcp_under_memory_pressure(sk)) {
548	unsigned int truesize = truesize_adjust(adjust, skb);
549	int incr;
550
551	/ Check #2. Increase window, if skb with such overhead*
552	* will fit to rcvbuf in future.
553	*/
554	if (tcp_win_from_space(sk, space: truesize) <= skb->len)
555	incr = `2` * tp->advmss;
556	else
557	incr = __tcp_grow_window(sk, skb, skbtruesize: truesize);
558
559	if (incr) {
560	incr = max_t(int, incr, `2` * skb->len);
561	tp->rcv_ssthresh += min(room, incr);
562	inet_csk(sk)->icsk_ack.quick \|= `1`;
563	}
564	} else {
565	/ Under pressure:*
566	* Adjust rcv_ssthresh according to reserved mem
567	*/
568	tcp_adjust_rcv_ssthresh(sk);
569	}
570	}
571
572	/ 3. Try to fixup all. It is made immediately after connection enters*
573	* established state.
574	*/
575	static void tcp_init_buffer_space(struct sock *sk)
576	{
577	int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
578	struct tcp_sock *tp = tcp_sk(sk);
579	int maxwin;
580
581	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
582	tcp_sndbuf_expand(sk);
583
584	tcp_mstamp_refresh(tp);
585	tp->rcvq_space.time = tp->tcp_mstamp;
586	tp->rcvq_space.seq = tp->copied_seq;
587
588	maxwin = tcp_full_space(sk);
589
590	if (tp->window_clamp >= maxwin) {
591	WRITE_ONCE(tp->window_clamp, maxwin);
592
593	if (tcp_app_win && maxwin > `4` * tp->advmss)
594	WRITE_ONCE(tp->window_clamp,
595	max(maxwin - (maxwin >> tcp_app_win),
596	`4` * tp->advmss));
597	}
598
599	/ Force reservation of one segment. /
600	if (tcp_app_win &&
601	tp->window_clamp > `2` * tp->advmss &&
602	tp->window_clamp + tp->advmss > maxwin)
603	WRITE_ONCE(tp->window_clamp,
604	max(`2` * tp->advmss, maxwin - tp->advmss));
605
606	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
607	tp->snd_cwnd_stamp = tcp_jiffies32;
608	tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
609	(u32)TCP_INIT_CWND * tp->advmss);
610	}
611
612	/ 4. Recalculate window clamp after socket hit its memory bounds. /
613	static void tcp_clamp_window(struct sock *sk)
614	{
615	struct tcp_sock *tp = tcp_sk(sk);
616	struct inet_connection_sock *icsk = inet_csk(sk);
617	struct net *net = sock_net(sk);
618	int rmem2;
619
620	icsk->icsk_ack.quick = `0`;
621	rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[`2`]);
622
623	if (sk->sk_rcvbuf < rmem2 &&
624	!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
625	!tcp_under_memory_pressure(sk) &&
626	sk_memory_allocated(sk) < sk_prot_mem_limits(sk, index: `0`)) {
627	WRITE_ONCE(sk->sk_rcvbuf,
628	min(atomic_read(&sk->sk_rmem_alloc), rmem2));
629	}
630	if (atomic_read(v: &sk->sk_rmem_alloc) > sk->sk_rcvbuf)
631	tp->rcv_ssthresh = min(tp->window_clamp, `2U` * tp->advmss);
632	}
633
634	/ Initialize RCV_MSS value.*
635	* RCV_MSS is an our guess about MSS used by the peer.
636	* We haven't any direct information about the MSS.
637	* It's better to underestimate the RCV_MSS rather than overestimate.
638	* Overestimations make us ACKing less frequently than needed.
639	* Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
640	*/
641	void tcp_initialize_rcv_mss(struct sock *sk)
642	{
643	const struct tcp_sock *tp = tcp_sk(sk);
644	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
645
646	hint = min(hint, tp->rcv_wnd / `2`);
647	hint = min(hint, TCP_MSS_DEFAULT);
648	hint = max(hint, TCP_MIN_MSS);
649
650	inet_csk(sk)->icsk_ack.rcv_mss = hint;
651	}
652	EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
653
654	/ Receiver "autotuning" code.*
655	*
656	* The algorithm for RTT estimation w/o timestamps is based on
657	* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
658	* <https://public.lanl.gov/radiant/pubs.html#DRS>
659	*
660	* More detail on this code can be found at
661	* <http://staff.psc.edu/jheffner/>,
662	* though this reference is out of date. A new paper
663	* is pending.
664	*/
665	static void tcp_rcv_rtt_update(struct tcp_sock tp, u32 sample, int* win_dep)
666	{
667	u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
668	long m = sample << `3`;
669
670	if (old_sample == `0` \|\| m < old_sample) {
671	new_sample = m;
672	} else {
673	/ If we sample in larger samples in the non-timestamp*
674	* case, we could grossly overestimate the RTT especially
675	* with chatty applications or bulk transfer apps which
676	* are stalled on filesystem I/O.
677	*
678	* Also, since we are only going for a minimum in the
679	* non-timestamp case, we do not smooth things out
680	* else with timestamps disabled convergence takes too
681	* long.
682	*/
683	if (win_dep)
684	return;
685	/ Do not use this sample if receive queue is not empty. /
686	if (tp->rcv_nxt != tp->copied_seq)
687	return;
688	new_sample = old_sample - (old_sample >> `3`) + sample;
689	}
690
691	tp->rcv_rtt_est.rtt_us = new_sample;
692	}
693
694	static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
695	{
696	u32 delta_us;
697
698	if (tp->rcv_rtt_est.time == `0`)
699	goto new_measure;
700	if (before(seq1: tp->rcv_nxt, seq2: tp->rcv_rtt_est.seq))
701	return;
702	delta_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: tp->rcv_rtt_est.time);
703	if (!delta_us)
704	delta_us = `1`;
705	tcp_rcv_rtt_update(tp, sample: delta_us, win_dep: `1`);
706
707	new_measure:
708	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
709	tp->rcv_rtt_est.time = tp->tcp_mstamp;
710	}
711
712	static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
713	{
714	u32 delta, delta_us;
715
716	delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
717	if (tp->tcp_usec_ts)
718	return delta;
719
720	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
721	if (!delta)
722	delta = min_delta;
723	delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
724	return delta_us;
725	}
726	return -`1`;
727	}
728
729	static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
730	const struct sk_buff *skb)
731	{
732	struct tcp_sock *tp = tcp_sk(sk);
733
734	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
735	return;
736	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
737
738	if (TCP_SKB_CB(skb)->end_seq -
739	TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
740	s32 delta = tcp_rtt_tsopt_us(tp, min_delta: `0`);
741
742	if (delta > `0`)
743	tcp_rcv_rtt_update(tp, sample: delta, win_dep: `0`);
744	}
745	}
746
747	static void tcp_rcvbuf_grow(struct sock *sk)
748	{
749	const struct net *net = sock_net(sk);
750	struct tcp_sock *tp = tcp_sk(sk);
751	int rcvwin, rcvbuf, cap;
752
753	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) \|\|
754	(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
755	return;
756
757	/ slow start: allow the sender to double its rate. /
758	rcvwin = tp->rcvq_space.space << `1`;
759
760	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
761	rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
762
763	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[`2`]);
764
765	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
766	if (rcvbuf > sk->sk_rcvbuf) {
767	WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
768	/ Make the window clamp follow along. /
769	WRITE_ONCE(tp->window_clamp,
770	tcp_win_from_space(sk, rcvbuf));
771	}
772	}
773	/*
774	* This function should be called every time data is copied to user space.
775	* It calculates the appropriate TCP receive buffer space.
776	*/
777	void tcp_rcv_space_adjust(struct sock *sk)
778	{
779	struct tcp_sock *tp = tcp_sk(sk);
780	int time, inq, copied;
781
782	trace_tcp_rcv_space_adjust(sk);
783
784	tcp_mstamp_refresh(tp);
785	time = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: tp->rcvq_space.time);
786	if (time < (tp->rcv_rtt_est.rtt_us >> `3`) \|\| tp->rcv_rtt_est.rtt_us == `0`)
787	return;
788
789	/ Number of bytes copied to user in last RTT /
790	copied = tp->copied_seq - tp->rcvq_space.seq;
791	/ Number of bytes in receive queue. /
792	inq = tp->rcv_nxt - tp->copied_seq;
793	copied -= inq;
794	if (copied <= tp->rcvq_space.space)
795	goto new_measure;
796
797	trace_tcp_rcvbuf_grow(sk, time);
798
799	tp->rcvq_space.space = copied;
800
801	tcp_rcvbuf_grow(sk);
802
803	new_measure:
804	tp->rcvq_space.seq = tp->copied_seq;
805	tp->rcvq_space.time = tp->tcp_mstamp;
806	}
807
808	static void tcp_save_lrcv_flowlabel(struct sock sk, const* struct sk_buff *skb)
809	{
810	#if IS_ENABLED(CONFIG_IPV6)
811	struct inet_connection_sock *icsk = inet_csk(sk);
812
813	if (skb->protocol == htons(ETH_P_IPV6))
814	icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb)));
815	#endif
816	}
817
818	/ There is something which you must keep in mind when you analyze the*
819	* behavior of the tp->ato delayed ack timeout interval. When a
820	* connection starts up, we want to ack as quickly as possible. The
821	* problem is that "good" TCP's do slow start at the beginning of data
822	* transmission. The means that until we send the first few ACK's the
823	* sender will sit on his end and only queue most of his data, because
824	* he can only send snd_cwnd unacked packets at any given time. For
825	* each ACK we send, he increments snd_cwnd and transmits more of his
826	* queue. -DaveM
827	*/
828	static void tcp_event_data_recv(struct sock sk, struct* sk_buff *skb)
829	{
830	struct tcp_sock *tp = tcp_sk(sk);
831	struct inet_connection_sock *icsk = inet_csk(sk);
832	u32 now;
833
834	inet_csk_schedule_ack(sk);
835
836	tcp_measure_rcv_mss(sk, skb);
837
838	tcp_rcv_rtt_measure(tp);
839
840	now = tcp_jiffies32;
841
842	if (!icsk->icsk_ack.ato) {
843	/ The _first_ data packet received, initialize*
844	* delayed ACK engine.
845	*/
846	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
847	icsk->icsk_ack.ato = TCP_ATO_MIN;
848	} else {
849	int m = now - icsk->icsk_ack.lrcvtime;
850
851	if (m <= TCP_ATO_MIN / `2`) {
852	/ The fastest case is the first. /
853	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> `1`) + TCP_ATO_MIN / `2`;
854	} else if (m < icsk->icsk_ack.ato) {
855	icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> `1`) + m;
856	if (icsk->icsk_ack.ato > icsk->icsk_rto)
857	icsk->icsk_ack.ato = icsk->icsk_rto;
858	} else if (m > icsk->icsk_rto) {
859	/ Too long gap. Apparently sender failed to*
860	* restart window, so that we send ACKs quickly.
861	*/
862	tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
863	}
864	}
865	icsk->icsk_ack.lrcvtime = now;
866	tcp_save_lrcv_flowlabel(sk, skb);
867
868	tcp_data_ecn_check(sk, skb);
869
870	if (skb->len >= `128`)
871	tcp_grow_window(sk, skb, adjust: true);
872	}
873
874	/ Called to compute a smoothed rtt estimate. The data fed to this*
875	* routine either comes from timestamps, or from segments that were
876	* known _not_ to have been retransmitted [see Karn/Partridge
877	* Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
878	* piece by Van Jacobson.
879	* NOTE: the next three routines used to be one big routine.
880	* To save cycles in the RFC 1323 implementation it was better to break
881	* it up into three procedures. -- erics
882	*/
883	static void tcp_rtt_estimator(struct sock sk, long* mrtt_us)
884	{
885	struct tcp_sock *tp = tcp_sk(sk);
886	long m = mrtt_us; / RTT /
887	u32 srtt = tp->srtt_us;
888
889	/ The following amusing code comes from Jacobson's*
890	* article in SIGCOMM '88. Note that rtt and mdev
891	* are scaled versions of rtt and mean deviation.
892	* This is designed to be as fast as possible
893	* m stands for "measurement".
894	*
895	* On a 1990 paper the rto value is changed to:
896	* RTO = rtt + 4 * mdev
897	*
898	* Funny. This algorithm seems to be very broken.
899	* These formulae increase RTO, when it should be decreased, increase
900	* too slowly, when it should be increased quickly, decrease too quickly
901	* etc. I guess in BSD RTO takes ONE value, so that it is absolutely
902	* does not matter how to _calculate_ it. Seems, it was trap
903	* that VJ failed to avoid. 8)
904	*/
905	if (srtt != `0`) {
906	m -= (srtt >> `3`); / m is now error in rtt est /
907	srtt += m; / rtt = 7/8 rtt + 1/8 new /
908	if (m < `0`) {
909	m = -m; / m is now abs(error) /
910	m -= (tp->mdev_us >> `2`); / similar update on mdev /
911	/ This is similar to one of Eifel findings.*
912	* Eifel blocks mdev updates when rtt decreases.
913	* This solution is a bit different: we use finer gain
914	* for mdev in this case (alpha*beta).
915	* Like Eifel it also prevents growth of rto,
916	* but also it limits too fast rto decreases,
917	* happening in pure Eifel.
918	*/
919	if (m > `0`)
920	m >>= `3`;
921	} else {
922	m -= (tp->mdev_us >> `2`); / similar update on mdev /
923	}
924	tp->mdev_us += m; / mdev = 3/4 mdev + 1/4 new /
925	if (tp->mdev_us > tp->mdev_max_us) {
926	tp->mdev_max_us = tp->mdev_us;
927	if (tp->mdev_max_us > tp->rttvar_us)
928	tp->rttvar_us = tp->mdev_max_us;
929	}
930	if (after(tp->snd_una, tp->rtt_seq)) {
931	if (tp->mdev_max_us < tp->rttvar_us)
932	tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> `2`;
933	tp->rtt_seq = tp->snd_nxt;
934	tp->mdev_max_us = tcp_rto_min_us(sk);
935
936	tcp_bpf_rtt(sk, mrtt: mrtt_us, srtt);
937	}
938	} else {
939	/ no previous measure. /
940	srtt = m << `3`; / take the measured time to be rtt /
941	tp->mdev_us = m << `1`; / make sure rto = 3rtt /*
942	tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
943	tp->mdev_max_us = tp->rttvar_us;
944	tp->rtt_seq = tp->snd_nxt;
945
946	tcp_bpf_rtt(sk, mrtt: mrtt_us, srtt);
947	}
948	tp->srtt_us = max(`1U`, srtt);
949	}
950
951	static void tcp_update_pacing_rate(struct sock *sk)
952	{
953	const struct tcp_sock *tp = tcp_sk(sk);
954	u64 rate;
955
956	/ set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) /
957	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / `100`) << `3`);
958
959	/ current rate is (cwnd * mss) / srtt*
960	* In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
961	* In Congestion Avoidance phase, set it to 120 % the current rate.
962	*
963	* [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
964	* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
965	* end of slow start and should slow down.
966	*/
967	if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / `2`)
968	rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio);
969	else
970	rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio);
971
972	rate *= max(tcp_snd_cwnd(tp), tp->packets_out);
973
974	if (likely(tp->srtt_us))
975	do_div(rate, tp->srtt_us);
976
977	/ WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate*
978	* without any lock. We want to make sure compiler wont store
979	* intermediate values in this location.
980	*/
981	WRITE_ONCE(sk->sk_pacing_rate,
982	min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
983	}
984
985	/ Calculate rto without backoff. This is the second half of Van Jacobson's*
986	* routine referred to above.
987	*/
988	static void tcp_set_rto(struct sock *sk)
989	{
990	const struct tcp_sock *tp = tcp_sk(sk);
991	/ Old crap is replaced with new one. 8)*
992	*
993	* More seriously:
994	* 1. If rtt variance happened to be less 50msec, it is hallucination.
995	* It cannot be less due to utterly erratic ACK generation made
996	* at least by solaris and freebsd. "Erratic ACKs" has _nothing_
997	* to do with delayed acks, because at cwnd>2 true delack timeout
998	* is invisible. Actually, Linux-2.4 also generates erratic
999	* ACKs in some circumstances.
1000	*/
1001	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
1002
1003	/ 2. Fixups made earlier cannot be right.*
1004	* If we do not estimate RTO correctly without them,
1005	* all the algo is pure shit and should be replaced
1006	* with correct one. It is exactly, which we pretend to do.
1007	*/
1008
1009	/ NOTE: clamping at TCP_RTO_MIN is not required, current algo*
1010	* guarantees that rto is higher.
1011	*/
1012	tcp_bound_rto(sk);
1013	}
1014
1015	__u32 tcp_init_cwnd(const struct tcp_sock tp, const* struct dst_entry *dst)
1016	{
1017	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : `0`);
1018
1019	if (!cwnd)
1020	cwnd = TCP_INIT_CWND;
1021	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
1022	}
1023
1024	struct tcp_sacktag_state {
1025	/ Timestamps for earliest and latest never-retransmitted segment*
1026	* that was SACKed. RTO needs the earliest RTT to stay conservative,
1027	* but congestion control should still get an accurate delay signal.
1028	*/
1029	u64 first_sackt;
1030	u64 last_sackt;
1031	u32 reord;
1032	u32 sack_delivered;
1033	int flag;
1034	unsigned int mss_now;
1035	struct rate_sample *rate;
1036	};
1037
1038	/ Take a notice that peer is sending D-SACKs. Skip update of data delivery*
1039	* and spurious retransmission information if this DSACK is unlikely caused by
1040	* sender's action:
1041	* - DSACKed sequence range is larger than maximum receiver's window.
1042	* - Total no. of DSACKed segments exceed the total no. of retransmitted segs.
1043	*/
1044	static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
1045	u32 end_seq, struct tcp_sacktag_state *state)
1046	{
1047	u32 seq_len, dup_segs = `1`;
1048
1049	if (!before(seq1: start_seq, seq2: end_seq))
1050	return `0`;
1051
1052	seq_len = end_seq - start_seq;
1053	/ Dubious DSACK: DSACKed range greater than maximum advertised rwnd /
1054	if (seq_len > tp->max_window)
1055	return `0`;
1056	if (seq_len > tp->mss_cache)
1057	dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
1058	else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
1059	state->flag \|= FLAG_DSACK_TLP;
1060
1061	tp->dsack_dups += dup_segs;
1062	/ Skip the DSACK if dup segs weren't retransmitted by sender /
1063	if (tp->dsack_dups > tp->total_retrans)
1064	return `0`;
1065
1066	tp->rx_opt.sack_ok \|= TCP_DSACK_SEEN;
1067	/ We increase the RACK ordering window in rounds where we receive*
1068	* DSACKs that may have been due to reordering causing RACK to trigger
1069	* a spurious fast recovery. Thus RACK ignores DSACKs that happen
1070	* without having seen reordering, or that match TLP probes (TLP
1071	* is timer-driven, not triggered by RACK).
1072	*/
1073	if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
1074	tp->rack.dsack_seen = `1`;
1075
1076	state->flag \|= FLAG_DSACKING_ACK;
1077	/ A spurious retransmission is delivered /
1078	state->sack_delivered += dup_segs;
1079
1080	return dup_segs;
1081	}
1082
1083	/ It's reordering when higher sequence was delivered (i.e. sacked) before*
1084	* some lower never-retransmitted sequence ("low_seq"). The maximum reordering
1085	* distance is approximated in full-mss packet distance ("reordering").
1086	*/
1087	static void tcp_check_sack_reordering(struct sock sk, const* u32 low_seq,
1088	const int ts)
1089	{
1090	struct tcp_sock *tp = tcp_sk(sk);
1091	const u32 mss = tp->mss_cache;
1092	u32 fack, metric;
1093
1094	fack = tcp_highest_sack_seq(tp);
1095	if (!before(seq1: low_seq, seq2: fack))
1096	return;
1097
1098	metric = fack - low_seq;
1099	if ((metric > tp->reordering * mss) && mss) {
1100	#if FASTRETRANS_DEBUG > 1
1101	pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
1102	tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
1103	tp->reordering,
1104	`0`,
1105	tp->sacked_out,
1106	tp->undo_marker ? tp->undo_retrans : `0`);
1107	#endif
1108	tp->reordering = min_t(u32, (metric + mss - `1`) / mss,
1109	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
1110	}
1111
1112	/ This exciting event is worth to be remembered. 8) /
1113	tp->reord_seen++;
1114	NET_INC_STATS(sock_net(sk),
1115	ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
1116	}
1117
1118	/ This must be called before lost_out or retrans_out are updated*
1119	* on a new loss, because we want to know if all skbs previously
1120	* known to be lost have already been retransmitted, indicating
1121	* that this newly lost skb is our next skb to retransmit.
1122	*/
1123	static void tcp_verify_retransmit_hint(struct tcp_sock tp, struct* sk_buff *skb)
1124	{
1125	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) \|\|
1126	(tp->retransmit_skb_hint &&
1127	before(TCP_SKB_CB(skb)->seq,
1128	TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
1129	tp->retransmit_skb_hint = skb;
1130	}
1131
1132	/ Sum the number of packets on the wire we have marked as lost, and*
1133	* notify the congestion control module that the given skb was marked lost.
1134	*/
1135	static void tcp_notify_skb_loss_event(struct tcp_sock tp, const* struct sk_buff *skb)
1136	{
1137	tp->lost += tcp_skb_pcount(skb);
1138	}
1139
1140	void tcp_mark_skb_lost(struct sock sk, struct* sk_buff *skb)
1141	{
1142	__u8 sacked = TCP_SKB_CB(skb)->sacked;
1143	struct tcp_sock *tp = tcp_sk(sk);
1144
1145	if (sacked & TCPCB_SACKED_ACKED)
1146	return;
1147
1148	tcp_verify_retransmit_hint(tp, skb);
1149	if (sacked & TCPCB_LOST) {
1150	if (sacked & TCPCB_SACKED_RETRANS) {
1151	/ Account for retransmits that are lost again /
1152	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1153	tp->retrans_out -= tcp_skb_pcount(skb);
1154	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
1155	tcp_skb_pcount(skb));
1156	tcp_notify_skb_loss_event(tp, skb);
1157	}
1158	} else {
1159	tp->lost_out += tcp_skb_pcount(skb);
1160	TCP_SKB_CB(skb)->sacked \|= TCPCB_LOST;
1161	tcp_notify_skb_loss_event(tp, skb);
1162	}
1163	}
1164
/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R  1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form finite state machine, controlled by the following events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of two flavors:
 *	A. Scoreboard estimator decided the packet is lost.
 *	   A'. Reno "three dupacks" marks head of queue lost.
 *	B. SACK arrives sacking SND.NXT at the moment, when the
 *	   segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note, that state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * Reordering metric is maximal distance, which a packet can be displaced
 * in packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for retransmitted and already SACKed segment -> reordering.
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
 *
 * SACK block validation.
 * ----------------------
 *
 * SACK block range validation checks that the received SACK block fits to
 * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
 * Note that SND.UNA is not included to the range though being valid because
 * it means that the receiver is rather inconsistent with itself reporting
 * SACK reneging when it should advance SND.UNA. Such a SACK block is
 * perfectly valid, however, in light of RFC2018 which explicitly states
 * that "SACK block MUST reflect the newest segment.  Even if the newest
 * segment is going to be discarded ...", not that it looks very clever
 * in case of head skb. Due to potential receiver driven attacks, we
 * choose to avoid immediate execution of a walk in write queue due to
 * reneging and defer head skb's loss recovery to standard loss recovery
 * procedure that will eventually trigger (nothing forbids us doing this).
 *
 * Implements also blockage to start_seq wrap-around. Problem lies in the
 * fact that though start_seq (s) is before end_seq (i.e., not reversed),
 * there's no guarantee that it will be before snd_nxt (n). The problem
 * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
 * wrap (n_w):
 *
 *         <- outs wnd ->                          <- wrapzone ->
 *         u     e      n                         u_w   e_w  s n_w
 *         |     |      |                          |     |   |  |
 * |<------------+------+----- TCP seqno space --------------+---------->|
 * ...-- <2^31 ->|                                           |<--------...
 * ...---- >2^31 ------>|                                    |<--------...
 *
 * Current code wouldn't be vulnerable but it's better still to discard such
 * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
 * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
 * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
 * equal to the ideal case (infinite seqno space without wrap caused issues).
 *
 * With D-SACK the lower bound is extended to cover sequence space below
 * SND.UNA down to undo_marker, which is the last point of interest. Yet
 * again, D-SACK block must not go across snd_una (for the same reason as
 * for the normal SACK blocks, explained above). But there all simplicity
 * ends, TCP might receive valid D-SACKs below that. As long as they reside
 * fully below undo_marker they do not affect behavior in any way and can
 * therefore be safely ignored. In rare cases (which are more or less
 * theoretical ones), the D-SACK will nicely cross that boundary due to skb
 * fragmentation and packet reordering past skb's retransmission. To consider
 * them correctly, the acceptable range must be extended even more though
 * the exact amount is rather hard to quantify. However, tp->max_window can
 * be used as an exaggerated estimate.
 */
1258	static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
1259	u32 start_seq, u32 end_seq)
1260	{
1261	/ Too far in future, or reversed (interpretation is ambiguous) /
1262	if (after(end_seq, tp->snd_nxt) \|\| !before(seq1: start_seq, seq2: end_seq))
1263	return false;
1264
1265	/ Nasty start_seq wrap-around check (see comments above) /
1266	if (!before(seq1: start_seq, seq2: tp->snd_nxt))
1267	return false;
1268
1269	/ In outstanding window? ...This is valid exit for D-SACKs too.*
1270	* start_seq == snd_una is non-sensical (see comments above)
1271	*/
1272	if (after(start_seq, tp->snd_una))
1273	return true;
1274
1275	if (!is_dsack \|\| !tp->undo_marker)
1276	return false;
1277
1278	/ ...Then it's D-SACK, and must reside below snd_una completely /
1279	if (after(end_seq, tp->snd_una))
1280	return false;
1281
1282	if (!before(seq1: start_seq, seq2: tp->undo_marker))
1283	return true;
1284
1285	/ Too old /
1286	if (!after(end_seq, tp->undo_marker))
1287	return false;
1288
1289	/ Undo_marker boundary crossing (overestimates a lot). Known already:*
1290	* start_seq < undo_marker and end_seq >= undo_marker.
1291	*/
1292	return !before(seq1: start_seq, seq2: end_seq - tp->max_window);
1293	}
1294
1295	static bool tcp_check_dsack(struct sock sk, const* struct sk_buff *ack_skb,
1296	struct tcp_sack_block_wire sp, int* num_sacks,
1297	u32 prior_snd_una, struct tcp_sacktag_state *state)
1298	{
1299	struct tcp_sock *tp = tcp_sk(sk);
1300	u32 start_seq_0 = get_unaligned_be32(p: &sp[`0`].start_seq);
1301	u32 end_seq_0 = get_unaligned_be32(p: &sp[`0`].end_seq);
1302	u32 dup_segs;
1303
1304	if (before(seq1: start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
1305	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
1306	} else if (num_sacks > `1`) {
1307	u32 end_seq_1 = get_unaligned_be32(p: &sp[`1`].end_seq);
1308	u32 start_seq_1 = get_unaligned_be32(p: &sp[`1`].start_seq);
1309
1310	if (after(end_seq_0, end_seq_1) \|\| before(seq1: start_seq_0, seq2: start_seq_1))
1311	return false;
1312	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
1313	} else {
1314	return false;
1315	}
1316
1317	dup_segs = tcp_dsack_seen(tp, start_seq: start_seq_0, end_seq: end_seq_0, state);
1318	if (!dup_segs) { / Skip dubious DSACK /
1319	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
1320	return false;
1321	}
1322
1323	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
1324
1325	/ D-SACK for already forgotten data... Do dumb counting. /
1326	if (tp->undo_marker && tp->undo_retrans > `0` &&
1327	!after(end_seq_0, prior_snd_una) &&
1328	after(end_seq_0, tp->undo_marker))
1329	tp->undo_retrans = max_t(int, `0`, tp->undo_retrans - dup_segs);
1330
1331	return true;
1332	}
1333
1334	/ Check if skb is fully within the SACK block. In presence of GSO skbs,*
1335	* the incoming SACK may not exactly match but we can find smaller MSS
1336	* aligned portion of it that matches. Therefore we might need to fragment
1337	* which may fail and creates some hassle (caller must handle error case
1338	* returns).
1339	*
1340	* FIXME: this could be merged to shift decision code
1341	*/
1342	static int tcp_match_skb_to_sack(struct sock sk, struct* sk_buff *skb,
1343	u32 start_seq, u32 end_seq)
1344	{
1345	int err;
1346	bool in_sack;
1347	unsigned int pkt_len;
1348	unsigned int mss;
1349
1350	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1351	!before(seq1: end_seq, TCP_SKB_CB(skb)->end_seq);
1352
1353	if (tcp_skb_pcount(skb) > `1` && !in_sack &&
1354	after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
1355	mss = tcp_skb_mss(skb);
1356	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1357
1358	if (!in_sack) {
1359	pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
1360	if (pkt_len < mss)
1361	pkt_len = mss;
1362	} else {
1363	pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
1364	if (pkt_len < mss)
1365	return -EINVAL;
1366	}
1367
1368	/ Round if necessary so that SACKs cover only full MSSes*
1369	* and/or the remaining small portion (if present)
1370	*/
1371	if (pkt_len > mss) {
1372	unsigned int new_len = (pkt_len / mss) * mss;
1373	if (!in_sack && new_len < pkt_len)
1374	new_len += mss;
1375	pkt_len = new_len;
1376	}
1377
1378	if (pkt_len >= skb->len && !in_sack)
1379	return `0`;
1380
1381	err = tcp_fragment(sk, tcp_queue: TCP_FRAG_IN_RTX_QUEUE, skb,
1382	len: pkt_len, mss_now: mss, GFP_ATOMIC);
1383	if (err < `0`)
1384	return err;
1385	}
1386
1387	return in_sack;
1388	}
1389
1390	/ Mark the given newly-SACKed range as such, adjusting counters and hints. /
1391	static u8 tcp_sacktag_one(struct sock *sk,
1392	struct tcp_sacktag_state *state, u8 sacked,
1393	u32 start_seq, u32 end_seq,
1394	int dup_sack, int pcount,
1395	u64 xmit_time)
1396	{
1397	struct tcp_sock *tp = tcp_sk(sk);
1398
1399	/ Account D-SACK for retransmitted packet. /
1400	if (dup_sack && (sacked & TCPCB_RETRANS)) {
1401	if (tp->undo_marker && tp->undo_retrans > `0` &&
1402	after(end_seq, tp->undo_marker))
1403	tp->undo_retrans = max_t(int, `0`, tp->undo_retrans - pcount);
1404	if ((sacked & TCPCB_SACKED_ACKED) &&
1405	before(seq1: start_seq, seq2: state->reord))
1406	state->reord = start_seq;
1407	}
1408
1409	/ Nothing to do; acked frame is about to be dropped (was ACKed). /
1410	if (!after(end_seq, tp->snd_una))
1411	return sacked;
1412
1413	if (!(sacked & TCPCB_SACKED_ACKED)) {
1414	tcp_rack_advance(tp, sacked, end_seq, xmit_time);
1415
1416	if (sacked & TCPCB_SACKED_RETRANS) {
1417	/ If the segment is not tagged as lost,*
1418	* we do not clear RETRANS, believing
1419	* that retransmission is still in flight.
1420	*/
1421	if (sacked & TCPCB_LOST) {
1422	sacked &= ~(TCPCB_LOST\|TCPCB_SACKED_RETRANS);
1423	tp->lost_out -= pcount;
1424	tp->retrans_out -= pcount;
1425	}
1426	} else {
1427	if (!(sacked & TCPCB_RETRANS)) {
1428	/ New sack for not retransmitted frame,*
1429	* which was in hole. It is reordering.
1430	*/
1431	if (before(seq1: start_seq,
1432	seq2: tcp_highest_sack_seq(tp)) &&
1433	before(seq1: start_seq, seq2: state->reord))
1434	state->reord = start_seq;
1435
1436	if (!after(end_seq, tp->high_seq))
1437	state->flag \|= FLAG_ORIG_SACK_ACKED;
1438	if (state->first_sackt == `0`)
1439	state->first_sackt = xmit_time;
1440	state->last_sackt = xmit_time;
1441	}
1442
1443	if (sacked & TCPCB_LOST) {
1444	sacked &= ~TCPCB_LOST;
1445	tp->lost_out -= pcount;
1446	}
1447	}
1448
1449	sacked \|= TCPCB_SACKED_ACKED;
1450	state->flag \|= FLAG_DATA_SACKED;
1451	tp->sacked_out += pcount;
1452	/ Out-of-order packets delivered /
1453	state->sack_delivered += pcount;
1454
1455	/ Lost marker hint past SACKed? Tweak RFC3517 cnt /
1456	if (tp->lost_skb_hint &&
1457	before(seq1: start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
1458	tp->lost_cnt_hint += pcount;
1459	}
1460
1461	/ D-SACK. We can detect redundant retransmission in S\|R and plain R*
1462	* frames and clear it. undo_retrans is decreased above, L\|R frames
1463	* are accounted above as well.
1464	*/
1465	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
1466	sacked &= ~TCPCB_SACKED_RETRANS;
1467	tp->retrans_out -= pcount;
1468	}
1469
1470	return sacked;
1471	}
1472
1473	/ Shift newly-SACKed bytes from this skb to the immediately previous*
1474	* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
1475	*/
1476	static bool tcp_shifted_skb(struct sock sk, struct* sk_buff *prev,
1477	struct sk_buff *skb,
1478	struct tcp_sacktag_state *state,
1479	unsigned int pcount, int shifted, int mss,
1480	bool dup_sack)
1481	{
1482	struct tcp_sock *tp = tcp_sk(sk);
1483	u32 start_seq = TCP_SKB_CB(skb)->seq; / start of newly-SACKed /
1484	u32 end_seq = start_seq + shifted; / end of newly-SACKed /
1485
1486	BUG_ON(!pcount);
1487
1488	/ Adjust counters and hints for the newly sacked sequence*
1489	* range but discard the return value since prev is already
1490	* marked. We must tag the range first because the seq
1491	* advancement below implicitly advances
1492	* tcp_highest_sack_seq() when skb is highest_sack.
1493	*/
1494	tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
1495	start_seq, end_seq, dup_sack, pcount,
1496	xmit_time: tcp_skb_timestamp_us(skb));
1497	tcp_rate_skb_delivered(sk, skb, rs: state->rate);
1498
1499	if (skb == tp->lost_skb_hint)
1500	tp->lost_cnt_hint += pcount;
1501
1502	TCP_SKB_CB(prev)->end_seq += shifted;
1503	TCP_SKB_CB(skb)->seq += shifted;
1504
1505	tcp_skb_pcount_add(skb: prev, segs: pcount);
1506	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
1507	tcp_skb_pcount_add(skb, segs: -pcount);
1508
1509	/ When we're adding to gso_segs == 1, gso_size will be zero,*
1510	* in theory this shouldn't be necessary but as long as DSACK
1511	* code can come after this skb later on it's better to keep
1512	* setting gso_size to something.
1513	*/
1514	if (!TCP_SKB_CB(prev)->tcp_gso_size)
1515	TCP_SKB_CB(prev)->tcp_gso_size = mss;
1516
1517	/ CHECKME: To clear or not to clear? Mimics normal skb currently /
1518	if (tcp_skb_pcount(skb) <= `1`)
1519	TCP_SKB_CB(skb)->tcp_gso_size = `0`;
1520
1521	/ Difference in this won't matter, both ACKed by the same cumul. ACK /
1522	TCP_SKB_CB(prev)->sacked \|= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
1523
1524	if (skb->len > `0`) {
1525	BUG_ON(!tcp_skb_pcount(skb));
1526	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
1527	return false;
1528	}
1529
1530	/ Whole SKB was eaten :-) /
1531
1532	if (skb == tp->retransmit_skb_hint)
1533	tp->retransmit_skb_hint = prev;
1534	if (skb == tp->lost_skb_hint) {
1535	tp->lost_skb_hint = prev;
1536	tp->lost_cnt_hint -= tcp_skb_pcount(skb: prev);
1537	}
1538
1539	TCP_SKB_CB(prev)->tcp_flags \|= TCP_SKB_CB(skb)->tcp_flags;
1540	TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
1541	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
1542	TCP_SKB_CB(prev)->end_seq++;
1543
1544	if (skb == tcp_highest_sack(sk))
1545	tcp_advance_highest_sack(sk, skb);
1546
1547	tcp_skb_collapse_tstamp(skb: prev, next_skb: skb);
1548	if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp))
1549	TCP_SKB_CB(prev)->tx.delivered_mstamp = `0`;
1550
1551	tcp_rtx_queue_unlink_and_free(skb, sk);
1552
1553	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
1554
1555	return true;
1556	}
1557
1558	/ I wish gso_size would have a bit more sane initialization than*
1559	* something-or-zero which complicates things
1560	*/
1561	static int tcp_skb_seglen(const struct sk_buff *skb)
1562	{
1563	return tcp_skb_pcount(skb) == `1` ? skb->len : tcp_skb_mss(skb);
1564	}
1565
/* Shifting pages past head area doesn't work
 *
 * Returns non-zero only for a nonlinear skb with no data in its head area.
 */
static int skb_can_shift(const struct sk_buff *skb)
{
	return !skb_headlen(skb) && skb_is_nonlinear(skb);
}
1571
1572	int tcp_skb_shift(struct sk_buff to, struct* sk_buff *from,
1573	int pcount, int shiftlen)
1574	{
1575	/ TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)*
1576	* Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
1577	* to make sure not storing more than 65535 * 8 bytes per skb,
1578	* even if current MSS is bigger.
1579	*/
1580	if (unlikely(to->len + shiftlen >= `65535` * TCP_MIN_GSO_SIZE))
1581	return `0`;
1582	if (unlikely(tcp_skb_pcount(to) + pcount > `65535`))
1583	return `0`;
1584	return skb_shift(tgt: to, skb: from, shiftlen);
1585	}
1586
1587	/ Try collapsing SACK blocks spanning across multiple skbs to a single*
1588	* skb.
1589	*/
1590	static struct sk_buff tcp_shift_skb_data(struct* sock sk, struct* sk_buff *skb,
1591	struct tcp_sacktag_state *state,
1592	u32 start_seq, u32 end_seq,
1593	bool dup_sack)
1594	{
1595	struct tcp_sock *tp = tcp_sk(sk);
1596	struct sk_buff *prev;
1597	int mss;
1598	int pcount = `0`;
1599	int len;
1600	int in_sack;
1601
1602	/ Normally R but no L won't result in plain S /
1603	if (!dup_sack &&
1604	(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST\|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
1605	goto fallback;
1606	if (!skb_can_shift(skb))
1607	goto fallback;
1608	/ This frame is about to be dropped (was ACKed). /
1609	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
1610	goto fallback;
1611
1612	/ Can only happen with delayed DSACK + discard craziness /
1613	prev = skb_rb_prev(skb);
1614	if (!prev)
1615	goto fallback;
1616
1617	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
1618	goto fallback;
1619
1620	if (!tcp_skb_can_collapse(to: prev, from: skb))
1621	goto fallback;
1622
1623	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
1624	!before(seq1: end_seq, TCP_SKB_CB(skb)->end_seq);
1625
1626	if (in_sack) {
1627	len = skb->len;
1628	pcount = tcp_skb_pcount(skb);
1629	mss = tcp_skb_seglen(skb);
1630
1631	/ TODO: Fix DSACKs to not fragment already SACKed and we can*
1632	* drop this restriction as unnecessary
1633	*/
1634	if (mss != tcp_skb_seglen(skb: prev))
1635	goto fallback;
1636	} else {
1637	if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
1638	goto noop;
1639	/ CHECKME: This is non-MSS split case only?, this will*
1640	* cause skipped skbs due to advancing loop btw, original
1641	* has that feature too
1642	*/
1643	if (tcp_skb_pcount(skb) <= `1`)
1644	goto noop;
1645
1646	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
1647	if (!in_sack) {
1648	/ TODO: head merge to next could be attempted here*
1649	* if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
1650	* though it might not be worth of the additional hassle
1651	*
1652	* ...we can probably just fallback to what was done
1653	* previously. We could try merging non-SACKed ones
1654	* as well but it probably isn't going to buy off
1655	* because later SACKs might again split them, and
1656	* it would make skb timestamp tracking considerably
1657	* harder problem.
1658	*/
1659	goto fallback;
1660	}
1661
1662	len = end_seq - TCP_SKB_CB(skb)->seq;
1663	BUG_ON(len < `0`);
1664	BUG_ON(len > skb->len);
1665
1666	/ MSS boundaries should be honoured or else pcount will*
1667	* severely break even though it makes things bit trickier.
1668	* Optimize common case to avoid most of the divides
1669	*/
1670	mss = tcp_skb_mss(skb);
1671
1672	/ TODO: Fix DSACKs to not fragment already SACKed and we can*
1673	* drop this restriction as unnecessary
1674	*/
1675	if (mss != tcp_skb_seglen(skb: prev))
1676	goto fallback;
1677
1678	if (len == mss) {
1679	pcount = `1`;
1680	} else if (len < mss) {
1681	goto noop;
1682	} else {
1683	pcount = len / mss;
1684	len = pcount * mss;
1685	}
1686	}
1687
1688	/ tcp_sacktag_one() won't SACK-tag ranges below snd_una /
1689	if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
1690	goto fallback;
1691
1692	if (!tcp_skb_shift(to: prev, from: skb, pcount, shiftlen: len))
1693	goto fallback;
1694	if (!tcp_shifted_skb(sk, prev, skb, state, pcount, shifted: len, mss, dup_sack))
1695	goto out;
1696
1697	/ Hole filled allows collapsing with the next as well, this is very*
1698	* useful when hole on every nth skb pattern happens
1699	*/
1700	skb = skb_rb_next(prev);
1701	if (!skb)
1702	goto out;
1703
1704	if (!skb_can_shift(skb) \|\|
1705	((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) \|\|
1706	(mss != tcp_skb_seglen(skb)))
1707	goto out;
1708
1709	if (!tcp_skb_can_collapse(to: prev, from: skb))
1710	goto out;
1711	len = skb->len;
1712	pcount = tcp_skb_pcount(skb);
1713	if (tcp_skb_shift(to: prev, from: skb, pcount, shiftlen: len))
1714	tcp_shifted_skb(sk, prev, skb, state, pcount,
1715	shifted: len, mss, dup_sack: `0`);
1716
1717	out:
1718	return prev;
1719
1720	noop:
1721	return skb;
1722
1723	fallback:
1724	NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
1725	return NULL;
1726	}
1727
1728	static struct sk_buff tcp_sacktag_walk(struct* sk_buff skb, struct* sock *sk,
1729	struct tcp_sack_block *next_dup,
1730	struct tcp_sacktag_state *state,
1731	u32 start_seq, u32 end_seq,
1732	bool dup_sack_in)
1733	{
1734	struct tcp_sock *tp = tcp_sk(sk);
1735	struct sk_buff *tmp;
1736
1737	skb_rbtree_walk_from(skb) {
1738	int in_sack = `0`;
1739	bool dup_sack = dup_sack_in;
1740
1741	/ queue is in-order => we can short-circuit the walk early /
1742	if (!before(TCP_SKB_CB(skb)->seq, seq2: end_seq))
1743	break;
1744
1745	if (next_dup &&
1746	before(TCP_SKB_CB(skb)->seq, seq2: next_dup->end_seq)) {
1747	in_sack = tcp_match_skb_to_sack(sk, skb,
1748	start_seq: next_dup->start_seq,
1749	end_seq: next_dup->end_seq);
1750	if (in_sack > `0`)
1751	dup_sack = true;
1752	}
1753
1754	/ skb reference here is a bit tricky to get right, since*
1755	* shifting can eat and free both this skb and the next,
1756	* so not even _safe variant of the loop is enough.
1757	*/
1758	if (in_sack <= `0`) {
1759	tmp = tcp_shift_skb_data(sk, skb, state,
1760	start_seq, end_seq, dup_sack);
1761	if (tmp) {
1762	if (tmp != skb) {
1763	skb = tmp;
1764	continue;
1765	}
1766
1767	in_sack = `0`;
1768	} else {
1769	in_sack = tcp_match_skb_to_sack(sk, skb,
1770	start_seq,
1771	end_seq);
1772	}
1773	}
1774
1775	if (unlikely(in_sack < `0`))
1776	break;
1777
1778	if (in_sack) {
1779	TCP_SKB_CB(skb)->sacked =
1780	tcp_sacktag_one(sk,
1781	state,
1782	TCP_SKB_CB(skb)->sacked,
1783	TCP_SKB_CB(skb)->seq,
1784	TCP_SKB_CB(skb)->end_seq,
1785	dup_sack,
1786	pcount: tcp_skb_pcount(skb),
1787	xmit_time: tcp_skb_timestamp_us(skb));
1788	tcp_rate_skb_delivered(sk, skb, rs: state->rate);
1789	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
1790	list_del_init(entry: &skb->tcp_tsorted_anchor);
1791
1792	if (!before(TCP_SKB_CB(skb)->seq,
1793	seq2: tcp_highest_sack_seq(tp)))
1794	tcp_advance_highest_sack(sk, skb);
1795	}
1796	}
1797	return skb;
1798	}
1799
1800	static struct sk_buff tcp_sacktag_bsearch(struct* sock *sk, u32 seq)
1801	{
1802	struct rb_node parent, *p = &sk->tcp_rtx_queue.rb_node;
1803	struct sk_buff *skb;
1804
1805	while (*p) {
1806	parent = *p;
1807	skb = rb_to_skb(parent);
1808	if (before(seq1: seq, TCP_SKB_CB(skb)->seq)) {
1809	p = &parent->rb_left;
1810	continue;
1811	}
1812	if (!before(seq1: seq, TCP_SKB_CB(skb)->end_seq)) {
1813	p = &parent->rb_right;
1814	continue;
1815	}
1816	return skb;
1817	}
1818	return NULL;
1819	}
1820
1821	static struct sk_buff tcp_sacktag_skip(struct* sk_buff skb, struct* sock *sk,
1822	u32 skip_to_seq)
1823	{
1824	if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
1825	return skb;
1826
1827	return tcp_sacktag_bsearch(sk, seq: skip_to_seq);
1828	}
1829
1830	static struct sk_buff tcp_maybe_skipping_dsack(struct* sk_buff *skb,
1831	struct sock *sk,
1832	struct tcp_sack_block *next_dup,
1833	struct tcp_sacktag_state *state,
1834	u32 skip_to_seq)
1835	{
1836	if (!next_dup)
1837	return skb;
1838
1839	if (before(seq1: next_dup->start_seq, seq2: skip_to_seq)) {
1840	skb = tcp_sacktag_skip(skb, sk, skip_to_seq: next_dup->start_seq);
1841	skb = tcp_sacktag_walk(skb, sk, NULL, state,
1842	start_seq: next_dup->start_seq, end_seq: next_dup->end_seq,
1843	dup_sack_in: `1`);
1844	}
1845
1846	return skb;
1847	}
1848
1849	static int tcp_sack_cache_ok(const struct tcp_sock tp, const* struct tcp_sack_block *cache)
1850	{
1851	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1852	}
1853
1854	static int
1855	tcp_sacktag_write_queue(struct sock sk, const* struct sk_buff *ack_skb,
1856	u32 prior_snd_una, struct tcp_sacktag_state *state)
1857	{
1858	struct tcp_sock *tp = tcp_sk(sk);
1859	const unsigned char *ptr = (skb_transport_header(skb: ack_skb) +
1860	TCP_SKB_CB(ack_skb)->sacked);
1861	struct tcp_sack_block_wire sp_wire = (struct* tcp_sack_block_wire *)(ptr+`2`);
1862	struct tcp_sack_block sp[TCP_NUM_SACKS];
1863	struct tcp_sack_block *cache;
1864	struct sk_buff *skb;
1865	int num_sacks = min(TCP_NUM_SACKS, (ptr[`1`] - TCPOLEN_SACK_BASE) >> `3`);
1866	int used_sacks;
1867	bool found_dup_sack = false;
1868	int i, j;
1869	int first_sack_index;
1870
1871	state->flag = `0`;
1872	state->reord = tp->snd_nxt;
1873
1874	if (!tp->sacked_out)
1875	tcp_highest_sack_reset(sk);
1876
1877	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp: sp_wire,
1878	num_sacks, prior_snd_una, state);
1879
1880	/ Eliminate too old ACKs, but take into*
1881	* account more or less fresh ones, they can
1882	* contain valid SACK info.
1883	*/
1884	if (before(TCP_SKB_CB(ack_skb)->ack_seq, seq2: prior_snd_una - tp->max_window))
1885	return `0`;
1886
1887	if (!tp->packets_out)
1888	goto out;
1889
1890	used_sacks = `0`;
1891	first_sack_index = `0`;
1892	for (i = `0`; i < num_sacks; i++) {
1893	bool dup_sack = !i && found_dup_sack;
1894
1895	sp[used_sacks].start_seq = get_unaligned_be32(p: &sp_wire[i].start_seq);
1896	sp[used_sacks].end_seq = get_unaligned_be32(p: &sp_wire[i].end_seq);
1897
1898	if (!tcp_is_sackblock_valid(tp, is_dsack: dup_sack,
1899	start_seq: sp[used_sacks].start_seq,
1900	end_seq: sp[used_sacks].end_seq)) {
1901	int mib_idx;
1902
1903	if (dup_sack) {
1904	if (!tp->undo_marker)
1905	mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
1906	else
1907	mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
1908	} else {
1909	/ Don't count olds caused by ACK reordering /
1910	if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
1911	!after(sp[used_sacks].end_seq, tp->snd_una))
1912	continue;
1913	mib_idx = LINUX_MIB_TCPSACKDISCARD;
1914	}
1915
1916	NET_INC_STATS(sock_net(sk), mib_idx);
1917	if (i == `0`)
1918	first_sack_index = -`1`;
1919	continue;
1920	}
1921
1922	/ Ignore very old stuff early /
1923	if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
1924	if (i == `0`)
1925	first_sack_index = -`1`;
1926	continue;
1927	}
1928
1929	used_sacks++;
1930	}
1931
1932	/ order SACK blocks to allow in order walk of the retrans queue /
1933	for (i = used_sacks - `1`; i > `0`; i--) {
1934	for (j = `0`; j < i; j++) {
1935	if (after(sp[j].start_seq, sp[j + `1`].start_seq)) {
1936	swap(sp[j], sp[j + `1`]);
1937
1938	/ Track where the first SACK block goes to /
1939	if (j == first_sack_index)
1940	first_sack_index = j + `1`;
1941	}
1942	}
1943	}
1944
1945	state->mss_now = tcp_current_mss(sk);
1946	skb = NULL;
1947	i = `0`;
1948
1949	if (!tp->sacked_out) {
1950	/ It's already past, so skip checking against it /
1951	cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
1952	} else {
1953	cache = tp->recv_sack_cache;
1954	/ Skip empty blocks in at head of the cache /
1955	while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
1956	!cache->end_seq)
1957	cache++;
1958	}
1959
1960	while (i < used_sacks) {
1961	u32 start_seq = sp[i].start_seq;
1962	u32 end_seq = sp[i].end_seq;
1963	bool dup_sack = (found_dup_sack && (i == first_sack_index));
1964	struct tcp_sack_block *next_dup = NULL;
1965
1966	if (found_dup_sack && ((i + `1`) == first_sack_index))
1967	next_dup = &sp[i + `1`];
1968
1969	/ Skip too early cached blocks /
1970	while (tcp_sack_cache_ok(tp, cache) &&
1971	!before(seq1: start_seq, seq2: cache->end_seq))
1972	cache++;
1973
1974	/ Can skip some work by looking recv_sack_cache? /
1975	if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
1976	after(end_seq, cache->start_seq)) {
1977
1978	/ Head todo? /
1979	if (before(seq1: start_seq, seq2: cache->start_seq)) {
1980	skb = tcp_sacktag_skip(skb, sk, skip_to_seq: start_seq);
1981	skb = tcp_sacktag_walk(skb, sk, next_dup,
1982	state,
1983	start_seq,
1984	end_seq: cache->start_seq,
1985	dup_sack_in: dup_sack);
1986	}
1987
1988	/ Rest of the block already fully processed? /
1989	if (!after(end_seq, cache->end_seq))
1990	goto advance_sp;
1991
1992	skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
1993	state,
1994	skip_to_seq: cache->end_seq);
1995
1996	/ ...tail remains todo... /
1997	if (tcp_highest_sack_seq(tp) == cache->end_seq) {
1998	/ ...but better entrypoint exists! /
1999	skb = tcp_highest_sack(sk);
2000	if (!skb)
2001	break;
2002	cache++;
2003	goto walk;
2004	}
2005
2006	skb = tcp_sacktag_skip(skb, sk, skip_to_seq: cache->end_seq);
2007	/ Check overlap against next cached too (past this one already) /
2008	cache++;
2009	continue;
2010	}
2011
2012	if (!before(seq1: start_seq, seq2: tcp_highest_sack_seq(tp))) {
2013	skb = tcp_highest_sack(sk);
2014	if (!skb)
2015	break;
2016	}
2017	skb = tcp_sacktag_skip(skb, sk, skip_to_seq: start_seq);
2018
2019	walk:
2020	skb = tcp_sacktag_walk(skb, sk, next_dup, state,
2021	start_seq, end_seq, dup_sack_in: dup_sack);
2022
2023	advance_sp:
2024	i++;
2025	}
2026
2027	/ Clear the head of the cache sack blocks so we can skip it next time /
2028	for (i = `0`; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
2029	tp->recv_sack_cache[i].start_seq = `0`;
2030	tp->recv_sack_cache[i].end_seq = `0`;
2031	}
2032	for (j = `0`; j < used_sacks; j++)
2033	tp->recv_sack_cache[i++] = sp[j];
2034
2035	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss \|\| tp->undo_marker)
2036	tcp_check_sack_reordering(sk, low_seq: state->reord, ts: `0`);
2037
2038	tcp_verify_left_out(tp);
2039	out:
2040
2041	#if FASTRETRANS_DEBUG > 0
2042	WARN_ON((int)tp->sacked_out < `0`);
2043	WARN_ON((int)tp->lost_out < `0`);
2044	WARN_ON((int)tp->retrans_out < `0`);
2045	WARN_ON((int)tcp_packets_in_flight(tp) < `0`);
2046	#endif
2047	return state->flag;
2048	}
2049
2050	/ Limits sacked_out so that sum with lost_out isn't ever larger than*
2051	* packets_out. Returns false if sacked_out adjustement wasn't necessary.
2052	*/
2053	static bool tcp_limit_reno_sacked(struct tcp_sock *tp)
2054	{
2055	u32 holes;
2056
2057	holes = max(tp->lost_out, `1U`);
2058	holes = min(holes, tp->packets_out);
2059
2060	if ((tp->sacked_out + holes) > tp->packets_out) {
2061	tp->sacked_out = tp->packets_out - holes;
2062	return true;
2063	}
2064	return false;
2065	}
2066
2067	/ If we receive more dupacks than we expected counting segments*
2068	* in assumption of absent reordering, interpret this as reordering.
2069	* The only another reason could be bug in receiver TCP.
2070	*/
2071	static void tcp_check_reno_reordering(struct sock sk, const* int addend)
2072	{
2073	struct tcp_sock *tp = tcp_sk(sk);
2074
2075	if (!tcp_limit_reno_sacked(tp))
2076	return;
2077
2078	tp->reordering = min_t(u32, tp->packets_out + addend,
2079	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
2080	tp->reord_seen++;
2081	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
2082	}
2083
2084	/ Emulate SACKs for SACKless connection: account for a new dupack. /
2085
2086	static void tcp_add_reno_sack(struct sock sk, int* num_dupack, bool ece_ack)
2087	{
2088	if (num_dupack) {
2089	struct tcp_sock *tp = tcp_sk(sk);
2090	u32 prior_sacked = tp->sacked_out;
2091	s32 delivered;
2092
2093	tp->sacked_out += num_dupack;
2094	tcp_check_reno_reordering(sk, addend: `0`);
2095	delivered = tp->sacked_out - prior_sacked;
2096	if (delivered > `0`)
2097	tcp_count_delivered(tp, delivered, ece_ack);
2098	tcp_verify_left_out(tp);
2099	}
2100	}
2101
2102	/ Account for ACK, ACKing some data in Reno Recovery phase. /
2103
2104	static void tcp_remove_reno_sacks(struct sock sk, int* acked, bool ece_ack)
2105	{
2106	struct tcp_sock *tp = tcp_sk(sk);
2107
2108	if (acked > `0`) {
2109	/ One ACK acked hole. The rest eat duplicate ACKs. /
2110	tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, `1`),
2111	ece_ack);
2112	if (acked - `1` >= tp->sacked_out)
2113	tp->sacked_out = `0`;
2114	else
2115	tp->sacked_out -= acked - `1`;
2116	}
2117	tcp_check_reno_reordering(sk, addend: acked);
2118	tcp_verify_left_out(tp);
2119	}
2120
2121	static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
2122	{
2123	tp->sacked_out = `0`;
2124	}
2125
2126	void tcp_clear_retrans(struct tcp_sock *tp)
2127	{
2128	tp->retrans_out = `0`;
2129	tp->lost_out = `0`;
2130	tp->undo_marker = `0`;
2131	tp->undo_retrans = -`1`;
2132	tp->sacked_out = `0`;
2133	tp->rto_stamp = `0`;
2134	tp->total_rto = `0`;
2135	tp->total_rto_recoveries = `0`;
2136	tp->total_rto_time = `0`;
2137	}
2138
2139	static inline void tcp_init_undo(struct tcp_sock *tp)
2140	{
2141	tp->undo_marker = tp->snd_una;
2142
2143	/ Retransmission still in flight may cause DSACKs later. /
2144	/ First, account for regular retransmits in flight: /
2145	tp->undo_retrans = tp->retrans_out;
2146	/ Next, account for TLP retransmits in flight: /
2147	if (tp->tlp_high_seq && tp->tlp_retrans)
2148	tp->undo_retrans++;
2149	/ Finally, avoid 0, because undo_retrans==0 means "can undo now": /
2150	if (!tp->undo_retrans)
2151	tp->undo_retrans = -`1`;
2152	}
2153
2154	static bool tcp_is_rack(const struct sock *sk)
2155	{
2156	return READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
2157	TCP_RACK_LOSS_DETECTION;
2158	}
2159
2160	/ If we detect SACK reneging, forget all SACK information*
2161	* and reset tags completely, otherwise preserve SACKs. If receiver
2162	* dropped its ofo queue, we will know this due to reneging detection.
2163	*/
2164	static void tcp_timeout_mark_lost(struct sock *sk)
2165	{
2166	struct tcp_sock *tp = tcp_sk(sk);
2167	struct sk_buff skb, head;
2168	bool is_reneg; / is receiver reneging on SACKs? /
2169
2170	head = tcp_rtx_queue_head(sk);
2171	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
2172	if (is_reneg) {
2173	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
2174	tp->sacked_out = `0`;
2175	/ Mark SACK reneging until we recover from this loss event. /
2176	tp->is_sack_reneg = `1`;
2177	} else if (tcp_is_reno(tp)) {
2178	tcp_reset_reno_sack(tp);
2179	}
2180
2181	skb = head;
2182	skb_rbtree_walk_from(skb) {
2183	if (is_reneg)
2184	TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
2185	else if (tcp_is_rack(sk) && skb != head &&
2186	tcp_rack_skb_timeout(tp, skb, reo_wnd: `0`) > `0`)
2187	continue; / Don't mark recently sent ones lost yet /
2188	tcp_mark_skb_lost(sk, skb);
2189	}
2190	tcp_verify_left_out(tp);
2191	tcp_clear_all_retrans_hints(tp);
2192	}
2193
2194	/ Enter Loss state. /
2195	void tcp_enter_loss(struct sock *sk)
2196	{
2197	const struct inet_connection_sock *icsk = inet_csk(sk);
2198	struct tcp_sock *tp = tcp_sk(sk);
2199	struct net *net = sock_net(sk);
2200	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
2201	u8 reordering;
2202
2203	tcp_timeout_mark_lost(sk);
2204
2205	/ Reduce ssthresh if it has not yet been made inside this window. /
2206	if (icsk->icsk_ca_state <= TCP_CA_Disorder \|\|
2207	!after(tp->high_seq, tp->snd_una) \|\|
2208	(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
2209	tp->prior_ssthresh = tcp_current_ssthresh(sk);
2210	tp->prior_cwnd = tcp_snd_cwnd(tp);
2211	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
2212	tcp_ca_event(sk, event: CA_EVENT_LOSS);
2213	tcp_init_undo(tp);
2214	}
2215	tcp_snd_cwnd_set(tp, val: tcp_packets_in_flight(tp) + `1`);
2216	tp->snd_cwnd_cnt = `0`;
2217	tp->snd_cwnd_stamp = tcp_jiffies32;
2218
2219	/ Timeout in disordered state after receiving substantial DUPACKs*
2220	* suggests that the degree of reordering is over-estimated.
2221	*/
2222	reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
2223	if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
2224	tp->sacked_out >= reordering)
2225	tp->reordering = min_t(unsigned int, tp->reordering,
2226	reordering);
2227
2228	tcp_set_ca_state(sk, ca_state: TCP_CA_Loss);
2229	tp->high_seq = tp->snd_nxt;
2230	tp->tlp_high_seq = `0`;
2231	tcp_ecn_queue_cwr(tp);
2232
2233	/ F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous*
2234	* loss recovery is underway except recurring timeout(s) on
2235	* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
2236	*/
2237	tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
2238	(new_recovery \|\| icsk->icsk_retransmits) &&
2239	!inet_csk(sk)->icsk_mtup.probe_size;
2240	}
2241
2242	/ If ACK arrived pointing to a remembered SACK, it means that our*
2243	* remembered SACKs do not reflect real state of receiver i.e.
2244	* receiver _host_ is heavily congested (or buggy).
2245	*
2246	* To avoid big spurious retransmission bursts due to transient SACK
2247	* scoreboard oddities that look like reneging, we give the receiver a
2248	* little time (max(RTT/2, 10ms)) to send us some more ACKs that will
2249	* restore sanity to the SACK scoreboard. If the apparent reneging
2250	* persists until this RTO then we'll clear the SACK scoreboard.
2251	*/
2252	static bool tcp_check_sack_reneging(struct sock sk, int* *ack_flag)
2253	{
2254	if (*ack_flag & FLAG_SACK_RENEGING &&
2255	*ack_flag & FLAG_SND_UNA_ADVANCED) {
2256	struct tcp_sock *tp = tcp_sk(sk);
2257	unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> `4`),
2258	msecs_to_jiffies(`10`));
2259
2260	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, when: delay, pace_delay: false);
2261	*ack_flag &= ~FLAG_SET_XMIT_TIMER;
2262	return true;
2263	}
2264	return false;
2265	}
2266
2267	/ Heurestics to calculate number of duplicate ACKs. There's no dupACKs*
2268	* counter when SACK is enabled (without SACK, sacked_out is used for
2269	* that purpose).
2270	*
2271	* With reordering, holes may still be in flight, so RFC3517 recovery
2272	* uses pure sacked_out (total number of SACKed segments) even though
2273	* it violates the RFC that uses duplicate ACKs, often these are equal
2274	* but when e.g. out-of-window ACKs or packet duplication occurs,
2275	* they differ. Since neither occurs due to loss, TCP should really
2276	* ignore them.
2277	*/
2278	static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
2279	{
2280	return tp->sacked_out + `1`;
2281	}
2282
/* Linux NewReno/SACK/ECN state machine.
2284	* --------------------------------------
2285	*
2286	* "Open" Normal state, no dubious events, fast path.
2287	* "Disorder" In all the respects it is "Open",
2288	* but requires a bit more attention. It is entered when
2289	* we see some SACKs or dupacks. It is split of "Open"
2290	* mainly to move some processing from fast path to slow one.
2291	* "CWR" CWND was reduced due to some Congestion Notification event.
2292	* It can be ECN, ICMP source quench, local device congestion.
2293	* "Recovery" CWND was reduced, we are fast-retransmitting.
2294	* "Loss" CWND was reduced due to RTO timeout or SACK reneging.
2295	*
2296	* tcp_fastretrans_alert() is entered:
2297	* - each incoming ACK, if state is not "Open"
2298	* - when arrived ACK is unusual, namely:
2299	* * SACK
2300	* * Duplicate ACK.
2301	* * ECN ECE.
2302	*
2303	* Counting packets in flight is pretty simple.
2304	*
2305	* in_flight = packets_out - left_out + retrans_out
2306	*
2307	* packets_out is SND.NXT-SND.UNA counted in packets.
2308	*
2309	* retrans_out is number of retransmitted segments.
2310	*
2311	* left_out is number of segments left network, but not ACKed yet.
2312	*
2313	* left_out = sacked_out + lost_out
2314	*
2315	* sacked_out: Packets, which arrived to receiver out of order
2316	* and hence not ACKed. With SACKs this number is simply
2317	* amount of SACKed data. Even without SACKs
2318	* it is easy to give pretty reliable estimate of this number,
2319	* counting duplicate ACKs.
2320	*
2321	* lost_out: Packets lost by network. TCP has no explicit
2322	* "loss notification" feedback from network (for now).
2323	* It means that this number can be only _guessed_.
2324	* Actually, it is the heuristics to predict lossage that
2325	* distinguishes different algorithms.
2326	*
2327	* F.e. after RTO, when all the queue is considered as lost,
2328	* lost_out = packets_out and in_flight = retrans_out.
2329	*
2330	* Essentially, we have now a few algorithms detecting
2331	* lost packets.
2332	*
2333	* If the receiver supports SACK:
2334	*
 *		RFC6675/3517: It is the conventional algorithm. A packet is
 *		considered lost if the number of higher sequence packets
 *		SACKed is greater than or equal to the DUPACK threshold
 *		(reordering). This is implemented in tcp_mark_head_lost and
 *		tcp_update_scoreboard.
2340	*
2341	* RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
2342	* (2017-) that checks timing instead of counting DUPACKs.
2343	* Essentially a packet is considered lost if it's not S/ACKed
2344	* after RTT + reordering_window, where both metrics are
2345	* dynamically measured and adjusted. This is implemented in
2346	* tcp_rack_mark_lost.
2347	*
2348	* If the receiver does not support SACK:
2349	*
 *		NewReno (RFC6582): in Recovery we assume that one segment
 *		is lost (classic Reno). While we are in Recovery and
 *		a partial ACK arrives, we assume that one more packet
 *		is lost (NewReno). These heuristics are the same in NewReno
 *		and SACK.
2355	*
2356	* Really tricky (and requiring careful tuning) part of algorithm
2357	* is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
2358	* The first determines the moment _when_ we should reduce CWND and,
2359	* hence, slow down forward transmission. In fact, it determines the moment
2360	* when we decide that hole is caused by loss, rather than by a reorder.
2361	*
2362	* tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
2363	* holes, caused by lost packets.
2364	*
2365	* And the most logically complicated part of algorithm is undo
2366	* heuristics. We detect false retransmits due to both too early
2367	* fast retransmit (reordering) and underestimated RTO, analyzing
2368	* timestamps and D-SACKs. When we detect that some segments were
2369	* retransmitted by mistake and CWND reduction was wrong, we undo
2370	* window reduction and abort recovery phase. This logic is hidden
2371	* inside several functions named tcp_try_undo_<something>.
2372	*/
2373
2374	/ This function decides, when we should leave Disordered state*
2375	* and enter Recovery phase, reducing congestion window.
2376	*
2377	* Main question: may we further continue forward transmission
2378	* with the same cwnd?
2379	*/
2380	static bool tcp_time_to_recover(struct sock sk, int* flag)
2381	{
2382	struct tcp_sock *tp = tcp_sk(sk);
2383
2384	/ Trick#1: The loss is proven. /
2385	if (tp->lost_out)
2386	return true;
2387
2388	/ Not-A-Trick#2 : Classic rule... /
2389	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
2390	return true;
2391
2392	return false;
2393	}
2394
2395	/ Detect loss in event "A" above by marking head of queue up as lost.*
2396	* For RFC3517 SACK, a segment is considered lost if it
2397	* has at least tp->reordering SACKed seqments above it; "packets" refers to
2398	* the maximum SACKed segments to pass before reaching this limit.
2399	*/
2400	static void tcp_mark_head_lost(struct sock sk, int* packets, int mark_head)
2401	{
2402	struct tcp_sock *tp = tcp_sk(sk);
2403	struct sk_buff *skb;
2404	int cnt;
2405	/ Use SACK to deduce losses of new sequences sent during recovery /
2406	const u32 loss_high = tp->snd_nxt;
2407
2408	WARN_ON(packets > tp->packets_out);
2409	skb = tp->lost_skb_hint;
2410	if (skb) {
2411	/ Head already handled? /
2412	if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
2413	return;
2414	cnt = tp->lost_cnt_hint;
2415	} else {
2416	skb = tcp_rtx_queue_head(sk);
2417	cnt = `0`;
2418	}
2419
2420	skb_rbtree_walk_from(skb) {
2421	/ TODO: do this better /
2422	/ this is not the most efficient way to do this... /
2423	tp->lost_skb_hint = skb;
2424	tp->lost_cnt_hint = cnt;
2425
2426	if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
2427	break;
2428
2429	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
2430	cnt += tcp_skb_pcount(skb);
2431
2432	if (cnt > packets)
2433	break;
2434
2435	if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
2436	tcp_mark_skb_lost(sk, skb);
2437
2438	if (mark_head)
2439	break;
2440	}
2441	tcp_verify_left_out(tp);
2442	}
2443
2444	/ Account newly detected lost packet(s) /
2445
2446	static void tcp_update_scoreboard(struct sock sk, int* fast_rexmit)
2447	{
2448	struct tcp_sock *tp = tcp_sk(sk);
2449
2450	if (tcp_is_sack(tp)) {
2451	int sacked_upto = tp->sacked_out - tp->reordering;
2452	if (sacked_upto >= `0`)
2453	tcp_mark_head_lost(sk, packets: sacked_upto, mark_head: `0`);
2454	else if (fast_rexmit)
2455	tcp_mark_head_lost(sk, packets: `1`, mark_head: `1`);
2456	}
2457	}
2458
2459	static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
2460	{
2461	return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
2462	before(seq1: tp->rx_opt.rcv_tsecr, seq2: when);
2463	}
2464
2465	/ skb is spurious retransmitted if the returned timestamp echo*
2466	* reply is prior to the skb transmission time
2467	*/
2468	static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
2469	const struct sk_buff *skb)
2470	{
2471	return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
2472	tcp_tsopt_ecr_before(tp, when: tcp_skb_timestamp_ts(usec_ts: tp->tcp_usec_ts, skb));
2473	}
2474
2475	/ Nothing was retransmitted or returned timestamp is less*
2476	* than timestamp of the first retransmission.
2477	*/
2478	static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
2479	{
2480	const struct sock sk = (const* struct sock *)tp;
2481
2482	if (tp->retrans_stamp &&
2483	tcp_tsopt_ecr_before(tp, when: tp->retrans_stamp))
2484	return true; / got echoed TS before first retransmission /
2485
2486	/ Check if nothing was retransmitted (retrans_stamp==0), which may*
2487	* happen in fast recovery due to TSQ. But we ignore zero retrans_stamp
2488	* in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear
2489	* retrans_stamp even if we had retransmitted the SYN.
2490	*/
2491	if (!tp->retrans_stamp && / no record of a retransmit/SYN? /
2492	sk->sk_state != TCP_SYN_SENT) / not the FLAG_SYN_ACKED case? /
2493	return true; / nothing was retransmitted /
2494
2495	return false;
2496	}
2497
2498	/ Undo procedures. /
2499
2500	/ We can clear retrans_stamp when there are no retransmissions in the*
2501	* window. It would seem that it is trivially available for us in
2502	* tp->retrans_out, however, that kind of assumptions doesn't consider
2503	* what will happen if errors occur when sending retransmission for the
2504	* second time. ...It could the that such segment has only
2505	* TCPCB_EVER_RETRANS set at the present time. It seems that checking
2506	* the head skb is enough except for some reneging corner cases that
2507	* are not worth the effort.
2508	*
2509	* Main reason for all this complexity is the fact that connection dying
2510	* time now depends on the validity of the retrans_stamp, in particular,
2511	* that successive retransmissions of a segment must not advance
2512	* retrans_stamp under any conditions.
2513	*/
2514	static bool tcp_any_retrans_done(const struct sock *sk)
2515	{
2516	const struct tcp_sock *tp = tcp_sk(sk);
2517	struct sk_buff *skb;
2518
2519	if (tp->retrans_out)
2520	return true;
2521
2522	skb = tcp_rtx_queue_head(sk);
2523	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
2524	return true;
2525
2526	return false;
2527	}
2528
2529	/ If loss recovery is finished and there are no retransmits out in the*
2530	* network, then we clear retrans_stamp so that upon the next loss recovery
2531	* retransmits_timed_out() and timestamp-undo are using the correct value.
2532	*/
2533	static void tcp_retrans_stamp_cleanup(struct sock *sk)
2534	{
2535	if (!tcp_any_retrans_done(sk))
2536	tcp_sk(sk)->retrans_stamp = `0`;
2537	}
2538
/* Debug trace for undo events, tagged with @msg.
 *
 * Compiles to an empty function unless FASTRETRANS_DEBUG > 1. The IPv6
 * branch is only built when CONFIG_IPV6 is enabled.
 */
static void DBGUNDO(struct sock *sk, const char *msg)
{
#if FASTRETRANS_DEBUG > 1
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);

	if (sk->sk_family == AF_INET) {
		pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
			 msg,
			 &inet->inet_daddr, ntohs(inet->inet_dport),
			 tcp_snd_cwnd(tp), tcp_left_out(tp),
			 tp->snd_ssthresh, tp->prior_ssthresh,
			 tp->packets_out);
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (sk->sk_family == AF_INET6) {
		pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
			 msg,
			 &sk->sk_v6_daddr, ntohs(inet->inet_dport),
			 tcp_snd_cwnd(tp), tcp_left_out(tp),
			 tp->snd_ssthresh, tp->prior_ssthresh,
			 tp->packets_out);
	}
#endif
#endif
}
2565
2566	static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
2567	{
2568	struct tcp_sock *tp = tcp_sk(sk);
2569
2570	if (unmark_loss) {
2571	struct sk_buff *skb;
2572
2573	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2574	TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
2575	}
2576	tp->lost_out = `0`;
2577	tcp_clear_all_retrans_hints(tp);
2578	}
2579
2580	if (tp->prior_ssthresh) {
2581	const struct inet_connection_sock *icsk = inet_csk(sk);
2582
2583	tcp_snd_cwnd_set(tp, val: icsk->icsk_ca_ops->undo_cwnd(sk));
2584
2585	if (tp->prior_ssthresh > tp->snd_ssthresh) {
2586	tp->snd_ssthresh = tp->prior_ssthresh;
2587	tcp_ecn_withdraw_cwr(tp);
2588	}
2589	}
2590	tp->snd_cwnd_stamp = tcp_jiffies32;
2591	tp->undo_marker = `0`;
2592	tp->rack.advanced = `1`; / Force RACK to re-exam losses /
2593	}
2594
2595	static inline bool tcp_may_undo(const struct tcp_sock *tp)
2596	{
2597	return tp->undo_marker && (!tp->undo_retrans \|\| tcp_packet_delayed(tp));
2598	}
2599
2600	static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
2601	{
2602	struct tcp_sock *tp = tcp_sk(sk);
2603
2604	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
2605	/ Hold old state until something above high_seq*
2606	* is ACKed. For Reno it is MUST to prevent false
2607	* fast retransmits (RFC2582). SACK TCP is safe. */
2608	if (!tcp_any_retrans_done(sk))
2609	tp->retrans_stamp = `0`;
2610	return true;
2611	}
2612	return false;
2613	}
2614
2615	/ People celebrate: "We love our President!" /
2616	static bool tcp_try_undo_recovery(struct sock *sk)
2617	{
2618	struct tcp_sock *tp = tcp_sk(sk);
2619
2620	if (tcp_may_undo(tp)) {
2621	int mib_idx;
2622
2623	/ Happy end! We did not retransmit anything*
2624	* or our original transmission succeeded.
2625	*/
2626	DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
2627	tcp_undo_cwnd_reduction(sk, unmark_loss: false);
2628	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
2629	mib_idx = LINUX_MIB_TCPLOSSUNDO;
2630	else
2631	mib_idx = LINUX_MIB_TCPFULLUNDO;
2632
2633	NET_INC_STATS(sock_net(sk), mib_idx);
2634	} else if (tp->rack.reo_wnd_persist) {
2635	tp->rack.reo_wnd_persist--;
2636	}
2637	if (tcp_is_non_sack_preventing_reopen(sk))
2638	return true;
2639	tcp_set_ca_state(sk, ca_state: TCP_CA_Open);
2640	tp->is_sack_reneg = `0`;
2641	return false;
2642	}
2643
2644	/ Try to undo cwnd reduction, because D-SACKs acked all retransmitted data /
2645	static bool tcp_try_undo_dsack(struct sock *sk)
2646	{
2647	struct tcp_sock *tp = tcp_sk(sk);
2648
2649	if (tp->undo_marker && !tp->undo_retrans) {
2650	tp->rack.reo_wnd_persist = min(TCP_RACK_RECOVERY_THRESH,
2651	tp->rack.reo_wnd_persist + `1`);
2652	DBGUNDO(sk, msg: "D-SACK");
2653	tcp_undo_cwnd_reduction(sk, unmark_loss: false);
2654	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
2655	return true;
2656	}
2657	return false;
2658	}
2659
2660	/ Undo during loss recovery after partial ACK or using F-RTO. /
2661	static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
2662	{
2663	struct tcp_sock *tp = tcp_sk(sk);
2664
2665	if (frto_undo \|\| tcp_may_undo(tp)) {
2666	tcp_undo_cwnd_reduction(sk, unmark_loss: true);
2667
2668	DBGUNDO(sk, msg: "partial loss");
2669	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
2670	if (frto_undo)
2671	NET_INC_STATS(sock_net(sk),
2672	LINUX_MIB_TCPSPURIOUSRTOS);
2673	inet_csk(sk)->icsk_retransmits = `0`;
2674	if (tcp_is_non_sack_preventing_reopen(sk))
2675	return true;
2676	if (frto_undo \|\| tcp_is_sack(tp)) {
2677	tcp_set_ca_state(sk, ca_state: TCP_CA_Open);
2678	tp->is_sack_reneg = `0`;
2679	}
2680	return true;
2681	}
2682	return false;
2683	}
2684
2685	/ The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937.*
2686	* It computes the number of packets to send (sndcnt) based on packets newly
2687	* delivered:
2688	* 1) If the packets in flight is larger than ssthresh, PRR spreads the
2689	* cwnd reductions across a full RTT.
2690	* 2) Otherwise PRR uses packet conservation to send as much as delivered.
2691	* But when SND_UNA is acked without further losses,
2692	* slow starts cwnd up to ssthresh to speed up the recovery.
2693	*/
2694	static void tcp_init_cwnd_reduction(struct sock *sk)
2695	{
2696	struct tcp_sock *tp = tcp_sk(sk);
2697
2698	tp->high_seq = tp->snd_nxt;
2699	tp->tlp_high_seq = `0`;
2700	tp->snd_cwnd_cnt = `0`;
2701	tp->prior_cwnd = tcp_snd_cwnd(tp);
2702	tp->prr_delivered = `0`;
2703	tp->prr_out = `0`;
2704	tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
2705	tcp_ecn_queue_cwr(tp);
2706	}
2707
2708	void tcp_cwnd_reduction(struct sock sk, int* newly_acked_sacked, int newly_lost, int flag)
2709	{
2710	struct tcp_sock *tp = tcp_sk(sk);
2711	int sndcnt = `0`;
2712	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
2713
2714	if (newly_acked_sacked <= `0` \|\| WARN_ON_ONCE(!tp->prior_cwnd))
2715	return;
2716
2717	trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag);
2718
2719	tp->prr_delivered += newly_acked_sacked;
2720	if (delta < `0`) {
2721	u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
2722	tp->prior_cwnd - `1`;
2723	sndcnt = div_u64(dividend, divisor: tp->prior_cwnd) - tp->prr_out;
2724	} else {
2725	sndcnt = max_t(int, tp->prr_delivered - tp->prr_out,
2726	newly_acked_sacked);
2727	if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
2728	sndcnt++;
2729	sndcnt = min(delta, sndcnt);
2730	}
2731	/ Force a fast retransmit upon entering fast recovery /
2732	sndcnt = max(sndcnt, (tp->prr_out ? `0` : `1`));
2733	tcp_snd_cwnd_set(tp, val: tcp_packets_in_flight(tp) + sndcnt);
2734	}
2735
2736	static inline void tcp_end_cwnd_reduction(struct sock *sk)
2737	{
2738	struct tcp_sock *tp = tcp_sk(sk);
2739
2740	if (inet_csk(sk)->icsk_ca_ops->cong_control)
2741	return;
2742
2743	/ Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) /
2744	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
2745	(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR \|\| tp->undo_marker)) {
2746	tcp_snd_cwnd_set(tp, val: tp->snd_ssthresh);
2747	tp->snd_cwnd_stamp = tcp_jiffies32;
2748	}
2749	tcp_ca_event(sk, event: CA_EVENT_COMPLETE_CWR);
2750	}
2751
2752	/ Enter CWR state. Disable cwnd undo since congestion is proven with ECN /
2753	void tcp_enter_cwr(struct sock *sk)
2754	{
2755	struct tcp_sock *tp = tcp_sk(sk);
2756
2757	tp->prior_ssthresh = `0`;
2758	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
2759	tp->undo_marker = `0`;
2760	tcp_init_cwnd_reduction(sk);
2761	tcp_set_ca_state(sk, ca_state: TCP_CA_CWR);
2762	}
2763	}
2764	EXPORT_SYMBOL(tcp_enter_cwr);
2765
2766	static void tcp_try_keep_open(struct sock *sk)
2767	{
2768	struct tcp_sock *tp = tcp_sk(sk);
2769	int state = TCP_CA_Open;
2770
2771	if (tcp_left_out(tp) \|\| tcp_any_retrans_done(sk))
2772	state = TCP_CA_Disorder;
2773
2774	if (inet_csk(sk)->icsk_ca_state != state) {
2775	tcp_set_ca_state(sk, ca_state: state);
2776	tp->high_seq = tp->snd_nxt;
2777	}
2778	}
2779
2780	static void tcp_try_to_open(struct sock sk, int* flag)
2781	{
2782	struct tcp_sock *tp = tcp_sk(sk);
2783
2784	tcp_verify_left_out(tp);
2785
2786	if (!tcp_any_retrans_done(sk))
2787	tp->retrans_stamp = `0`;
2788
2789	if (flag & FLAG_ECE)
2790	tcp_enter_cwr(sk);
2791
2792	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
2793	tcp_try_keep_open(sk);
2794	}
2795	}
2796
2797	static void tcp_mtup_probe_failed(struct sock *sk)
2798	{
2799	struct inet_connection_sock *icsk = inet_csk(sk);
2800
2801	icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - `1`;
2802	icsk->icsk_mtup.probe_size = `0`;
2803	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
2804	}
2805
2806	static void tcp_mtup_probe_success(struct sock *sk)
2807	{
2808	struct tcp_sock *tp = tcp_sk(sk);
2809	struct inet_connection_sock *icsk = inet_csk(sk);
2810	u64 val;
2811
2812	tp->prior_ssthresh = tcp_current_ssthresh(sk);
2813
2814	val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, mss: tp->mss_cache);
2815	do_div(val, icsk->icsk_mtup.probe_size);
2816	DEBUG_NET_WARN_ON_ONCE((u32)val != val);
2817	tcp_snd_cwnd_set(tp, max_t(u32, `1U`, val));
2818
2819	tp->snd_cwnd_cnt = `0`;
2820	tp->snd_cwnd_stamp = tcp_jiffies32;
2821	tp->snd_ssthresh = tcp_current_ssthresh(sk);
2822
2823	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
2824	icsk->icsk_mtup.probe_size = `0`;
2825	tcp_sync_mss(sk, pmtu: icsk->icsk_pmtu_cookie);
2826	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
2827	}
2828
2829	/ Sometimes we deduce that packets have been dropped due to reasons other than*
2830	* congestion, like path MTU reductions or failed client TFO attempts. In these
2831	* cases we call this function to retransmit as many packets as cwnd allows,
2832	* without reducing cwnd. Given that retransmits will set retrans_stamp to a
2833	* non-zero value (and may do so in a later calling context due to TSQ), we
2834	* also enter CA_Loss so that we track when all retransmitted packets are ACKed
2835	* and clear retrans_stamp when that happens (to ensure later recurring RTOs
2836	* are using the correct retrans_stamp and don't declare ETIMEDOUT
2837	* prematurely).
2838	*/
2839	static void tcp_non_congestion_loss_retransmit(struct sock *sk)
2840	{
2841	const struct inet_connection_sock *icsk = inet_csk(sk);
2842	struct tcp_sock *tp = tcp_sk(sk);
2843
2844	if (icsk->icsk_ca_state != TCP_CA_Loss) {
2845	tp->high_seq = tp->snd_nxt;
2846	tp->snd_ssthresh = tcp_current_ssthresh(sk);
2847	tp->prior_ssthresh = `0`;
2848	tp->undo_marker = `0`;
2849	tcp_set_ca_state(sk, ca_state: TCP_CA_Loss);
2850	}
2851	tcp_xmit_retransmit_queue(sk);
2852	}
2853
2854	/ Do a simple retransmit without using the backoff mechanisms in*
2855	* tcp_timer. This is used for path mtu discovery.
2856	* The socket is already locked here.
2857	*/
2858	void tcp_simple_retransmit(struct sock *sk)
2859	{
2860	struct tcp_sock *tp = tcp_sk(sk);
2861	struct sk_buff *skb;
2862	int mss;
2863
2864	/ A fastopen SYN request is stored as two separate packets within*
2865	* the retransmit queue, this is done by tcp_send_syn_data().
2866	* As a result simply checking the MSS of the frames in the queue
2867	* will not work for the SYN packet.
2868	*
2869	* Us being here is an indication of a path MTU issue so we can
2870	* assume that the fastopen SYN was lost and just mark all the
2871	* frames in the retransmit queue as lost. We will use an MSS of
2872	* -1 to mark all frames as lost, otherwise compute the current MSS.
2873	*/
2874	if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
2875	mss = -`1`;
2876	else
2877	mss = tcp_current_mss(sk);
2878
2879	skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
2880	if (tcp_skb_seglen(skb) > mss)
2881	tcp_mark_skb_lost(sk, skb);
2882	}
2883
2884	tcp_clear_retrans_hints_partial(tp);
2885
2886	if (!tp->lost_out)
2887	return;
2888
2889	if (tcp_is_reno(tp))
2890	tcp_limit_reno_sacked(tp);
2891
2892	tcp_verify_left_out(tp);
2893
2894	/ Don't muck with the congestion window here.*
2895	* Reason is that we do not increase amount of _data_
2896	* in network, but units changed and effective
2897	* cwnd/ssthresh really reduced now.
2898	*/
2899	tcp_non_congestion_loss_retransmit(sk);
2900	}
2901	EXPORT_IPV6_MOD(tcp_simple_retransmit);
2902
2903	void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2904	{
2905	struct tcp_sock *tp = tcp_sk(sk);
2906	int mib_idx;
2907
2908	/ Start the clock with our fast retransmit, for undo and ETIMEDOUT. /
2909	tcp_retrans_stamp_cleanup(sk);
2910
2911	if (tcp_is_reno(tp))
2912	mib_idx = LINUX_MIB_TCPRENORECOVERY;
2913	else
2914	mib_idx = LINUX_MIB_TCPSACKRECOVERY;
2915
2916	NET_INC_STATS(sock_net(sk), mib_idx);
2917
2918	tp->prior_ssthresh = `0`;
2919	tcp_init_undo(tp);
2920
2921	if (!tcp_in_cwnd_reduction(sk)) {
2922	if (!ece_ack)
2923	tp->prior_ssthresh = tcp_current_ssthresh(sk);
2924	tcp_init_cwnd_reduction(sk);
2925	}
2926	tcp_set_ca_state(sk, ca_state: TCP_CA_Recovery);
2927	}
2928
2929	static void tcp_update_rto_time(struct tcp_sock *tp)
2930	{
2931	if (tp->rto_stamp) {
2932	tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
2933	tp->rto_stamp = `0`;
2934	}
2935	}
2936
2937	/ Process an ACK in CA_Loss state. Move to CA_Open if lost data are*
2938	* recovered or spurious. Otherwise retransmits more on partial ACKs.
2939	*/
2940	static void tcp_process_loss(struct sock sk, int* flag, int num_dupack,
2941	int *rexmit)
2942	{
2943	struct tcp_sock *tp = tcp_sk(sk);
2944	bool recovered = !before(seq1: tp->snd_una, seq2: tp->high_seq);
2945
2946	if ((flag & FLAG_SND_UNA_ADVANCED \|\| rcu_access_pointer(tp->fastopen_rsk)) &&
2947	tcp_try_undo_loss(sk, frto_undo: false))
2948	return;
2949
2950	if (tp->frto) { / F-RTO RFC5682 sec 3.1 (sack enhanced version). /
2951	/ Step 3.b. A timeout is spurious if not all data are*
2952	* lost, i.e., never-retransmitted data are (s)acked.
2953	*/
2954	if ((flag & FLAG_ORIG_SACK_ACKED) &&
2955	tcp_try_undo_loss(sk, frto_undo: true))
2956	return;
2957
2958	if (after(tp->snd_nxt, tp->high_seq)) {
2959	if (flag & FLAG_DATA_SACKED \|\| num_dupack)
2960	tp->frto = `0`; / Step 3.a. loss was real /
2961	} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
2962	tp->high_seq = tp->snd_nxt;
2963	/ Step 2.b. Try send new data (but deferred until cwnd*
2964	* is updated in tcp_ack()). Otherwise fall back to
2965	* the conventional recovery.
2966	*/
2967	if (!tcp_write_queue_empty(sk) &&
2968	after(tcp_wnd_end(tp), tp->snd_nxt)) {
2969	*rexmit = REXMIT_NEW;
2970	return;
2971	}
2972	tp->frto = `0`;
2973	}
2974	}
2975
2976	if (recovered) {
2977	/ F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a /
2978	tcp_try_undo_recovery(sk);
2979	return;
2980	}
2981	if (tcp_is_reno(tp)) {
2982	/ A Reno DUPACK means new data in F-RTO step 2.b above are*
2983	* delivered. Lower inflight to clock out (re)transmissions.
2984	*/
2985	if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
2986	tcp_add_reno_sack(sk, num_dupack, ece_ack: flag & FLAG_ECE);
2987	else if (flag & FLAG_SND_UNA_ADVANCED)
2988	tcp_reset_reno_sack(tp);
2989	}
2990	*rexmit = REXMIT_LOST;
2991	}
2992
2993	static bool tcp_force_fast_retransmit(struct sock *sk)
2994	{
2995	struct tcp_sock *tp = tcp_sk(sk);
2996
2997	return after(tcp_highest_sack_seq(tp),
2998	tp->snd_una + tp->reordering * tp->mss_cache);
2999	}
3000
3001	/ Undo during fast recovery after partial ACK. /
3002	static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
3003	bool *do_lost)
3004	{
3005	struct tcp_sock *tp = tcp_sk(sk);
3006
3007	if (tp->undo_marker && tcp_packet_delayed(tp)) {
3008	/ Plain luck! Hole if filled with delayed*
3009	* packet, rather than with a retransmit. Check reordering.
3010	*/
3011	tcp_check_sack_reordering(sk, low_seq: prior_snd_una, ts: `1`);
3012
3013	/ We are getting evidence that the reordering degree is higher*
3014	* than we realized. If there are no retransmits out then we
3015	* can undo. Otherwise we clock out new packets but do not
3016	* mark more packets lost or retransmit more.
3017	*/
3018	if (tp->retrans_out)
3019	return true;
3020
3021	if (!tcp_any_retrans_done(sk))
3022	tp->retrans_stamp = `0`;
3023
3024	DBGUNDO(sk, msg: "partial recovery");
3025	tcp_undo_cwnd_reduction(sk, unmark_loss: true);
3026	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
3027	tcp_try_keep_open(sk);
3028	} else {
3029	/ Partial ACK arrived. Force fast retransmit. /
3030	*do_lost = tcp_force_fast_retransmit(sk);
3031	}
3032	return false;
3033	}
3034
3035	static void tcp_identify_packet_loss(struct sock sk, int* *ack_flag)
3036	{
3037	struct tcp_sock *tp = tcp_sk(sk);
3038
3039	if (tcp_rtx_queue_empty(sk))
3040	return;
3041
3042	if (unlikely(tcp_is_reno(tp))) {
3043	tcp_newreno_mark_lost(sk, snd_una_advanced: *ack_flag & FLAG_SND_UNA_ADVANCED);
3044	} else if (tcp_is_rack(sk)) {
3045	u32 prior_retrans = tp->retrans_out;
3046
3047	if (tcp_rack_mark_lost(sk))
3048	*ack_flag &= ~FLAG_SET_XMIT_TIMER;
3049	if (prior_retrans > tp->retrans_out)
3050	*ack_flag \|= FLAG_LOST_RETRANS;
3051	}
3052	}
3053
3054	/ Process an event, which can update packets-in-flight not trivially.*
3055	* Main goal of this function is to calculate new estimate for left_out,
3056	* taking into account both packets sitting in receiver's buffer and
3057	* packets lost by network.
3058	*
3059	* Besides that it updates the congestion state when packet loss or ECN
3060	* is detected. But it does not reduce the cwnd, it is done by the
3061	* congestion control later.
3062	*
3063	* It does _not_ decide what to send, it is made in function
3064	* tcp_xmit_retransmit_queue().
3065	*/
3066	static void tcp_fastretrans_alert(struct sock sk, const* u32 prior_snd_una,
3067	int num_dupack, int ack_flag, int* *rexmit)
3068	{
3069	struct inet_connection_sock *icsk = inet_csk(sk);
3070	struct tcp_sock *tp = tcp_sk(sk);
3071	int fast_rexmit = `0`, flag = *ack_flag;
3072	bool ece_ack = flag & FLAG_ECE;
3073	bool do_lost = num_dupack \|\| ((flag & FLAG_DATA_SACKED) &&
3074	tcp_force_fast_retransmit(sk));
3075
3076	if (!tp->packets_out && tp->sacked_out)
3077	tp->sacked_out = `0`;
3078
3079	/ Now state machine starts.*
3080	* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
3081	if (ece_ack)
3082	tp->prior_ssthresh = `0`;
3083
3084	/ B. In all the states check for reneging SACKs. /
3085	if (tcp_check_sack_reneging(sk, ack_flag))
3086	return;
3087
3088	/ C. Check consistency of the current state. /
3089	tcp_verify_left_out(tp);
3090
3091	/ D. Check state exit conditions. State can be terminated*
3092	* when high_seq is ACKed. */
3093	if (icsk->icsk_ca_state == TCP_CA_Open) {
3094	WARN_ON(tp->retrans_out != `0` && !tp->syn_data);
3095	tp->retrans_stamp = `0`;
3096	} else if (!before(seq1: tp->snd_una, seq2: tp->high_seq)) {
3097	switch (icsk->icsk_ca_state) {
3098	case TCP_CA_CWR:
3099	/ CWR is to be held something above high_seq*
3100	* is ACKed for CWR bit to reach receiver. */
3101	if (tp->snd_una != tp->high_seq) {
3102	tcp_end_cwnd_reduction(sk);
3103	tcp_set_ca_state(sk, ca_state: TCP_CA_Open);
3104	}
3105	break;
3106
3107	case TCP_CA_Recovery:
3108	if (tcp_is_reno(tp))
3109	tcp_reset_reno_sack(tp);
3110	if (tcp_try_undo_recovery(sk))
3111	return;
3112	tcp_end_cwnd_reduction(sk);
3113	break;
3114	}
3115	}
3116
3117	/ E. Process state. /
3118	switch (icsk->icsk_ca_state) {
3119	case TCP_CA_Recovery:
3120	if (!(flag & FLAG_SND_UNA_ADVANCED)) {
3121	if (tcp_is_reno(tp))
3122	tcp_add_reno_sack(sk, num_dupack, ece_ack);
3123	} else if (tcp_try_undo_partial(sk, prior_snd_una, do_lost: &do_lost))
3124	return;
3125
3126	if (tcp_try_undo_dsack(sk))
3127	tcp_try_to_open(sk, flag);
3128
3129	tcp_identify_packet_loss(sk, ack_flag);
3130	if (icsk->icsk_ca_state != TCP_CA_Recovery) {
3131	if (!tcp_time_to_recover(sk, flag))
3132	return;
3133	/ Undo reverts the recovery state. If loss is evident,*
3134	* starts a new recovery (e.g. reordering then loss);
3135	*/
3136	tcp_enter_recovery(sk, ece_ack);
3137	}
3138	break;
3139	case TCP_CA_Loss:
3140	tcp_process_loss(sk, flag, num_dupack, rexmit);
3141	if (icsk->icsk_ca_state != TCP_CA_Loss)
3142	tcp_update_rto_time(tp);
3143	tcp_identify_packet_loss(sk, ack_flag);
3144	if (!(icsk->icsk_ca_state == TCP_CA_Open \|\|
3145	(*ack_flag & FLAG_LOST_RETRANS)))
3146	return;
3147	/ Change state if cwnd is undone or retransmits are lost /
3148	fallthrough;
3149	default:
3150	if (tcp_is_reno(tp)) {
3151	if (flag & FLAG_SND_UNA_ADVANCED)
3152	tcp_reset_reno_sack(tp);
3153	tcp_add_reno_sack(sk, num_dupack, ece_ack);
3154	}
3155
3156	if (icsk->icsk_ca_state <= TCP_CA_Disorder)
3157	tcp_try_undo_dsack(sk);
3158
3159	tcp_identify_packet_loss(sk, ack_flag);
3160	if (!tcp_time_to_recover(sk, flag)) {
3161	tcp_try_to_open(sk, flag);
3162	return;
3163	}
3164
3165	/ MTU probe failure: don't reduce cwnd /
3166	if (icsk->icsk_ca_state < TCP_CA_CWR &&
3167	icsk->icsk_mtup.probe_size &&
3168	tp->snd_una == tp->mtu_probe.probe_seq_start) {
3169	tcp_mtup_probe_failed(sk);
3170	/ Restores the reduction we did in tcp_mtup_probe() /
3171	tcp_snd_cwnd_set(tp, val: tcp_snd_cwnd(tp) + `1`);
3172	tcp_simple_retransmit(sk);
3173	return;
3174	}
3175
3176	/ Otherwise enter Recovery state /
3177	tcp_enter_recovery(sk, ece_ack);
3178	fast_rexmit = `1`;
3179	}
3180
3181	if (!tcp_is_rack(sk) && do_lost)
3182	tcp_update_scoreboard(sk, fast_rexmit);
3183	*rexmit = REXMIT_LOST;
3184	}
3185
3186	static void tcp_update_rtt_min(struct sock sk, u32 rtt_us, const* int flag)
3187	{
3188	u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
3189	struct tcp_sock *tp = tcp_sk(sk);
3190
3191	if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
3192	/ If the remote keeps returning delayed ACKs, eventually*
3193	* the min filter would pick it up and overestimate the
3194	* prop. delay when it expires. Skip suspected delayed ACKs.
3195	*/
3196	return;
3197	}
3198	minmax_running_min(m: &tp->rtt_min, win: wlen, tcp_jiffies32,
3199	meas: rtt_us ? : jiffies_to_usecs(j: `1`));
3200	}
3201
3202	static bool tcp_ack_update_rtt(struct sock sk, const* int flag,
3203	long seq_rtt_us, long sack_rtt_us,
3204	long ca_rtt_us, struct rate_sample *rs)
3205	{
3206	const struct tcp_sock *tp = tcp_sk(sk);
3207
3208	/ Prefer RTT measured from ACK's timing to TS-ECR. This is because*
3209	* broken middle-boxes or peers may corrupt TS-ECR fields. But
3210	* Karn's algorithm forbids taking RTT if some retransmitted data
3211	* is acked (RFC6298).
3212	*/
3213	if (seq_rtt_us < `0`)
3214	seq_rtt_us = sack_rtt_us;
3215
3216	/ RTTM Rule: A TSecr value received in a segment is used to*
3217	* update the averaged RTT measurement only if the segment
3218	* acknowledges some new data, i.e., only if it advances the
3219	* left edge of the send window.
3220	* See draft-ietf-tcplw-high-performance-00, section 3.3.
3221	*/
3222	if (seq_rtt_us < `0` && tp->rx_opt.saw_tstamp &&
3223	tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
3224	seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, min_delta: `1`);
3225
3226	rs->rtt_us = ca_rtt_us; / RTT of last (S)ACKed packet (or -1) /
3227	if (seq_rtt_us < `0`)
3228	return false;
3229
3230	/ ca_rtt_us >= 0 is counting on the invariant that ca_rtt_us is*
3231	* always taken together with ACK, SACK, or TS-opts. Any negative
3232	* values will be skipped with the seq_rtt_us < 0 check above.
3233	*/
3234	tcp_update_rtt_min(sk, rtt_us: ca_rtt_us, flag);
3235	tcp_rtt_estimator(sk, mrtt_us: seq_rtt_us);
3236	tcp_set_rto(sk);
3237
3238	/ RFC6298: only reset backoff on valid RTT measurement. /
3239	inet_csk(sk)->icsk_backoff = `0`;
3240	return true;
3241	}
3242
3243	/ Compute time elapsed between (last) SYNACK and the ACK completing 3WHS. /
3244	void tcp_synack_rtt_meas(struct sock sk, struct* request_sock *req)
3245	{
3246	struct rate_sample rs;
3247	long rtt_us = -`1L`;
3248
3249	if (req && !req->num_retrans && tcp_rsk(req)->snt_synack)
3250	rtt_us = tcp_stamp_us_delta(t1: tcp_clock_us(), t0: tcp_rsk(req)->snt_synack);
3251
3252	tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us: rtt_us, sack_rtt_us: -`1L`, ca_rtt_us: rtt_us, rs: &rs);
3253	}
3254
3255
3256	static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
3257	{
3258	const struct inet_connection_sock *icsk = inet_csk(sk);
3259
3260	icsk->icsk_ca_ops->cong_avoid(sk, ack, acked);
3261	tcp_sk(sk)->snd_cwnd_stamp = tcp_jiffies32;
3262	}
3263
3264	/ Restart timer after forward progress on connection.*
3265	* RFC2988 recommends to restart timer to now+rto.
3266	*/
3267	void tcp_rearm_rto(struct sock *sk)
3268	{
3269	const struct inet_connection_sock *icsk = inet_csk(sk);
3270	struct tcp_sock *tp = tcp_sk(sk);
3271
3272	/ If the retrans timer is currently being used by Fast Open*
3273	* for SYN-ACK retrans purpose, stay put.
3274	*/
3275	if (rcu_access_pointer(tp->fastopen_rsk))
3276	return;
3277
3278	if (!tp->packets_out) {
3279	inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
3280	} else {
3281	u32 rto = inet_csk(sk)->icsk_rto;
3282	/ Offset the time elapsed after installing regular RTO /
3283	if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT \|\|
3284	icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
3285	s64 delta_us = tcp_rto_delta_us(sk);
3286	/ delta_us may not be positive if the socket is locked*
3287	* when the retrans timer fires and is rescheduled.
3288	*/
3289	rto = usecs_to_jiffies(max_t(int, delta_us, `1`));
3290	}
3291	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, when: rto, pace_delay: true);
3292	}
3293	}
3294
3295	/ Try to schedule a loss probe; if that doesn't work, then schedule an RTO. /
3296	static void tcp_set_xmit_timer(struct sock *sk)
3297	{
3298	if (!tcp_schedule_loss_probe(sk, advancing_rto: true))
3299	tcp_rearm_rto(sk);
3300	}
3301
3302	/ If we get here, the whole TSO packet has not been acked. /
3303	static u32 tcp_tso_acked(struct sock sk, struct* sk_buff *skb)
3304	{
3305	struct tcp_sock *tp = tcp_sk(sk);
3306	u32 packets_acked;
3307
3308	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
3309
3310	packets_acked = tcp_skb_pcount(skb);
3311	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
3312	return `0`;
3313	packets_acked -= tcp_skb_pcount(skb);
3314
3315	if (packets_acked) {
3316	BUG_ON(tcp_skb_pcount(skb) == `0`);
3317	BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
3318	}
3319
3320	return packets_acked;
3321	}
3322
3323	static void tcp_ack_tstamp(struct sock sk, struct* sk_buff *skb,
3324	const struct sk_buff *ack_skb, u32 prior_snd_una)
3325	{
3326	const struct skb_shared_info *shinfo;
3327
3328	/ Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags /
3329	if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
3330	return;
3331
3332	shinfo = skb_shinfo(skb);
3333	if (!before(seq1: shinfo->tskey, seq2: prior_snd_una) &&
3334	before(seq1: shinfo->tskey, tcp_sk(sk)->snd_una)) {
3335	tcp_skb_tsorted_save(skb) {
3336	__skb_tstamp_tx(orig_skb: skb, ack_skb, NULL, sk, tstype: SCM_TSTAMP_ACK);
3337	} tcp_skb_tsorted_restore(skb);
3338	}
3339	}
3340
3341	/ Remove acknowledged frames from the retransmission queue. If our packet*
3342	* is before the ack sequence we can discard it as it's confirmed to have
3343	* arrived at the other end.
3344	*/
3345	static int tcp_clean_rtx_queue(struct sock sk, const* struct sk_buff *ack_skb,
3346	u32 prior_fack, u32 prior_snd_una,
3347	struct tcp_sacktag_state *sack, bool ece_ack)
3348	{
3349	const struct inet_connection_sock *icsk = inet_csk(sk);
3350	u64 first_ackt, last_ackt;
3351	struct tcp_sock *tp = tcp_sk(sk);
3352	u32 prior_sacked = tp->sacked_out;
3353	u32 reord = tp->snd_nxt; / lowest acked un-retx un-sacked seq /
3354	struct sk_buff skb, next;
3355	bool fully_acked = true;
3356	long sack_rtt_us = -`1L`;
3357	long seq_rtt_us = -`1L`;
3358	long ca_rtt_us = -`1L`;
3359	u32 pkts_acked = `0`;
3360	bool rtt_update;
3361	int flag = `0`;
3362
3363	first_ackt = `0`;
3364
3365	for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) {
3366	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3367	const u32 start_seq = scb->seq;
3368	u8 sacked = scb->sacked;
3369	u32 acked_pcount;
3370
3371	/ Determine how many packets and what bytes were acked, tso and else /
3372	if (after(scb->end_seq, tp->snd_una)) {
3373	if (tcp_skb_pcount(skb) == `1` \|\|
3374	!after(tp->snd_una, scb->seq))
3375	break;
3376
3377	acked_pcount = tcp_tso_acked(sk, skb);
3378	if (!acked_pcount)
3379	break;
3380	fully_acked = false;
3381	} else {
3382	acked_pcount = tcp_skb_pcount(skb);
3383	}
3384
3385	if (unlikely(sacked & TCPCB_RETRANS)) {
3386	if (sacked & TCPCB_SACKED_RETRANS)
3387	tp->retrans_out -= acked_pcount;
3388	flag \|= FLAG_RETRANS_DATA_ACKED;
3389	} else if (!(sacked & TCPCB_SACKED_ACKED)) {
3390	last_ackt = tcp_skb_timestamp_us(skb);
3391	WARN_ON_ONCE(last_ackt == `0`);
3392	if (!first_ackt)
3393	first_ackt = last_ackt;
3394
3395	if (before(seq1: start_seq, seq2: reord))
3396	reord = start_seq;
3397	if (!after(scb->end_seq, tp->high_seq))
3398	flag \|= FLAG_ORIG_SACK_ACKED;
3399	}
3400
3401	if (sacked & TCPCB_SACKED_ACKED) {
3402	tp->sacked_out -= acked_pcount;
3403	} else if (tcp_is_sack(tp)) {
3404	tcp_count_delivered(tp, delivered: acked_pcount, ece_ack);
3405	if (!tcp_skb_spurious_retrans(tp, skb))
3406	tcp_rack_advance(tp, sacked, end_seq: scb->end_seq,
3407	xmit_time: tcp_skb_timestamp_us(skb));
3408	}
3409	if (sacked & TCPCB_LOST)
3410	tp->lost_out -= acked_pcount;
3411
3412	tp->packets_out -= acked_pcount;
3413	pkts_acked += acked_pcount;
3414	tcp_rate_skb_delivered(sk, skb, rs: sack->rate);
3415
3416	/ Initial outgoing SYN's get put onto the write_queue*
3417	* just like anything else we transmit. It is not
3418	* true data, and if we misinform our callers that
3419	* this ACK acks real data, we will erroneously exit
3420	* connection startup slow start one packet too
3421	* quickly. This is severely frowned upon behavior.
3422	*/
3423	if (likely(!(scb->tcp_flags & TCPHDR_SYN))) {
3424	flag \|= FLAG_DATA_ACKED;
3425	} else {
3426	flag \|= FLAG_SYN_ACKED;
3427	tp->retrans_stamp = `0`;
3428	}
3429
3430	if (!fully_acked)
3431	break;
3432
3433	tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
3434
3435	next = skb_rb_next(skb);
3436	if (unlikely(skb == tp->retransmit_skb_hint))
3437	tp->retransmit_skb_hint = NULL;
3438	if (unlikely(skb == tp->lost_skb_hint))
3439	tp->lost_skb_hint = NULL;
3440	tcp_highest_sack_replace(sk, old: skb, new: next);
3441	tcp_rtx_queue_unlink_and_free(skb, sk);
3442	}
3443
3444	if (!skb)
3445	tcp_chrono_stop(sk, type: TCP_CHRONO_BUSY);
3446
3447	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
3448	tp->snd_up = tp->snd_una;
3449
3450	if (skb) {
3451	tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
3452	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
3453	flag \|= FLAG_SACK_RENEGING;
3454	}
3455
3456	if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
3457	seq_rtt_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: first_ackt);
3458	ca_rtt_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: last_ackt);
3459
3460	if (pkts_acked == `1` && fully_acked && !prior_sacked &&
3461	(tp->snd_una - prior_snd_una) < tp->mss_cache &&
3462	sack->rate->prior_delivered + `1` == tp->delivered &&
3463	!(flag & (FLAG_CA_ALERT \| FLAG_SYN_ACKED))) {
3464	/ Conservatively mark a delayed ACK. It's typically*
3465	* from a lone runt packet over the round trip to
3466	* a receiver w/o out-of-order or CE events.
3467	*/
3468	flag \|= FLAG_ACK_MAYBE_DELAYED;
3469	}
3470	}
3471	if (sack->first_sackt) {
3472	sack_rtt_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: sack->first_sackt);
3473	ca_rtt_us = tcp_stamp_us_delta(t1: tp->tcp_mstamp, t0: sack->last_sackt);
3474	}
3475	rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us,
3476	ca_rtt_us, rs: sack->rate);
3477
3478	if (flag & FLAG_ACKED) {
3479	flag \|= FLAG_SET_XMIT_TIMER; / set TLP or RTO timer /
3480	if (unlikely(icsk->icsk_mtup.probe_size &&
3481	!after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3482	tcp_mtup_probe_success(sk);
3483	}
3484
3485	if (tcp_is_reno(tp)) {
3486	tcp_remove_reno_sacks(sk, acked: pkts_acked, ece_ack);
3487
3488	/ If any of the cumulatively ACKed segments was*
3489	* retransmitted, non-SACK case cannot confirm that
3490	* progress was due to original transmission due to
3491	* lack of TCPCB_SACKED_ACKED bits even if some of
3492	* the packets may have been never retransmitted.
3493	*/
3494	if (flag & FLAG_RETRANS_DATA_ACKED)
3495	flag &= ~FLAG_ORIG_SACK_ACKED;
3496	} else {
3497	int delta;
3498
3499	/ Non-retransmitted hole got filled? That's reordering /
3500	if (before(seq1: reord, seq2: prior_fack))
3501	tcp_check_sack_reordering(sk, low_seq: reord, ts: `0`);
3502
3503	delta = prior_sacked - tp->sacked_out;
3504	tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3505	}
3506	} else if (skb && rtt_update && sack_rtt_us >= `0` &&
3507	sack_rtt_us > tcp_stamp_us_delta(t1: tp->tcp_mstamp,
3508	t0: tcp_skb_timestamp_us(skb))) {
3509	/ Do not re-arm RTO if the sack RTT is measured from data sent*
3510	* after when the head was last (re)transmitted. Otherwise the
3511	* timeout may continue to extend in loss recovery.
3512	*/
3513	flag \|= FLAG_SET_XMIT_TIMER; / set TLP or RTO timer /
3514	}
3515
3516	if (icsk->icsk_ca_ops->pkts_acked) {
3517	struct ack_sample sample = { .pkts_acked = pkts_acked,
3518	.rtt_us = sack->rate->rtt_us };
3519
3520	sample.in_flight = tp->mss_cache *
3521	(tp->delivered - sack->rate->prior_delivered);
3522	icsk->icsk_ca_ops->pkts_acked(sk, &sample);
3523	}
3524
3525	#if FASTRETRANS_DEBUG > 0
3526	WARN_ON((int)tp->sacked_out < `0`);
3527	WARN_ON((int)tp->lost_out < `0`);
3528	WARN_ON((int)tp->retrans_out < `0`);
3529	if (!tp->packets_out && tcp_is_sack(tp)) {
3530	icsk = inet_csk(sk);
3531	if (tp->lost_out) {
3532	pr_debug("Leak l=%u %d\n",
3533	tp->lost_out, icsk->icsk_ca_state);
3534	tp->lost_out = `0`;
3535	}
3536	if (tp->sacked_out) {
3537	pr_debug("Leak s=%u %d\n",
3538	tp->sacked_out, icsk->icsk_ca_state);
3539	tp->sacked_out = `0`;
3540	}
3541	if (tp->retrans_out) {
3542	pr_debug("Leak r=%u %d\n",
3543	tp->retrans_out, icsk->icsk_ca_state);
3544	tp->retrans_out = `0`;
3545	}
3546	}
3547	#endif
3548	return flag;
3549	}
3550
3551	static void tcp_ack_probe(struct sock *sk)
3552	{
3553	struct inet_connection_sock *icsk = inet_csk(sk);
3554	struct sk_buff *head = tcp_send_head(sk);
3555	const struct tcp_sock *tp = tcp_sk(sk);
3556
3557	/ Was it a usable window open? /
3558	if (!head)
3559	return;
3560	if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
3561	icsk->icsk_backoff = `0`;
3562	icsk->icsk_probes_tstamp = `0`;
3563	inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
3564	/ Socket must be waked up by subsequent tcp_data_snd_check().*
3565	* This function is not for random using!
3566	*/
3567	} else {
3568	unsigned long when = tcp_probe0_when(sk, max_when: tcp_rto_max(sk));
3569
3570	when = tcp_clamp_probe0_to_user_timeout(sk, when);
3571	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, pace_delay: true);
3572	}
3573	}
3574
3575	static inline bool tcp_ack_is_dubious(const struct sock sk, const* int flag)
3576	{
3577	return !(flag & FLAG_NOT_DUP) \|\| (flag & FLAG_CA_ALERT) \|\|
3578	inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
3579	}
3580
3581	/ Decide wheather to run the increase function of congestion control. /
3582	static inline bool tcp_may_raise_cwnd(const struct sock sk, const* int flag)
3583	{
3584	/ If reordering is high then always grow cwnd whenever data is*
3585	* delivered regardless of its ordering. Otherwise stay conservative
3586	* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
3587	* new SACK or ECE mark may first advance cwnd here and later reduce
3588	* cwnd in tcp_fastretrans_alert() based on more states.
3589	*/
3590	if (tcp_sk(sk)->reordering >
3591	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
3592	return flag & FLAG_FORWARD_PROGRESS;
3593
3594	return flag & FLAG_DATA_ACKED;
3595	}
3596
3597	/ The "ultimate" congestion control function that aims to replace the rigid*
3598	* cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
3599	* It's called toward the end of processing an ACK with precise rate
3600	* information. All transmission or retransmission are delayed afterwards.
3601	*/
3602	static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
3603	int flag, const struct rate_sample *rs)
3604	{
3605	const struct inet_connection_sock *icsk = inet_csk(sk);
3606
3607	if (icsk->icsk_ca_ops->cong_control) {
3608	icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs);
3609	return;
3610	}
3611
3612	if (tcp_in_cwnd_reduction(sk)) {
3613	/ Reduce cwnd if state mandates /
3614	tcp_cwnd_reduction(sk, newly_acked_sacked: acked_sacked, newly_lost: rs->losses, flag);
3615	} else if (tcp_may_raise_cwnd(sk, flag)) {
3616	/ Advance cwnd if state allows /
3617	tcp_cong_avoid(sk, ack, acked: acked_sacked);
3618	}
3619	tcp_update_pacing_rate(sk);
3620	}
3621
3622	/ Check that window update is acceptable.*
3623	* The function assumes that snd_una<=ack<=snd_next.
3624	*/
3625	static inline bool tcp_may_update_window(const struct tcp_sock *tp,
3626	const u32 ack, const u32 ack_seq,
3627	const u32 nwin)
3628	{
3629	return after(ack, tp->snd_una) \|\|
3630	after(ack_seq, tp->snd_wl1) \|\|
3631	(ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd \|\| !nwin));
3632	}
3633
3634	static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
3635	{
3636	#ifdef CONFIG_TCP_AO
3637	struct tcp_ao_info *ao;
3638
3639	if (!static_branch_unlikely(&tcp_ao_needed.key))
3640	return;
3641
3642	ao = rcu_dereference_protected(tp->ao_info,
3643	lockdep_sock_is_held((struct sock *)tp));
3644	if (ao && ack < tp->snd_una) {
3645	ao->snd_sne++;
3646	trace_tcp_ao_snd_sne_update(sk: (struct sock *)tp, new_sne: ao->snd_sne);
3647	}
3648	#endif
3649	}
3650
3651	/ If we update tp->snd_una, also update tp->bytes_acked /
3652	static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
3653	{
3654	u32 delta = ack - tp->snd_una;
3655
3656	sock_owned_by_me(sk: (struct sock *)tp);
3657	tp->bytes_acked += delta;
3658	tcp_snd_sne_update(tp, ack);
3659	tp->snd_una = ack;
3660	}
3661
3662	static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
3663	{
3664	#ifdef CONFIG_TCP_AO
3665	struct tcp_ao_info *ao;
3666
3667	if (!static_branch_unlikely(&tcp_ao_needed.key))
3668	return;
3669
3670	ao = rcu_dereference_protected(tp->ao_info,
3671	lockdep_sock_is_held((struct sock *)tp));
3672	if (ao && seq < tp->rcv_nxt) {
3673	ao->rcv_sne++;
3674	trace_tcp_ao_rcv_sne_update(sk: (struct sock *)tp, new_sne: ao->rcv_sne);
3675	}
3676	#endif
3677	}
3678
3679	/ If we update tp->rcv_nxt, also update tp->bytes_received /
3680	static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
3681	{
3682	u32 delta = seq - tp->rcv_nxt;
3683
3684	sock_owned_by_me(sk: (struct sock *)tp);
3685	tp->bytes_received += delta;
3686	tcp_rcv_sne_update(tp, seq);
3687	WRITE_ONCE(tp->rcv_nxt, seq);
3688	}
3689
3690	/ Update our send window.*
3691	*
3692	* Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
3693	* and in FreeBSD. NetBSD's one is even worse.) is wrong.
3694	*/
3695	static int tcp_ack_update_window(struct sock sk, const* struct sk_buff *skb, u32 ack,
3696	u32 ack_seq)
3697	{
3698	struct tcp_sock *tp = tcp_sk(sk);
3699	int flag = `0`;
3700	u32 nwin = ntohs(tcp_hdr(skb)->window);
3701
3702	if (likely(!tcp_hdr(skb)->syn))
3703	nwin <<= tp->rx_opt.snd_wscale;
3704
3705	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
3706	flag \|= FLAG_WIN_UPDATE;
3707	tcp_update_wl(tp, seq: ack_seq);
3708
3709	if (tp->snd_wnd != nwin) {
3710	tp->snd_wnd = nwin;
3711
3712	/ Note, it is the only place, where*
3713	* fast path is recovered for sending TCP.
3714	*/
3715	tp->pred_flags = `0`;
3716	tcp_fast_path_check(sk);
3717
3718	if (!tcp_write_queue_empty(sk))
3719	tcp_slow_start_after_idle_check(sk);
3720
3721	if (nwin > tp->max_window) {
3722	tp->max_window = nwin;
3723	tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
3724	}
3725	}
3726	}
3727
3728	tcp_snd_una_update(tp, ack);
3729
3730	return flag;
3731	}
3732
3733	static bool __tcp_oow_rate_limited(struct net net, int* mib_idx,
3734	u32 *last_oow_ack_time)
3735	{
3736	/ Paired with the WRITE_ONCE() in this function. /
3737	u32 val = READ_ONCE(*last_oow_ack_time);
3738
3739	if (val) {
3740	s32 elapsed = (s32)(tcp_jiffies32 - val);
3741
3742	if (`0` <= elapsed &&
3743	elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
3744	NET_INC_STATS(net, mib_idx);
3745	return true; / rate-limited: don't send yet! /
3746	}
3747	}
3748
3749	/ Paired with the prior READ_ONCE() and with itself,*
3750	* as we might be lockless.
3751	*/
3752	WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
3753
3754	return false; / not rate-limited: go ahead, send dupack now! /
3755	}
3756
3757	/ Return true if we're currently rate-limiting out-of-window ACKs and*
3758	* thus shouldn't send a dupack right now. We rate-limit dupacks in
3759	* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
3760	* attacks that send repeated SYNs or ACKs for the same connection. To
3761	* do this, we do not send a duplicate SYNACK or ACK if the remote
3762	* endpoint is sending out-of-window SYNs or pure ACKs at a high rate.
3763	*/
3764	bool tcp_oow_rate_limited(struct net net, const* struct sk_buff *skb,
3765	int mib_idx, u32 *last_oow_ack_time)
3766	{
3767	/ Data packets without SYNs are not likely part of an ACK loop. /
3768	if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
3769	!tcp_hdr(skb)->syn)
3770	return false;
3771
3772	return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
3773	}
3774
3775	/ RFC 5961 7 [ACK Throttling] /
3776	static void tcp_send_challenge_ack(struct sock *sk)
3777	{
3778	struct tcp_sock *tp = tcp_sk(sk);
3779	struct net *net = sock_net(sk);
3780	u32 count, now, ack_limit;
3781
3782	/ First check our per-socket dupack rate limit. /
3783	if (__tcp_oow_rate_limited(net,
3784	mib_idx: LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
3785	last_oow_ack_time: &tp->last_oow_ack_time))
3786	return;
3787
3788	ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
3789	if (ack_limit == INT_MAX)
3790	goto send_ack;
3791
3792	/ Then check host-wide RFC 5961 rate limit. /
3793	now = jiffies / HZ;
3794	if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
3795	u32 half = (ack_limit + `1`) >> `1`;
3796
3797	WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
3798	WRITE_ONCE(net->ipv4.tcp_challenge_count,
3799	get_random_u32_inclusive(half, ack_limit + half - `1`));
3800	}
3801	count = READ_ONCE(net->ipv4.tcp_challenge_count);
3802	if (count > `0`) {
3803	WRITE_ONCE(net->ipv4.tcp_challenge_count, count - `1`);
3804	send_ack:
3805	NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
3806	tcp_send_ack(sk);
3807	}
3808	}
3809
3810	static void tcp_store_ts_recent(struct tcp_sock *tp)
3811	{
3812	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
3813	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
3814	}
3815
3816	static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta)
3817	{
3818	tcp_store_ts_recent(tp);
3819	return tstamp_delta > `0` ? FLAG_TS_PROGRESS : `0`;
3820	}
3821
3822	static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3823	{
3824	s32 delta;
3825
3826	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
3827	/ PAWS bug workaround wrt. ACK frames, the PAWS discard*
3828	* extra check below makes sure this can only happen
3829	* for pure ACK frames. -DaveM
3830	*
3831	* Not only, also it occurs for expired timestamps.
3832	*/
3833
3834	if (tcp_paws_check(rx_opt: &tp->rx_opt, paws_win: `0`)) {
3835	delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent;
3836	return __tcp_replace_ts_recent(tp, tstamp_delta: delta);
3837	}
3838	}
3839
3840	return `0`;
3841	}
3842
3843	/ This routine deals with acks during a TLP episode and ends an episode by*
3844	* resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
3845	*/
3846	static void tcp_process_tlp_ack(struct sock sk, u32 ack, int* flag)
3847	{
3848	struct tcp_sock *tp = tcp_sk(sk);
3849
3850	if (before(seq1: ack, seq2: tp->tlp_high_seq))
3851	return;
3852
3853	if (!tp->tlp_retrans) {
3854	/ TLP of new data has been acknowledged /
3855	tp->tlp_high_seq = `0`;
3856	} else if (flag & FLAG_DSACK_TLP) {
3857	/ This DSACK means original and TLP probe arrived; no loss /
3858	tp->tlp_high_seq = `0`;
3859	} else if (after(ack, tp->tlp_high_seq)) {
3860	/ ACK advances: there was a loss, so reduce cwnd. Reset*
3861	* tlp_high_seq in tcp_init_cwnd_reduction()
3862	*/
3863	tcp_init_cwnd_reduction(sk);
3864	tcp_set_ca_state(sk, ca_state: TCP_CA_CWR);
3865	tcp_end_cwnd_reduction(sk);
3866	tcp_try_keep_open(sk);
3867	NET_INC_STATS(sock_net(sk),
3868	LINUX_MIB_TCPLOSSPROBERECOVERY);
3869	} else if (!(flag & (FLAG_SND_UNA_ADVANCED \|
3870	FLAG_NOT_DUP \| FLAG_DATA_SACKED))) {
3871	/ Pure dupack: original and TLP probe arrived; no loss /
3872	tp->tlp_high_seq = `0`;
3873	}
3874	}
3875
3876	static void tcp_in_ack_event(struct sock sk, int* flag)
3877	{
3878	const struct inet_connection_sock *icsk = inet_csk(sk);
3879
3880	if (icsk->icsk_ca_ops->in_ack_event) {
3881	u32 ack_ev_flags = `0`;
3882
3883	if (flag & FLAG_WIN_UPDATE)
3884	ack_ev_flags \|= CA_ACK_WIN_UPDATE;
3885	if (flag & FLAG_SLOWPATH) {
3886	ack_ev_flags \|= CA_ACK_SLOWPATH;
3887	if (flag & FLAG_ECE)
3888	ack_ev_flags \|= CA_ACK_ECE;
3889	}
3890
3891	icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags);
3892	}
3893	}
3894
3895	/ Congestion control has updated the cwnd already. So if we're in*
3896	* loss recovery then now we do any new sends (for FRTO) or
3897	* retransmits (for CA_Loss or CA_recovery) that make sense.
3898	*/
3899	static void tcp_xmit_recovery(struct sock sk, int* rexmit)
3900	{
3901	struct tcp_sock *tp = tcp_sk(sk);
3902
3903	if (rexmit == REXMIT_NONE \|\| sk->sk_state == TCP_SYN_SENT)
3904	return;
3905
3906	if (unlikely(rexmit == REXMIT_NEW)) {
3907	__tcp_push_pending_frames(sk, cur_mss: tcp_current_mss(sk),
3908	TCP_NAGLE_OFF);
3909	if (after(tp->snd_nxt, tp->high_seq))
3910	return;
3911	tp->frto = `0`;
3912	}
3913	tcp_xmit_retransmit_queue(sk);
3914	}
3915
3916	/ Returns the number of packets newly acked or sacked by the current ACK /
3917	static u32 tcp_newly_delivered(struct sock sk, u32 prior_delivered, int* flag)
3918	{
3919	const struct net *net = sock_net(sk);
3920	struct tcp_sock *tp = tcp_sk(sk);
3921	u32 delivered;
3922
3923	delivered = tp->delivered - prior_delivered;
3924	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
3925	if (flag & FLAG_ECE)
3926	NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
3927
3928	return delivered;
3929	}
3930
3931	/ This routine deals with incoming acks, but not outgoing ones. /
3932	static int tcp_ack(struct sock sk, const* struct sk_buff skb, int* flag)
3933	{
3934	struct inet_connection_sock *icsk = inet_csk(sk);
3935	struct tcp_sock *tp = tcp_sk(sk);
3936	struct tcp_sacktag_state sack_state;
3937	struct rate_sample rs = { .prior_delivered = `0` };
3938	u32 prior_snd_una = tp->snd_una;
3939	bool is_sack_reneg = tp->is_sack_reneg;
3940	u32 ack_seq = TCP_SKB_CB(skb)->seq;
3941	u32 ack = TCP_SKB_CB(skb)->ack_seq;
3942	int num_dupack = `0`;
3943	int prior_packets = tp->packets_out;
3944	u32 delivered = tp->delivered;
3945	u32 lost = tp->lost;
3946	int rexmit = REXMIT_NONE; / Flag to (re)transmit to recover losses /
3947	u32 prior_fack;
3948
3949	sack_state.first_sackt = `0`;
3950	sack_state.rate = &rs;
3951	sack_state.sack_delivered = `0`;
3952
3953	/ We very likely will need to access rtx queue. /
3954	prefetch(sk->tcp_rtx_queue.rb_node);
3955
3956	/ If the ack is older than previous acks*
3957	* then we can probably ignore it.
3958	*/
3959	if (before(seq1: ack, seq2: prior_snd_una)) {
3960	u32 max_window;
3961
3962	/ do not accept ACK for bytes we never sent. /
3963	max_window = min_t(u64, tp->max_window, tp->bytes_acked);
3964	/ RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] /
3965	if (before(seq1: ack, seq2: prior_snd_una - max_window)) {
3966	if (!(flag & FLAG_NO_CHALLENGE_ACK))
3967	tcp_send_challenge_ack(sk);
3968	return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
3969	}
3970	goto old_ack;
3971	}
3972
3973	/ If the ack includes data we haven't sent yet, discard*
3974	* this segment (RFC793 Section 3.9).
3975	*/
3976	if (after(ack, tp->snd_nxt))
3977	return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
3978
3979	if (after(ack, prior_snd_una)) {
3980	flag \|= FLAG_SND_UNA_ADVANCED;
3981	icsk->icsk_retransmits = `0`;
3982
3983	#if IS_ENABLED(CONFIG_TLS_DEVICE)
3984	if (static_branch_unlikely(&clean_acked_data_enabled.key))
3985	if (tp->tcp_clean_acked)
3986	tp->tcp_clean_acked(sk, ack);
3987	#endif
3988	}
3989
3990	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
3991	rs.prior_in_flight = tcp_packets_in_flight(tp);
3992
3993	/ ts_recent update must be made after we are sure that the packet*
3994	* is in window.
3995	*/
3996	if (flag & FLAG_UPDATE_TS_RECENT)
3997	flag \|= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
3998
3999	if ((flag & (FLAG_SLOWPATH \| FLAG_SND_UNA_ADVANCED)) ==
4000	FLAG_SND_UNA_ADVANCED) {
4001	/ Window is constant, pure forward advance.*
4002	* No more checks are required.
4003	* Note, we use the fact that SND.UNA>=SND.WL2.
4004	*/
4005	tcp_update_wl(tp, seq: ack_seq);
4006	tcp_snd_una_update(tp, ack);
4007	flag \|= FLAG_WIN_UPDATE;
4008
4009	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
4010	} else {
4011	if (ack_seq != TCP_SKB_CB(skb)->end_seq)
4012	flag \|= FLAG_DATA;
4013	else
4014	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
4015
4016	flag \|= tcp_ack_update_window(sk, skb, ack, ack_seq);
4017
4018	if (TCP_SKB_CB(skb)->sacked)
4019	flag \|= tcp_sacktag_write_queue(sk, ack_skb: skb, prior_snd_una,
4020	state: &sack_state);
4021
4022	if (tcp_ecn_rcv_ecn_echo(tp, th: tcp_hdr(skb)))
4023	flag \|= FLAG_ECE;
4024
4025	if (sack_state.sack_delivered)
4026	tcp_count_delivered(tp, delivered: sack_state.sack_delivered,
4027	ece_ack: flag & FLAG_ECE);
4028	}
4029
4030	/ This is a deviation from RFC3168 since it states that:*
4031	* "When the TCP data sender is ready to set the CWR bit after reducing
4032	* the congestion window, it SHOULD set the CWR bit only on the first
4033	* new data packet that it transmits."
4034	* We accept CWR on pure ACKs to be more robust
4035	* with widely-deployed TCP implementations that do this.
4036	*/
4037	tcp_ecn_accept_cwr(sk, skb);
4038
4039	/ We passed data and got it acked, remove any soft error*
4040	* log. Something worked...
4041	*/
4042	WRITE_ONCE(sk->sk_err_soft, `0`);
4043	icsk->icsk_probes_out = `0`;
4044	tp->rcv_tstamp = tcp_jiffies32;
4045	if (!prior_packets)
4046	goto no_queue;
4047
4048	/ See if we can take anything off of the retransmit queue. /
4049	flag \|= tcp_clean_rtx_queue(sk, ack_skb: skb, prior_fack, prior_snd_una,
4050	sack: &sack_state, ece_ack: flag & FLAG_ECE);
4051
4052	tcp_rack_update_reo_wnd(sk, rs: &rs);
4053
4054	tcp_in_ack_event(sk, flag);
4055
4056	if (tp->tlp_high_seq)
4057	tcp_process_tlp_ack(sk, ack, flag);
4058
4059	if (tcp_ack_is_dubious(sk, flag)) {
4060	if (!(flag & (FLAG_SND_UNA_ADVANCED \|
4061	FLAG_NOT_DUP \| FLAG_DSACKING_ACK))) {
4062	num_dupack = `1`;
4063	/ Consider if pure acks were aggregated in tcp_add_backlog() /
4064	if (!(flag & FLAG_DATA))
4065	num_dupack = max_t(u16, `1`, skb_shinfo(skb)->gso_segs);
4066	}
4067	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, ack_flag: &flag,
4068	rexmit: &rexmit);
4069	}
4070
4071	/ If needed, reset TLP/RTO timer when RACK doesn't set. /
4072	if (flag & FLAG_SET_XMIT_TIMER)
4073	tcp_set_xmit_timer(sk);
4074
4075	if ((flag & FLAG_FORWARD_PROGRESS) \|\| !(flag & FLAG_NOT_DUP))
4076	sk_dst_confirm(sk);
4077
4078	delivered = tcp_newly_delivered(sk, prior_delivered: delivered, flag);
4079	lost = tp->lost - lost; / freshly marked lost /
4080	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
4081	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, rs: sack_state.rate);
4082	tcp_cong_control(sk, ack, acked_sacked: delivered, flag, rs: sack_state.rate);
4083	tcp_xmit_recovery(sk, rexmit);
4084	return `1`;
4085
4086	no_queue:
4087	tcp_in_ack_event(sk, flag);
4088	/ If data was DSACKed, see if we can undo a cwnd reduction. /
4089	if (flag & FLAG_DSACKING_ACK) {
4090	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, ack_flag: &flag,
4091	rexmit: &rexmit);
4092	tcp_newly_delivered(sk, prior_delivered: delivered, flag);
4093	}
4094	/ If this ack opens up a zero window, clear backoff. It was*
4095	* being used to time the probes, and is probably far higher than
4096	* it needs to be for normal retransmission.
4097	*/
4098	tcp_ack_probe(sk);
4099
4100	if (tp->tlp_high_seq)
4101	tcp_process_tlp_ack(sk, ack, flag);
4102	return `1`;
4103
4104	old_ack:
4105	/ If data was SACKed, tag it and see if we should send more data.*
4106	* If data was DSACKed, see if we can undo a cwnd reduction.
4107	*/
4108	if (TCP_SKB_CB(skb)->sacked) {
4109	flag \|= tcp_sacktag_write_queue(sk, ack_skb: skb, prior_snd_una,
4110	state: &sack_state);
4111	tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, ack_flag: &flag,
4112	rexmit: &rexmit);
4113	tcp_newly_delivered(sk, prior_delivered: delivered, flag);
4114	tcp_xmit_recovery(sk, rexmit);
4115	}
4116
4117	return `0`;
4118	}
4119
4120	static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
4121	bool syn, struct tcp_fastopen_cookie *foc,
4122	bool exp_opt)
4123	{
4124	/ Valid only in SYN or SYN-ACK with an even length. /
4125	if (!foc \|\| !syn \|\| len < `0` \|\| (len & `1`))
4126	return;
4127
4128	if (len >= TCP_FASTOPEN_COOKIE_MIN &&
4129	len <= TCP_FASTOPEN_COOKIE_MAX)
4130	memcpy(foc->val, cookie, len);
4131	else if (len != `0`)
4132	len = -`1`;
4133	foc->len = len;
4134	foc->exp = exp_opt;
4135	}
4136
4137	static bool smc_parse_options(const struct tcphdr *th,
4138	struct tcp_options_received *opt_rx,
4139	const unsigned char *ptr,
4140	int opsize)
4141	{
4142	#if IS_ENABLED(CONFIG_SMC)
4143	if (static_branch_unlikely(&tcp_have_smc)) {
4144	if (th->syn && !(opsize & `1`) &&
4145	opsize >= TCPOLEN_EXP_SMC_BASE &&
4146	get_unaligned_be32(p: ptr) == TCPOPT_SMC_MAGIC) {
4147	opt_rx->smc_ok = `1`;
4148	return true;
4149	}
4150	}
4151	#endif
4152	return false;
4153	}
4154
4155	/ Try to parse the MSS option from the TCP header. Return 0 on failure, clamped*
4156	* value on success.
4157	*/
4158	u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
4159	{
4160	const unsigned char ptr = (const* unsigned char *)(th + `1`);
4161	int length = (th->doff * `4`) - sizeof(struct tcphdr);
4162	u16 mss = `0`;
4163
4164	while (length > `0`) {
4165	int opcode = *ptr++;
4166	int opsize;
4167
4168	switch (opcode) {
4169	case TCPOPT_EOL:
4170	return mss;
4171	case TCPOPT_NOP: / Ref: RFC 793 section 3.1 /
4172	length--;
4173	continue;
4174	default:
4175	if (length < `2`)
4176	return mss;
4177	opsize = *ptr++;
4178	if (opsize < `2`) / "silly options" /
4179	return mss;
4180	if (opsize > length)
4181	return mss; / fail on partial options /
4182	if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
4183	u16 in_mss = get_unaligned_be16(p: ptr);
4184
4185	if (in_mss) {
4186	if (user_mss && user_mss < in_mss)
4187	in_mss = user_mss;
4188	mss = in_mss;
4189	}
4190	}
4191	ptr += opsize - `2`;
4192	length -= opsize;
4193	}
4194	}
4195	return mss;
4196	}
4197
4198	/ Look for tcp options. Normally only called on SYN and SYNACK packets.*
4199	* But, this can also be called on packets in the established flow when
4200	* the fast version below fails.
4201	*/
4202	void tcp_parse_options(const struct net *net,
4203	const struct sk_buff *skb,
4204	struct tcp_options_received opt_rx, int* estab,
4205	struct tcp_fastopen_cookie *foc)
4206	{
4207	const unsigned char *ptr;
4208	const struct tcphdr *th = tcp_hdr(skb);
4209	int length = (th->doff * `4`) - sizeof(struct tcphdr);
4210
4211	ptr = (const unsigned char *)(th + `1`);
4212	opt_rx->saw_tstamp = `0`;
4213	opt_rx->saw_unknown = `0`;
4214
4215	while (length > `0`) {
4216	int opcode = *ptr++;
4217	int opsize;
4218
4219	switch (opcode) {
4220	case TCPOPT_EOL:
4221	return;
4222	case TCPOPT_NOP: / Ref: RFC 793 section 3.1 /
4223	length--;
4224	continue;
4225	default:
4226	if (length < `2`)
4227	return;
4228	opsize = *ptr++;
4229	if (opsize < `2`) / "silly options" /
4230	return;
4231	if (opsize > length)
4232	return; / don't parse partial options /
4233	switch (opcode) {
4234	case TCPOPT_MSS:
4235	if (opsize == TCPOLEN_MSS && th->syn && !estab) {
4236	u16 in_mss = get_unaligned_be16(p: ptr);
4237	if (in_mss) {
4238	if (opt_rx->user_mss &&
4239	opt_rx->user_mss < in_mss)
4240	in_mss = opt_rx->user_mss;
4241	opt_rx->mss_clamp = in_mss;
4242	}
4243	}
4244	break;
4245	case TCPOPT_WINDOW:
4246	if (opsize == TCPOLEN_WINDOW && th->syn &&
4247	!estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
4248	__u8 snd_wscale = (__u8 )ptr;
4249	opt_rx->wscale_ok = `1`;
4250	if (snd_wscale > TCP_MAX_WSCALE) {
4251	net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n",
4252	__func__,
4253	snd_wscale,
4254	TCP_MAX_WSCALE);
4255	snd_wscale = TCP_MAX_WSCALE;
4256	}
4257	opt_rx->snd_wscale = snd_wscale;
4258	}
4259	break;
4260	case TCPOPT_TIMESTAMP:
4261	if ((opsize == TCPOLEN_TIMESTAMP) &&
4262	((estab && opt_rx->tstamp_ok) \|\|
4263	(!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
4264	opt_rx->saw_tstamp = `1`;
4265	opt_rx->rcv_tsval = get_unaligned_be32(p: ptr);
4266	opt_rx->rcv_tsecr = get_unaligned_be32(p: ptr + `4`);
4267	}
4268	break;
4269	case TCPOPT_SACK_PERM:
4270	if (opsize == TCPOLEN_SACK_PERM && th->syn &&
4271	!estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
4272	opt_rx->sack_ok = TCP_SACK_SEEN;
4273	tcp_sack_reset(rx_opt: opt_rx);
4274	}
4275	break;
4276
4277	case TCPOPT_SACK:
4278	if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
4279	!((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
4280	opt_rx->sack_ok) {
4281	TCP_SKB_CB(skb)->sacked = (ptr - `2`) - (unsigned char *)th;
4282	}
4283	break;
4284	#ifdef CONFIG_TCP_MD5SIG
4285	case TCPOPT_MD5SIG:
4286	/ The MD5 Hash has already been*
4287	* checked (see tcp_v{4,6}_rcv()).
4288	*/
4289	break;
4290	#endif
4291	#ifdef CONFIG_TCP_AO
4292	case TCPOPT_AO:
4293	/ TCP AO has already been checked*
4294	* (see tcp_inbound_ao_hash()).
4295	*/
4296	break;
4297	#endif
4298	case TCPOPT_FASTOPEN:
4299	tcp_parse_fastopen_option(
4300	len: opsize - TCPOLEN_FASTOPEN_BASE,
4301	cookie: ptr, syn: th->syn, foc, exp_opt: false);
4302	break;
4303
4304	case TCPOPT_EXP:
4305	/ Fast Open option shares code 254 using a*
4306	* 16 bits magic number.
4307	*/
4308	if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
4309	get_unaligned_be16(p: ptr) ==
4310	TCPOPT_FASTOPEN_MAGIC) {
4311	tcp_parse_fastopen_option(len: opsize -
4312	TCPOLEN_EXP_FASTOPEN_BASE,
4313	cookie: ptr + `2`, syn: th->syn, foc, exp_opt: true);
4314	break;
4315	}
4316
4317	if (smc_parse_options(th, opt_rx, ptr, opsize))
4318	break;
4319
4320	opt_rx->saw_unknown = `1`;
4321	break;
4322
4323	default:
4324	opt_rx->saw_unknown = `1`;
4325	}
4326	ptr += opsize-`2`;
4327	length -= opsize;
4328	}
4329	}
4330	}
4331	EXPORT_SYMBOL(tcp_parse_options);
4332
4333	static bool tcp_parse_aligned_timestamp(struct tcp_sock tp, const* struct tcphdr *th)
4334	{
4335	const __be32 ptr = (const* __be32 *)(th + `1`);
4336
4337	if (*ptr == htonl((TCPOPT_NOP << `24`) \| (TCPOPT_NOP << `16`)
4338	\| (TCPOPT_TIMESTAMP << `8`) \| TCPOLEN_TIMESTAMP)) {
4339	tp->rx_opt.saw_tstamp = `1`;
4340	++ptr;
4341	tp->rx_opt.rcv_tsval = ntohl(*ptr);
4342	++ptr;
4343	if (*ptr)
4344	tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset;
4345	else
4346	tp->rx_opt.rcv_tsecr = `0`;
4347	return true;
4348	}
4349	return false;
4350	}
4351
4352	/ Fast parse options. This hopes to only see timestamps.*
4353	* If it is wrong it falls back on tcp_parse_options().
4354	*/
4355	static bool tcp_fast_parse_options(const struct net *net,
4356	const struct sk_buff *skb,
4357	const struct tcphdr th, struct* tcp_sock *tp)
4358	{
4359	/ In the spirit of fast parsing, compare doff directly to constant*
4360	* values. Because equality is used, short doff can be ignored here.
4361	*/
4362	if (th->doff == (sizeof(*th) / `4`)) {
4363	tp->rx_opt.saw_tstamp = `0`;
4364	return false;
4365	} else if (tp->rx_opt.tstamp_ok &&
4366	th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / `4`)) {
4367	if (tcp_parse_aligned_timestamp(tp, th))
4368	return true;
4369	}
4370
4371	tcp_parse_options(net, skb, &tp->rx_opt, `1`, NULL);
4372	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
4373	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
4374
4375	return true;
4376	}
4377
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/*
 * Parse Signature options
 *
 * @th:       TCP header whose options are scanned.
 * @md5_hash: output; set to the MD5 signature payload, or NULL if absent.
 * @ao_hash:  output; set to the TCP-AO option payload, or NULL if absent.
 *
 * Return: 0 on success (including when neither option is present),
 * -EINVAL for a malformed option length, or -EEXIST when a second
 * MD5/AO option follows one that was already recorded.
 */
int tcp_do_parse_auth_options(const struct tcphdr *th,
			      const u8 **md5_hash, const u8 **ao_hash)
{
	int length = (th->doff << 2) - sizeof(*th);
	const u8 *ptr = (const u8 *)(th + 1);
	unsigned int minlen = TCPOLEN_MD5SIG;

	if (IS_ENABLED(CONFIG_TCP_AO))
		minlen = sizeof(struct tcp_ao_hdr) + 1;

	*md5_hash = NULL;
	*ao_hash = NULL;

	/* If not enough data remaining, we can short cut */
	while (length >= minlen) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return 0;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2 || opsize > length)
				return -EINVAL;
			if (opcode == TCPOPT_MD5SIG) {
				if (opsize != TCPOLEN_MD5SIG)
					return -EINVAL;
				/* Both outputs started as NULL, so a non-NULL
				 * value means an earlier option set it.
				 */
				if (unlikely(*md5_hash || *ao_hash))
					return -EEXIST;
				*md5_hash = ptr;
			} else if (opcode == TCPOPT_AO) {
				if (opsize <= sizeof(struct tcp_ao_hdr))
					return -EINVAL;
				if (unlikely(*md5_hash || *ao_hash))
					return -EEXIST;
				*ao_hash = ptr;
			}
		}
		ptr += opsize - 2;
		length -= opsize;
	}
	return 0;
}
EXPORT_SYMBOL(tcp_do_parse_auth_options);
#endif
4431
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
 *
 * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
 * it can pass through stack. So, the following predicate verifies that
 * this segment is not used for anything but congestion avoidance or
 * fast retransmit. Moreover, we even are able to eliminate most of such
 * second order effects, if we apply some small "replay" window (~RTO)
 * to timestamp space.
 *
 * All these measures still do not guarantee that we reject wrapped ACKs
 * on networks with high bandwidth, when sequence space is recycled quickly,
 * but it guarantees that such events will be very rare and do not affect
 * connection seriously. This doesn't look nice, but alas, PAWS is really
 * buggy extension.
 *
 * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
 * states that events when retransmit arrives after original data are rare.
 * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
 * the biggest problem on large power networks even with minor reordering.
 * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
 * up to bandwidth of 18Gigabit/sec. 8) ]
 */
4454
4455	/ Estimates max number of increments of remote peer TSval in*
4456	* a replay window (based on our current RTO estimation).
4457	*/
4458	static u32 tcp_tsval_replay(const struct sock *sk)
4459	{
4460	/ If we use usec TS resolution,*
4461	* then expect the remote peer to use the same resolution.
4462	*/
4463	if (tcp_sk(sk)->tcp_usec_ts)
4464	return inet_csk(sk)->icsk_rto * (USEC_PER_SEC / HZ);
4465
4466	/ RFC 7323 recommends a TSval clock between 1ms and 1sec.*
4467	* We know that some OS (including old linux) can use 1200 Hz.
4468	*/
4469	return inet_csk(sk)->icsk_rto * `1200` / HZ;
4470	}
4471
4472	static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
4473	const struct sk_buff *skb)
4474	{
4475	const struct tcp_sock *tp = tcp_sk(sk);
4476	const struct tcphdr *th = tcp_hdr(skb);
4477	SKB_DR_INIT(reason, TCP_RFC7323_PAWS);
4478	u32 ack = TCP_SKB_CB(skb)->ack_seq;
4479	u32 seq = TCP_SKB_CB(skb)->seq;
4480
4481	/ 1. Is this not a pure ACK ? /
4482	if (!th->ack \|\| seq != TCP_SKB_CB(skb)->end_seq)
4483	return reason;
4484
4485	/ 2. Is its sequence not the expected one ? /
4486	if (seq != tp->rcv_nxt)
4487	return before(seq1: seq, seq2: tp->rcv_nxt) ?
4488	SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK :
4489	reason;
4490
4491	/ 3. Is this not a duplicate ACK ? /
4492	if (ack != tp->snd_una)
4493	return reason;
4494
4495	/ 4. Is this updating the window ? /
4496	if (tcp_may_update_window(tp, ack, ack_seq: seq, ntohs(th->window) <<
4497	tp->rx_opt.snd_wscale))
4498	return reason;
4499
4500	/ 5. Is this not in the replay window ? /
4501	if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) >
4502	tcp_tsval_replay(sk))
4503	return reason;
4504
4505	return `0`;
4506	}
4507
4508	/ Check segment sequence number for validity.*
4509	*
4510	* Segment controls are considered valid, if the segment
4511	* fits to the window after truncation to the window. Acceptability
4512	* of data (and SYN, FIN, of course) is checked separately.
4513	* See tcp_data_queue(), for example.
4514	*
4515	* Also, controls (RST is main one) are accepted using RCV.WUP instead
4516	* of RCV.NXT. Peer still did not advance his SND.UNA when we
4517	* delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
4518	* (borrowed from freebsd)
4519	*/
4520
4521	static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
4522	u32 seq, u32 end_seq)
4523	{
4524	if (before(seq1: end_seq, seq2: tp->rcv_wup))
4525	return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
4526
4527	if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
4528	return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
4529
4530	return SKB_NOT_DROPPED_YET;
4531	}
4532
4533
4534	void tcp_done_with_error(struct sock sk, int* err)
4535	{
4536	/ This barrier is coupled with smp_rmb() in tcp_poll() /
4537	WRITE_ONCE(sk->sk_err, err);
4538	smp_wmb();
4539
4540	tcp_write_queue_purge(sk);
4541	tcp_done(sk);
4542
4543	if (!sock_flag(sk, flag: SOCK_DEAD))
4544	sk_error_report(sk);
4545	}
4546	EXPORT_IPV6_MOD(tcp_done_with_error);
4547
/* When we get a reset we do this.
 *
 * @sk:  socket that received the RST.
 * @skb: the RST segment (handed to MPTCP option processing when relevant).
 *
 * The error reported depends on the socket state: ECONNREFUSED in SYN_SENT,
 * EPIPE in CLOSE_WAIT, ECONNRESET otherwise. A socket already in TCP_CLOSE
 * is left untouched.
 */
void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
	int err;

	trace_tcp_receive_reset(sk);

	/* mptcp can't tell us to ignore reset pkts,
	 * so just ignore the return value of mptcp_incoming_options().
	 */
	if (sk_is_mptcp(sk))
		mptcp_incoming_options(sk, skb);

	/* We want the right error as BSD sees it (and indeed as we do). */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
		err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		err = EPIPE;
		break;
	case TCP_CLOSE:
		return;
	default:
		err = ECONNRESET;
	}
	tcp_done_with_error(sk, err);
}
4576
4577	/*
4578	* Process the FIN bit. This now behaves as it is supposed to work
4579	* and the FIN takes effect when it is validly part of sequence
4580	* space. Not before when we get holes.
4581	*
4582	* If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
4583	* (and thence onto LAST-ACK and finally, CLOSE, we never enter
4584	* TIME-WAIT)
4585	*
4586	* If we are in FINWAIT-1, a received FIN indicates simultaneous
4587	* close and we go into CLOSING (and later onto TIME-WAIT)
4588	*
4589	* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
4590	*/
4591	void tcp_fin(struct sock *sk)
4592	{
4593	struct tcp_sock *tp = tcp_sk(sk);
4594
4595	inet_csk_schedule_ack(sk);
4596
4597	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown \| RCV_SHUTDOWN);
4598	sock_set_flag(sk, flag: SOCK_DONE);
4599
4600	switch (sk->sk_state) {
4601	case TCP_SYN_RECV:
4602	case TCP_ESTABLISHED:
4603	/ Move to CLOSE_WAIT /
4604	tcp_set_state(sk, state: TCP_CLOSE_WAIT);
4605	inet_csk_enter_pingpong_mode(sk);
4606	break;
4607
4608	case TCP_CLOSE_WAIT:
4609	case TCP_CLOSING:
4610	/ Received a retransmission of the FIN, do*
4611	* nothing.
4612	*/
4613	break;
4614	case TCP_LAST_ACK:
4615	/ RFC793: Remain in the LAST-ACK state. /
4616	break;
4617
4618	case TCP_FIN_WAIT1:
4619	/ This case occurs when a simultaneous close*
4620	* happens, we must ack the received FIN and
4621	* enter the CLOSING state.
4622	*/
4623	tcp_send_ack(sk);
4624	tcp_set_state(sk, state: TCP_CLOSING);
4625	break;
4626	case TCP_FIN_WAIT2:
4627	/ Received a FIN -- send ACK and enter TIME_WAIT. /
4628	tcp_send_ack(sk);
4629	tcp_time_wait(sk, state: TCP_TIME_WAIT, timeo: `0`);
4630	break;
4631	default:
4632	/ Only TCP_LISTEN and TCP_CLOSE are left, in these*
4633	* cases we should never reach this piece of code.
4634	*/
4635	pr_err("%s: Impossible, sk->sk_state=%d\n",
4636	__func__, sk->sk_state);
4637	break;
4638	}
4639
4640	/ It _is_ possible, that we have something out-of-order _after_ FIN.*
4641	* Probably, we should reset in this case. For now drop them.
4642	*/
4643	skb_rbtree_purge(root: &tp->out_of_order_queue);
4644	if (tcp_is_sack(tp))
4645	tcp_sack_reset(rx_opt: &tp->rx_opt);
4646
4647	if (!sock_flag(sk, flag: SOCK_DEAD)) {
4648	sk->sk_state_change(sk);
4649
4650	/ Do not send POLL_HUP for half duplex close. /
4651	if (sk->sk_shutdown == SHUTDOWN_MASK \|\|
4652	sk->sk_state == TCP_CLOSE)
4653	sk_wake_async(sk, how: SOCK_WAKE_WAITD, POLL_HUP);
4654	else
4655	sk_wake_async(sk, how: SOCK_WAKE_WAITD, POLL_IN);
4656	}
4657	}
4658
4659	static inline bool tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
4660	u32 end_seq)
4661	{
4662	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
4663	if (before(seq1: seq, seq2: sp->start_seq))
4664	sp->start_seq = seq;
4665	if (after(end_seq, sp->end_seq))
4666	sp->end_seq = end_seq;
4667	return true;
4668	}
4669	return false;
4670	}
4671
4672	static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4673	{
4674	struct tcp_sock *tp = tcp_sk(sk);
4675
4676	if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
4677	int mib_idx;
4678
4679	if (before(seq1: seq, seq2: tp->rcv_nxt))
4680	mib_idx = LINUX_MIB_TCPDSACKOLDSENT;
4681	else
4682	mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
4683
4684	NET_INC_STATS(sock_net(sk), mib_idx);
4685
4686	tp->rx_opt.dsack = `1`;
4687	tp->duplicate_sack[`0`].start_seq = seq;
4688	tp->duplicate_sack[`0`].end_seq = end_seq;
4689	}
4690	}
4691
4692	static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
4693	{
4694	struct tcp_sock *tp = tcp_sk(sk);
4695
4696	if (!tp->rx_opt.dsack)
4697	tcp_dsack_set(sk, seq, end_seq);
4698	else
4699	tcp_sack_extend(sp: tp->duplicate_sack, seq, end_seq);
4700	}
4701
4702	static void tcp_rcv_spurious_retrans(struct sock sk, const* struct sk_buff *skb)
4703	{
4704	/ When the ACK path fails or drops most ACKs, the sender would*
4705	* timeout and spuriously retransmit the same segment repeatedly.
4706	* If it seems our ACKs are not reaching the other side,
4707	* based on receiving a duplicate data segment with new flowlabel
4708	* (suggesting the sender suffered an RTO), and we are not already
4709	* repathing due to our own RTO, then rehash the socket to repath our
4710	* packets.
4711	*/
4712	#if IS_ENABLED(CONFIG_IPV6)
4713	if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss &&
4714	skb->protocol == htons(ETH_P_IPV6) &&
4715	(tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel !=
4716	ntohl(ip6_flowlabel(ipv6_hdr(skb)))) &&
4717	sk_rethink_txhash(sk))
4718	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
4719
4720	/ Save last flowlabel after a spurious retrans. /
4721	tcp_save_lrcv_flowlabel(sk, skb);
4722	#endif
4723	}
4724
4725	static void tcp_send_dupack(struct sock sk, const* struct sk_buff *skb)
4726	{
4727	struct tcp_sock *tp = tcp_sk(sk);
4728
4729	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
4730	before(TCP_SKB_CB(skb)->seq, seq2: tp->rcv_nxt)) {
4731	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
4732	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
4733
4734	if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
4735	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
4736
4737	tcp_rcv_spurious_retrans(sk, skb);
4738	if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
4739	end_seq = tp->rcv_nxt;
4740	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
4741	}
4742	}
4743
4744	tcp_send_ack(sk);
4745	}
4746
4747	/ These routines update the SACK block as out-of-order packets arrive or*
4748	* in-order packets close up the sequence space.
4749	*/
4750	static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4751	{
4752	int this_sack;
4753	struct tcp_sack_block *sp = &tp->selective_acks[`0`];
4754	struct tcp_sack_block *swalk = sp + `1`;
4755
4756	/ See if the recent change to the first SACK eats into*
4757	* or hits the sequence space of other SACK blocks, if so coalesce.
4758	*/
4759	for (this_sack = `1`; this_sack < tp->rx_opt.num_sacks;) {
4760	if (tcp_sack_extend(sp, seq: swalk->start_seq, end_seq: swalk->end_seq)) {
4761	int i;
4762
4763	/ Zap SWALK, by moving every further SACK up by one slot.*
4764	* Decrease num_sacks.
4765	*/
4766	tp->rx_opt.num_sacks--;
4767	for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4768	sp[i] = sp[i + `1`];
4769	continue;
4770	}
4771	this_sack++;
4772	swalk++;
4773	}
4774	}
4775
4776	void tcp_sack_compress_send_ack(struct sock *sk)
4777	{
4778	struct tcp_sock *tp = tcp_sk(sk);
4779
4780	if (!tp->compressed_ack)
4781	return;
4782
4783	if (hrtimer_try_to_cancel(timer: &tp->compressed_ack_timer) == `1`)
4784	__sock_put(sk);
4785
4786	/ Since we have to send one ack finally,*
4787	* substract one from tp->compressed_ack to keep
4788	* LINUX_MIB_TCPACKCOMPRESSED accurate.
4789	*/
4790	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
4791	tp->compressed_ack - `1`);
4792
4793	tp->compressed_ack = `0`;
4794	tcp_send_ack(sk);
4795	}
4796
4797	/ Reasonable amount of sack blocks included in TCP SACK option*
4798	* The max is 4, but this becomes 3 if TCP timestamps are there.
4799	* Given that SACK packets might be lost, be conservative and use 2.
4800	*/
4801	#define TCP_SACK_BLOCKS_EXPECTED 2
4802
4803	static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4804	{
4805	struct tcp_sock *tp = tcp_sk(sk);
4806	struct tcp_sack_block *sp = &tp->selective_acks[`0`];
4807	int cur_sacks = tp->rx_opt.num_sacks;
4808	int this_sack;
4809
4810	if (!cur_sacks)
4811	goto new_sack;
4812
4813	for (this_sack = `0`; this_sack < cur_sacks; this_sack++, sp++) {
4814	if (tcp_sack_extend(sp, seq, end_seq)) {
4815	if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4816	tcp_sack_compress_send_ack(sk);
4817	/ Rotate this_sack to the first one. /
4818	for (; this_sack > `0`; this_sack--, sp--)
4819	swap(sp, (sp - `1`));
4820	if (cur_sacks > `1`)
4821	tcp_sack_maybe_coalesce(tp);
4822	return;
4823	}
4824	}
4825
4826	if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
4827	tcp_sack_compress_send_ack(sk);
4828
4829	/ Could not find an adjacent existing SACK, build a new one,*
4830	* put it at the front, and shift everyone else down. We
4831	* always know there is at least one SACK present already here.
4832	*
4833	* If the sack array is full, forget about the last one.
4834	*/
4835	if (this_sack >= TCP_NUM_SACKS) {
4836	this_sack--;
4837	tp->rx_opt.num_sacks--;
4838	sp--;
4839	}
4840	for (; this_sack > `0`; this_sack--, sp--)
4841	sp = (sp - `1`);
4842
4843	new_sack:
4844	/ Build the new head SACK, and we're done. /
4845	sp->start_seq = seq;
4846	sp->end_seq = end_seq;
4847	tp->rx_opt.num_sacks++;
4848	}
4849
4850	/ RCV.NXT advances, some SACKs should be eaten. /
4851
4852	static void tcp_sack_remove(struct tcp_sock *tp)
4853	{
4854	struct tcp_sack_block *sp = &tp->selective_acks[`0`];
4855	int num_sacks = tp->rx_opt.num_sacks;
4856	int this_sack;
4857
4858	/ Empty ofo queue, hence, all the SACKs are eaten. Clear. /
4859	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
4860	tp->rx_opt.num_sacks = `0`;
4861	return;
4862	}
4863
4864	for (this_sack = `0`; this_sack < num_sacks;) {
4865	/ Check if the start of the sack is covered by RCV.NXT. /
4866	if (!before(seq1: tp->rcv_nxt, seq2: sp->start_seq)) {
4867	int i;
4868
4869	/ RCV.NXT must cover all the block! /
4870	WARN_ON(before(tp->rcv_nxt, sp->end_seq));
4871
4872	/ Zap this SACK, by moving forward any other SACKS. /
4873	for (i = this_sack+`1`; i < num_sacks; i++)
4874	tp->selective_acks[i-`1`] = tp->selective_acks[i];
4875	num_sacks--;
4876	continue;
4877	}
4878	this_sack++;
4879	sp++;
4880	}
4881	tp->rx_opt.num_sacks = num_sacks;
4882	}
4883
4884	/**
4885	* tcp_try_coalesce - try to merge skb to prior one
4886	* @sk: socket
4887	* @to: prior buffer
4888	* @from: buffer to add in queue
4889	* @fragstolen: pointer to boolean
4890	*
4891	* Before queueing skb @from after @to, try to merge them
4892	* to reduce overall memory use and queue lengths, if cost is small.
4893	* Packets in ofo or receive queues can stay a long time.
4894	* Better try to coalesce them right now to avoid future collapses.
4895	* Returns true if caller should free @from instead of queueing it
4896	*/
4897	static bool tcp_try_coalesce(struct sock *sk,
4898	struct sk_buff *to,
4899	struct sk_buff *from,
4900	bool *fragstolen)
4901	{
4902	int delta;
4903
4904	*fragstolen = false;
4905
4906	/ Its possible this segment overlaps with prior segment in queue /
4907	if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
4908	return false;
4909
4910	if (!tcp_skb_can_collapse_rx(to, from))
4911	return false;
4912
4913	if (!skb_try_coalesce(to, from, fragstolen, delta_truesize: &delta))
4914	return false;
4915
4916	atomic_add(i: delta, v: &sk->sk_rmem_alloc);
4917	sk_mem_charge(sk, size: delta);
4918	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
4919	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
4920	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
4921	TCP_SKB_CB(to)->tcp_flags \|= TCP_SKB_CB(from)->tcp_flags;
4922
4923	if (TCP_SKB_CB(from)->has_rxtstamp) {
4924	TCP_SKB_CB(to)->has_rxtstamp = true;
4925	to->tstamp = from->tstamp;
4926	skb_hwtstamps(skb: to)->hwtstamp = skb_hwtstamps(skb: from)->hwtstamp;
4927	}
4928
4929	return true;
4930	}
4931
4932	static bool tcp_ooo_try_coalesce(struct sock *sk,
4933	struct sk_buff *to,
4934	struct sk_buff *from,
4935	bool *fragstolen)
4936	{
4937	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
4938
4939	/ In case tcp_drop_reason() is called later, update to->gso_segs /
4940	if (res) {
4941	u32 gso_segs = max_t(u16, `1`, skb_shinfo(to)->gso_segs) +
4942	max_t(u16, `1`, skb_shinfo(from)->gso_segs);
4943
4944	skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, `0xFFFF`);
4945	}
4946	return res;
4947	}
4948
4949	noinline_for_tracing static void
4950	tcp_drop_reason(struct sock sk, struct* sk_buff skb, enum* skb_drop_reason reason)
4951	{
4952	sk_drops_add(sk, skb);
4953	sk_skb_reason_drop(sk, skb, reason);
4954	}
4955
4956	/ This one checks to see if we can put data from the*
4957	* out_of_order queue into the receive_queue.
4958	*/
4959	static void tcp_ofo_queue(struct sock *sk)
4960	{
4961	struct tcp_sock *tp = tcp_sk(sk);
4962	__u32 dsack_high = tp->rcv_nxt;
4963	bool fin, fragstolen, eaten;
4964	struct sk_buff skb, tail;
4965	struct rb_node *p;
4966
4967	p = rb_first(&tp->out_of_order_queue);
4968	while (p) {
4969	skb = rb_to_skb(p);
4970	if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
4971	break;
4972
4973	if (before(TCP_SKB_CB(skb)->seq, seq2: dsack_high)) {
4974	__u32 dsack = dsack_high;
4975	if (before(TCP_SKB_CB(skb)->end_seq, seq2: dsack_high))
4976	dsack_high = TCP_SKB_CB(skb)->end_seq;
4977	tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, end_seq: dsack);
4978	}
4979	p = rb_next(p);
4980	rb_erase(&skb->rbnode, &tp->out_of_order_queue);
4981
4982	if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
4983	tcp_drop_reason(sk, skb, reason: SKB_DROP_REASON_TCP_OFO_DROP);
4984	continue;
4985	}
4986
4987	tail = skb_peek_tail(list_: &sk->sk_receive_queue);
4988	eaten = tail && tcp_try_coalesce(sk, to: tail, from: skb, fragstolen: &fragstolen);
4989	tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
4990	fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
4991	if (!eaten)
4992	tcp_add_receive_queue(sk, skb);
4993	else
4994	kfree_skb_partial(skb, head_stolen: fragstolen);
4995
4996	if (unlikely(fin)) {
4997	tcp_fin(sk);
4998	/ tcp_fin() purges tp->out_of_order_queue,*
4999	* so we must end this loop right now.
5000	*/
5001	break;
5002	}
5003	}
5004	}
5005
5006	static bool tcp_prune_ofo_queue(struct sock sk, const* struct sk_buff *in_skb);
5007	static int tcp_prune_queue(struct sock sk, const* struct sk_buff *in_skb);
5008
5009	static int tcp_try_rmem_schedule(struct sock sk, struct* sk_buff *skb,
5010	unsigned int size)
5011	{
5012	if (atomic_read(v: &sk->sk_rmem_alloc) > sk->sk_rcvbuf \|\|
5013	!sk_rmem_schedule(sk, skb, size)) {
5014
5015	if (tcp_prune_queue(sk, in_skb: skb) < `0`)
5016	return -`1`;
5017
5018	while (!sk_rmem_schedule(sk, skb, size)) {
5019	if (!tcp_prune_ofo_queue(sk, in_skb: skb))
5020	return -`1`;
5021	}
5022	}
5023	return `0`;
5024	}
5025
5026	static void tcp_data_queue_ofo(struct sock sk, struct* sk_buff *skb)
5027	{
5028	struct tcp_sock *tp = tcp_sk(sk);
5029	struct rb_node *p, parent;
5030	struct sk_buff *skb1;
5031	u32 seq, end_seq;
5032	bool fragstolen;
5033
5034	tcp_save_lrcv_flowlabel(sk, skb);
5035	tcp_data_ecn_check(sk, skb);
5036
5037	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
5038	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
5039	sk->sk_data_ready(sk);
5040	tcp_drop_reason(sk, skb, reason: SKB_DROP_REASON_PROTO_MEM);
5041	return;
5042	}
5043
5044	/ Disable header prediction. /
5045	tp->pred_flags = `0`;
5046	inet_csk_schedule_ack(sk);
5047
5048	tp->rcv_ooopack += max_t(u16, `1`, skb_shinfo(skb)->gso_segs);
5049	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
5050	seq = TCP_SKB_CB(skb)->seq;
5051	end_seq = TCP_SKB_CB(skb)->end_seq;
5052
5053	p = &tp->out_of_order_queue.rb_node;
5054	if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
5055	/ Initial out of order segment, build 1 SACK. /
5056	if (tcp_is_sack(tp)) {
5057	tp->rx_opt.num_sacks = `1`;
5058	tp->selective_acks[`0`].start_seq = seq;
5059	tp->selective_acks[`0`].end_seq = end_seq;
5060	}
5061	rb_link_node(node: &skb->rbnode, NULL, rb_link: p);
5062	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
5063	tp->ooo_last_skb = skb;
5064	goto end;
5065	}
5066
5067	/ In the typical case, we are adding an skb to the end of the list.*
5068	* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
5069	*/
5070	if (tcp_ooo_try_coalesce(sk, to: tp->ooo_last_skb,
5071	from: skb, fragstolen: &fragstolen)) {
5072	coalesce_done:
5073	/ For non sack flows, do not grow window to force DUPACK*
5074	* and trigger fast retransmit.
5075	*/
5076	if (tcp_is_sack(tp))
5077	tcp_grow_window(sk, skb, adjust: true);
5078	kfree_skb_partial(skb, head_stolen: fragstolen);
5079	skb = NULL;
5080	goto add_sack;
5081	}
5082	/ Can avoid an rbtree lookup if we are adding skb after ooo_last_skb /
5083	if (!before(seq1: seq, TCP_SKB_CB(tp->ooo_last_skb)->end_seq)) {
5084	parent = &tp->ooo_last_skb->rbnode;
5085	p = &parent->rb_right;
5086	goto insert;
5087	}
5088
5089	/ Find place to insert this segment. Handle overlaps on the way. /
5090	parent = NULL;
5091	while (*p) {
5092	parent = *p;
5093	skb1 = rb_to_skb(parent);
5094	if (before(seq1: seq, TCP_SKB_CB(skb1)->seq)) {
5095	p = &parent->rb_left;
5096	continue;
5097	}
5098	if (before(seq1: seq, TCP_SKB_CB(skb1)->end_seq)) {
5099	if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
5100	/ All the bits are present. Drop. /
5101	NET_INC_STATS(sock_net(sk),
5102	LINUX_MIB_TCPOFOMERGE);
5103	tcp_drop_reason(sk, skb,
5104	reason: SKB_DROP_REASON_TCP_OFOMERGE);
5105	skb = NULL;
5106	tcp_dsack_set(sk, seq, end_seq);
5107	goto add_sack;
5108	}
5109	if (after(seq, TCP_SKB_CB(skb1)->seq)) {
5110	/ Partial overlap. /
5111	tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq);
5112	} else {
5113	/ skb's seq == skb1's seq and skb covers skb1.*
5114	* Replace skb1 with skb.
5115	*/
5116	rb_replace_node(victim: &skb1->rbnode, new: &skb->rbnode,
5117	root: &tp->out_of_order_queue);
5118	tcp_dsack_extend(sk,
5119	TCP_SKB_CB(skb1)->seq,
5120	TCP_SKB_CB(skb1)->end_seq);
5121	NET_INC_STATS(sock_net(sk),
5122	LINUX_MIB_TCPOFOMERGE);
5123	tcp_drop_reason(sk, skb: skb1,
5124	reason: SKB_DROP_REASON_TCP_OFOMERGE);
5125	goto merge_right;
5126	}
5127	} else if (tcp_ooo_try_coalesce(sk, to: skb1,
5128	from: skb, fragstolen: &fragstolen)) {
5129	goto coalesce_done;
5130	}
5131	p = &parent->rb_right;
5132	}
5133	insert:
5134	/ Insert segment into RB tree. /
5135	rb_link_node(node: &skb->rbnode, parent, rb_link: p);
5136	rb_insert_color(&skb->rbnode, &tp->out_of_order_queue);
5137
5138	merge_right:
5139	/ Remove other segments covered by skb. /
5140	while ((skb1 = skb_rb_next(skb)) != NULL) {
5141	if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
5142	break;
5143	if (before(seq1: end_seq, TCP_SKB_CB(skb1)->end_seq)) {
5144	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
5145	end_seq);
5146	break;
5147	}
5148	rb_erase(&skb1->rbnode, &tp->out_of_order_queue);
5149	tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
5150	TCP_SKB_CB(skb1)->end_seq);
5151	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
5152	tcp_drop_reason(sk, skb: skb1, reason: SKB_DROP_REASON_TCP_OFOMERGE);
5153	}
5154	/ If there is no skb after us, we are the last_skb ! /
5155	if (!skb1)
5156	tp->ooo_last_skb = skb;
5157
5158	add_sack:
5159	if (tcp_is_sack(tp))
5160	tcp_sack_new_ofo_skb(sk, seq, end_seq);
5161	end:
5162	if (skb) {
5163	/ For non sack flows, do not grow window to force DUPACK*
5164	* and trigger fast retransmit.
5165	*/
5166	if (tcp_is_sack(tp))
5167	tcp_grow_window(sk, skb, adjust: false);
5168	skb_condense(skb);
5169	skb_set_owner_r(skb, sk);
5170	}
5171	tcp_rcvbuf_grow(sk);
5172	}
5173
5174	static int __must_check tcp_queue_rcv(struct sock sk, struct* sk_buff *skb,
5175	bool *fragstolen)
5176	{
5177	int eaten;
5178	struct sk_buff *tail = skb_peek_tail(list_: &sk->sk_receive_queue);
5179
5180	eaten = (tail &&
5181	tcp_try_coalesce(sk, to: tail,
5182	from: skb, fragstolen)) ? `1` : `0`;
5183	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
5184	if (!eaten) {
5185	tcp_add_receive_queue(sk, skb);
5186	skb_set_owner_r(skb, sk);
5187	}
5188	return eaten;
5189	}
5190
5191	int tcp_send_rcvq(struct sock sk, struct* msghdr *msg, size_t size)
5192	{
5193	struct sk_buff *skb;
5194	int err = -ENOMEM;
5195	int data_len = `0`;
5196	bool fragstolen;
5197
5198	if (size == `0`)
5199	return `0`;
5200
5201	if (size > PAGE_SIZE) {
5202	int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
5203
5204	data_len = npages << PAGE_SHIFT;
5205	size = data_len + (size & ~PAGE_MASK);
5206	}
5207	skb = alloc_skb_with_frags(header_len: size - data_len, data_len,
5208	PAGE_ALLOC_COSTLY_ORDER,
5209	errcode: &err, gfp_mask: sk->sk_allocation);
5210	if (!skb)
5211	goto err;
5212
5213	skb_put(skb, len: size - data_len);
5214	skb->data_len = data_len;
5215	skb->len = size;
5216
5217	if (tcp_try_rmem_schedule(sk, skb, size: skb->truesize)) {
5218	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
5219	goto err_free;
5220	}
5221
5222	err = skb_copy_datagram_from_iter(skb, offset: `0`, from: &msg->msg_iter, len: size);
5223	if (err)
5224	goto err_free;
5225
5226	TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
5227	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
5228	TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - `1`;
5229
5230	if (tcp_queue_rcv(sk, skb, fragstolen: &fragstolen)) {
5231	WARN_ON_ONCE(fragstolen); / should not happen /
5232	__kfree_skb(skb);
5233	}
5234	return size;
5235
5236	err_free:
5237	kfree_skb(skb);
5238	err:
5239	return err;
5240
5241	}
5242
5243	void tcp_data_ready(struct sock *sk)
5244	{
5245	if (tcp_epollin_ready(sk, target: sk->sk_rcvlowat) \|\| sock_flag(sk, flag: SOCK_DONE))
5246	sk->sk_data_ready(sk);
5247	}
5248
5249	static void tcp_data_queue(struct sock sk, struct* sk_buff *skb)
5250	{
5251	struct tcp_sock *tp = tcp_sk(sk);
5252	enum skb_drop_reason reason;
5253	bool fragstolen;
5254	int eaten;
5255
5256	/ If a subflow has been reset, the packet should not continue*
5257	* to be processed, drop the packet.
5258	*/
5259	if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
5260	__kfree_skb(skb);
5261	return;
5262	}
5263
5264	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
5265	__kfree_skb(skb);
5266	return;
5267	}
5268	tcp_cleanup_skb(skb);
5269	__skb_pull(skb, len: tcp_hdr(skb)->doff * `4`);
5270
5271	reason = SKB_DROP_REASON_NOT_SPECIFIED;
5272	tp->rx_opt.dsack = `0`;
5273
5274	/ Queue data for delivery to the user.*
5275	* Packets in sequence go to the receive queue.
5276	* Out of sequence packets to the out_of_order_queue.
5277	*/
5278	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
5279	if (tcp_receive_window(tp) == `0`) {
5280	/ Some stacks are known to send bare FIN packets*
5281	* in a loop even if we send RWIN 0 in our ACK.
5282	* Accepting this FIN does not hurt memory pressure
5283	* because the FIN flag will simply be merged to the
5284	* receive queue tail skb in most cases.
5285	*/
5286	if (!skb->len &&
5287	(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
5288	goto queue_and_out;
5289
5290	reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
5291	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
5292	goto out_of_window;
5293	}
5294
5295	/ Ok. In sequence. In window. /
5296	queue_and_out:
5297	if (tcp_try_rmem_schedule(sk, skb, size: skb->truesize)) {
5298	/ TODO: maybe ratelimit these WIN 0 ACK ? /
5299	inet_csk(sk)->icsk_ack.pending \|=
5300	(ICSK_ACK_NOMEM \| ICSK_ACK_NOW);
5301	inet_csk_schedule_ack(sk);
5302	sk->sk_data_ready(sk);
5303
5304	if (skb_queue_len(list_: &sk->sk_receive_queue) && skb->len) {
5305	reason = SKB_DROP_REASON_PROTO_MEM;
5306	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
5307	goto drop;
5308	}
5309	sk_forced_mem_schedule(sk, size: skb->truesize);
5310	}
5311
5312	eaten = tcp_queue_rcv(sk, skb, fragstolen: &fragstolen);
5313	if (skb->len)
5314	tcp_event_data_recv(sk, skb);
5315	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
5316	tcp_fin(sk);
5317
5318	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
5319	tcp_ofo_queue(sk);
5320
5321	/ RFC5681. 4.2. SHOULD send immediate ACK, when*
5322	* gap in queue is filled.
5323	*/
5324	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
5325	inet_csk(sk)->icsk_ack.pending \|= ICSK_ACK_NOW;
5326	}
5327
5328	if (tp->rx_opt.num_sacks)
5329	tcp_sack_remove(tp);
5330
5331	tcp_fast_path_check(sk);
5332
5333	if (eaten > `0`)
5334	kfree_skb_partial(skb, head_stolen: fragstolen);
5335	if (!sock_flag(sk, flag: SOCK_DEAD))
5336	tcp_data_ready(sk);
5337	return;
5338	}
5339
5340	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
5341	tcp_rcv_spurious_retrans(sk, skb);
5342	/ A retransmit, 2nd most common case. Force an immediate ack. /
5343	reason = SKB_DROP_REASON_TCP_OLD_DATA;
5344	NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
5345	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
5346
5347	out_of_window:
5348	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
5349	inet_csk_schedule_ack(sk);
5350	drop:
5351	tcp_drop_reason(sk, skb, reason);
5352	return;
5353	}
5354
5355	/ Out of window. F.e. zero window probe. /
5356	if (!before(TCP_SKB_CB(skb)->seq,
5357	seq2: tp->rcv_nxt + tcp_receive_window(tp))) {
5358	reason = SKB_DROP_REASON_TCP_OVERWINDOW;
5359	goto out_of_window;
5360	}
5361
5362	if (before(TCP_SKB_CB(skb)->seq, seq2: tp->rcv_nxt)) {
5363	/ Partial packet, seq < rcv_next < end_seq /
5364	tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq: tp->rcv_nxt);
5365
5366	/ If window is closed, drop tail of packet. But after*
5367	* remembering D-SACK for its head made in previous line.
5368	*/
5369	if (!tcp_receive_window(tp)) {
5370	reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
5371	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
5372	goto out_of_window;
5373	}
5374	goto queue_and_out;
5375	}
5376
5377	tcp_data_queue_ofo(sk, skb);
5378	}
5379
5380	static struct sk_buff tcp_skb_next(struct* sk_buff skb, struct* sk_buff_head *list)
5381	{
5382	if (list)
5383	return !skb_queue_is_last(list, skb) ? skb->next : NULL;
5384
5385	return skb_rb_next(skb);
5386	}
5387
5388	static struct sk_buff tcp_collapse_one(struct* sock sk, struct* sk_buff *skb,
5389	struct sk_buff_head *list,
5390	struct rb_root *root)
5391	{
5392	struct sk_buff *next = tcp_skb_next(skb, list);
5393
5394	if (list)
5395	__skb_unlink(skb, list);
5396	else
5397	rb_erase(&skb->rbnode, root);
5398
5399	__kfree_skb(skb);
5400	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
5401
5402	return next;
5403	}
5404
5405	/ Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq /
5406	void tcp_rbtree_insert(struct rb_root root, struct* sk_buff *skb)
5407	{
5408	struct rb_node **p = &root->rb_node;
5409	struct rb_node *parent = NULL;
5410	struct sk_buff *skb1;
5411
5412	while (*p) {
5413	parent = *p;
5414	skb1 = rb_to_skb(parent);
5415	if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
5416	p = &parent->rb_left;
5417	else
5418	p = &parent->rb_right;
5419	}
5420	rb_link_node(node: &skb->rbnode, parent, rb_link: p);
5421	rb_insert_color(&skb->rbnode, root);
5422	}
5423
5424	/ Collapse contiguous sequence of skbs head..tail with*
5425	* sequence numbers start..end.
5426	*
5427	* If tail is NULL, this means until the end of the queue.
5428	*
5429	* Segments with FIN/SYN are not collapsed (only because this
5430	* simplifies code)
5431	*/
5432	static void
5433	tcp_collapse(struct sock sk, struct* sk_buff_head list, struct* rb_root *root,
5434	struct sk_buff head, struct* sk_buff *tail, u32 start, u32 end)
5435	{
5436	struct sk_buff skb = head, n;
5437	struct sk_buff_head tmp;
5438	bool end_of_skbs;
5439
5440	/ First, check that queue is collapsible and find*
5441	* the point where collapsing can be useful.
5442	*/
5443	restart:
5444	for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
5445	n = tcp_skb_next(skb, list);
5446
5447	if (!skb_frags_readable(skb))
5448	goto skip_this;
5449
5450	/ No new bits? It is possible on ofo queue. /
5451	if (!before(seq1: start, TCP_SKB_CB(skb)->end_seq)) {
5452	skb = tcp_collapse_one(sk, skb, list, root);
5453	if (!skb)
5454	break;
5455	goto restart;
5456	}
5457
5458	/ The first skb to collapse is:*
5459	* - not SYN/FIN and
5460	* - bloated or contains data before "start" or
5461	* overlaps to the next one and mptcp allow collapsing.
5462	*/
5463	if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)) &&
5464	(tcp_win_from_space(sk, space: skb->truesize) > skb->len \|\|
5465	before(TCP_SKB_CB(skb)->seq, seq2: start))) {
5466	end_of_skbs = false;
5467	break;
5468	}
5469
5470	if (n && n != tail && skb_frags_readable(skb: n) &&
5471	tcp_skb_can_collapse_rx(to: skb, from: n) &&
5472	TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
5473	end_of_skbs = false;
5474	break;
5475	}
5476
5477	skip_this:
5478	/ Decided to skip this, advance start seq. /
5479	start = TCP_SKB_CB(skb)->end_seq;
5480	}
5481	if (end_of_skbs \|\|
5482	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)) \|\|
5483	!skb_frags_readable(skb))
5484	return;
5485
5486	__skb_queue_head_init(list: &tmp);
5487
5488	while (before(seq1: start, seq2: end)) {
5489	int copy = min_t(int, SKB_MAX_ORDER(`0`, `0`), end - start);
5490	struct sk_buff *nskb;
5491
5492	nskb = alloc_skb(size: copy, GFP_ATOMIC);
5493	if (!nskb)
5494	break;
5495
5496	memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
5497	skb_copy_decrypted(to: nskb, from: skb);
5498	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
5499	if (list)
5500	__skb_queue_before(list, next: skb, newsk: nskb);
5501	else
5502	__skb_queue_tail(list: &tmp, newsk: nskb); / defer rbtree insertion /
5503	skb_set_owner_r(skb: nskb, sk);
5504	mptcp_skb_ext_move(to: nskb, from: skb);
5505
5506	/ Copy data, releasing collapsed skbs. /
5507	while (copy > `0`) {
5508	int offset = start - TCP_SKB_CB(skb)->seq;
5509	int size = TCP_SKB_CB(skb)->end_seq - start;
5510
5511	BUG_ON(offset < `0`);
5512	if (size > `0`) {
5513	size = min(copy, size);
5514	if (skb_copy_bits(skb, offset, to: skb_put(skb: nskb, len: size), len: size))
5515	BUG();
5516	TCP_SKB_CB(nskb)->end_seq += size;
5517	copy -= size;
5518	start += size;
5519	}
5520	if (!before(seq1: start, TCP_SKB_CB(skb)->end_seq)) {
5521	skb = tcp_collapse_one(sk, skb, list, root);
5522	if (!skb \|\|
5523	skb == tail \|\|
5524	!tcp_skb_can_collapse_rx(to: nskb, from: skb) \|\|
5525	(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN \| TCPHDR_FIN)) \|\|
5526	!skb_frags_readable(skb))
5527	goto end;
5528	}
5529	}
5530	}
5531	end:
5532	skb_queue_walk_safe(&tmp, skb, n)
5533	tcp_rbtree_insert(root, skb);
5534	}
5535
5536	/ Collapse ofo queue. Algorithm: select contiguous sequence of skbs*
5537	* and tcp_collapse() them until all the queue is collapsed.
5538	*/
5539	static void tcp_collapse_ofo_queue(struct sock *sk)
5540	{
5541	struct tcp_sock *tp = tcp_sk(sk);
5542	u32 range_truesize, sum_tiny = `0`;
5543	struct sk_buff skb, head;
5544	u32 start, end;
5545
5546	skb = skb_rb_first(&tp->out_of_order_queue);
5547	new_range:
5548	if (!skb) {
5549	tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue);
5550	return;
5551	}
5552	start = TCP_SKB_CB(skb)->seq;
5553	end = TCP_SKB_CB(skb)->end_seq;
5554	range_truesize = skb->truesize;
5555
5556	for (head = skb;;) {
5557	skb = skb_rb_next(skb);
5558
5559	/ Range is terminated when we see a gap or when*
5560	* we are at the queue end.
5561	*/
5562	if (!skb \|\|
5563	after(TCP_SKB_CB(skb)->seq, end) \|\|
5564	before(TCP_SKB_CB(skb)->end_seq, seq2: start)) {
5565	/ Do not attempt collapsing tiny skbs /
5566	if (range_truesize != head->truesize \|\|
5567	end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
5568	tcp_collapse(sk, NULL, root: &tp->out_of_order_queue,
5569	head, tail: skb, start, end);
5570	} else {
5571	sum_tiny += range_truesize;
5572	if (sum_tiny > sk->sk_rcvbuf >> `3`)
5573	return;
5574	}
5575	goto new_range;
5576	}
5577
5578	range_truesize += skb->truesize;
5579	if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
5580	start = TCP_SKB_CB(skb)->seq;
5581	if (after(TCP_SKB_CB(skb)->end_seq, end))
5582	end = TCP_SKB_CB(skb)->end_seq;
5583	}
5584	}
5585
5586	/*
5587	* Clean the out-of-order queue to make room.
5588	* We drop high sequences packets to :
5589	* 1) Let a chance for holes to be filled.
5590	* This means we do not drop packets from ooo queue if their sequence
5591	* is before incoming packet sequence.
5592	* 2) not add too big latencies if thousands of packets sit there.
5593	* (But if application shrinks SO_RCVBUF, we could still end up
5594	* freeing whole queue here)
5595	* 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
5596	*
5597	* Return true if queue has shrunk.
5598	*/
5599	static bool tcp_prune_ofo_queue(struct sock sk, const* struct sk_buff *in_skb)
5600	{
5601	struct tcp_sock *tp = tcp_sk(sk);
5602	struct rb_node node, prev;
5603	bool pruned = false;
5604	int goal;
5605
5606	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
5607	return false;
5608
5609	goal = sk->sk_rcvbuf >> `3`;
5610	node = &tp->ooo_last_skb->rbnode;
5611
5612	do {
5613	struct sk_buff *skb = rb_to_skb(node);
5614
5615	/ If incoming skb would land last in ofo queue, stop pruning. /
5616	if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
5617	break;
5618	pruned = true;
5619	prev = rb_prev(node);
5620	rb_erase(node, &tp->out_of_order_queue);
5621	goal -= skb->truesize;
5622	tcp_drop_reason(sk, skb, reason: SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
5623	tp->ooo_last_skb = rb_to_skb(prev);
5624	if (!prev \|\| goal <= `0`) {
5625	if (atomic_read(v: &sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
5626	!tcp_under_memory_pressure(sk))
5627	break;
5628	goal = sk->sk_rcvbuf >> `3`;
5629	}
5630	node = prev;
5631	} while (node);
5632
5633	if (pruned) {
5634	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
5635	/ Reset SACK state. A conforming SACK implementation will*
5636	* do the same at a timeout based retransmit. When a connection
5637	* is in a sad state like this, we care only about integrity
5638	* of the connection not performance.
5639	*/
5640	if (tp->rx_opt.sack_ok)
5641	tcp_sack_reset(rx_opt: &tp->rx_opt);
5642	}
5643	return pruned;
5644	}
5645
5646	/ Reduce allocated memory if we can, trying to get*
5647	* the socket within its memory limits again.
5648	*
5649	* Return less than zero if we should start dropping frames
5650	* until the socket owning process reads some of the data
5651	* to stabilize the situation.
5652	*/
5653	static int tcp_prune_queue(struct sock sk, const* struct sk_buff *in_skb)
5654	{
5655	struct tcp_sock *tp = tcp_sk(sk);
5656
5657	NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
5658
5659	if (atomic_read(v: &sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
5660	tcp_clamp_window(sk);
5661	else if (tcp_under_memory_pressure(sk))
5662	tcp_adjust_rcv_ssthresh(sk);
5663
5664	if (atomic_read(v: &sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5665	return `0`;
5666
5667	tcp_collapse_ofo_queue(sk);
5668	if (!skb_queue_empty(list: &sk->sk_receive_queue))
5669	tcp_collapse(sk, list: &sk->sk_receive_queue, NULL,
5670	head: skb_peek(list_: &sk->sk_receive_queue),
5671	NULL,
5672	start: tp->copied_seq, end: tp->rcv_nxt);
5673
5674	if (atomic_read(v: &sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5675	return `0`;
5676
5677	/ Collapsing did not help, destructive actions follow.*
5678	* This must not ever occur. */
5679
5680	tcp_prune_ofo_queue(sk, in_skb);
5681
5682	if (atomic_read(v: &sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
5683	return `0`;
5684
5685	/ If we are really being abused, tell the caller to silently*
5686	* drop receive data on the floor. It will get retransmitted
5687	* and hopefully then we'll have sufficient space.
5688	*/
5689	NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
5690
5691	/ Massive buffer overcommit. /
5692	tp->pred_flags = `0`;
5693	return -`1`;
5694	}
5695
5696	static bool tcp_should_expand_sndbuf(struct sock *sk)
5697	{
5698	const struct tcp_sock *tp = tcp_sk(sk);
5699
5700	/ If the user specified a specific send buffer setting, do*
5701	* not modify it.
5702	*/
5703	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
5704	return false;
5705
5706	/ If we are under global TCP memory pressure, do not expand. /
5707	if (tcp_under_memory_pressure(sk)) {
5708	int unused_mem = sk_unused_reserved_mem(sk);
5709
5710	/ Adjust sndbuf according to reserved mem. But make sure*
5711	* it never goes below SOCK_MIN_SNDBUF.
5712	* See sk_stream_moderate_sndbuf() for more details.
5713	*/
5714	if (unused_mem > SOCK_MIN_SNDBUF)
5715	WRITE_ONCE(sk->sk_sndbuf, unused_mem);
5716
5717	return false;
5718	}
5719
5720	/ If we are under soft global TCP memory pressure, do not expand. /
5721	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, index: `0`))
5722	return false;
5723
5724	/ If we filled the congestion window, do not expand. /
5725	if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp))
5726	return false;
5727
5728	return true;
5729	}
5730
5731	static void tcp_new_space(struct sock *sk)
5732	{
5733	struct tcp_sock *tp = tcp_sk(sk);
5734
5735	if (tcp_should_expand_sndbuf(sk)) {
5736	tcp_sndbuf_expand(sk);
5737	tp->snd_cwnd_stamp = tcp_jiffies32;
5738	}
5739
5740	INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
5741	}
5742
5743	/ Caller made space either from:*
5744	* 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
5745	* 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
5746	*
5747	* We might be able to generate EPOLLOUT to the application if:
5748	* 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
5749	* 2) notsent amount (tp->write_seq - tp->snd_nxt) became
5750	* small enough that tcp_stream_memory_free() decides it
5751	* is time to generate EPOLLOUT.
5752	*/
5753	void tcp_check_space(struct sock *sk)
5754	{
5755	/ pairs with tcp_poll() /
5756	smp_mb();
5757	if (sk->sk_socket &&
5758	test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
5759	tcp_new_space(sk);
5760	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
5761	tcp_chrono_stop(sk, type: TCP_CHRONO_SNDBUF_LIMITED);
5762	}
5763	}
5764
/* Transmit-side follow-up: push any pending frames first, then run
 * tcp_check_space() on the socket.
 */
static inline void tcp_data_snd_check(struct sock *sk)
{
	tcp_push_pending_frames(sk);
	tcp_check_space(sk);
}
5770
5771	/*
5772	* Check if sending an ack is needed.
5773	*/
5774	static void __tcp_ack_snd_check(struct sock sk, int* ofo_possible)
5775	{
5776	struct tcp_sock *tp = tcp_sk(sk);
5777	unsigned long rtt, delay;
5778
5779	/ More than one full frame received... /
5780	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
5781	/ ... and right edge of window advances far enough.*
5782	* (tcp_recvmsg() will send ACK otherwise).
5783	* If application uses SO_RCVLOWAT, we want send ack now if
5784	* we have not received enough bytes to satisfy the condition.
5785	*/
5786	(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat \|\|
5787	__tcp_select_window(sk) >= tp->rcv_wnd)) \|\|
5788	/ We ACK each frame or... /
5789	tcp_in_quickack_mode(sk) \|\|
5790	/ Protocol state mandates a one-time immediate ACK /
5791	inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
5792	/ If we are running from __release_sock() in user context,*
5793	* Defer the ack until tcp_release_cb().
5794	*/
5795	if (sock_owned_by_user_nocheck(sk) &&
5796	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
5797	set_bit(nr: TCP_ACK_DEFERRED, addr: &sk->sk_tsq_flags);
5798	return;
5799	}
5800	send_now:
5801	tcp_send_ack(sk);
5802	return;
5803	}
5804
5805	if (!ofo_possible \|\| RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
5806	tcp_send_delayed_ack(sk);
5807	return;
5808	}
5809
5810	if (!tcp_is_sack(tp) \|\|
5811	tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
5812	goto send_now;
5813
5814	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
5815	tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
5816	tp->dup_ack_counter = `0`;
5817	}
5818	if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
5819	tp->dup_ack_counter++;
5820	goto send_now;
5821	}
5822	tp->compressed_ack++;
5823	if (hrtimer_is_queued(timer: &tp->compressed_ack_timer))
5824	return;
5825
5826	/ compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns /
5827
5828	rtt = tp->rcv_rtt_est.rtt_us;
5829	if (tp->srtt_us && tp->srtt_us < rtt)
5830	rtt = tp->srtt_us;
5831
5832	delay = min_t(unsigned long,
5833	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
5834	rtt * (NSEC_PER_USEC >> `3`)/`20`);
5835	sock_hold(sk);
5836	hrtimer_start_range_ns(timer: &tp->compressed_ack_timer, tim: ns_to_ktime(ns: delay),
5837	READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
5838	mode: HRTIMER_MODE_REL_PINNED_SOFT);
5839	}
5840
/* Slow-path wrapper around __tcp_ack_snd_check(): does nothing when no
 * ACK is scheduled, otherwise runs the full check with the out-of-order
 * queue taken into account.
 */
static inline void tcp_ack_snd_check(struct sock *sk)
{
	if (!inet_csk_ack_scheduled(sk)) {
		/* We sent a data segment already. */
		return;
	}
	__tcp_ack_snd_check(sk, 1);
}
5849
5850	/*
5851	* This routine is only called when we have urgent data
5852	* signaled. Its the 'slow' part of tcp_urg. It could be
5853	* moved inline now as tcp_urg is only called from one
5854	* place. We handle URGent data wrong. We have to - as
5855	* BSD still doesn't use the correction from RFC961.
5856	* For 1003.1g we should support a new option TCP_STDURG to permit
5857	* either form (or just set the sysctl tcp_stdurg).
5858	*/
5859
5860	static void tcp_check_urg(struct sock sk, const* struct tcphdr *th)
5861	{
5862	struct tcp_sock *tp = tcp_sk(sk);
5863	u32 ptr = ntohs(th->urg_ptr);
5864
5865	if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
5866	ptr--;
5867	ptr += ntohl(th->seq);
5868
5869	/ Ignore urgent data that we've already seen and read. /
5870	if (after(tp->copied_seq, ptr))
5871	return;
5872
5873	/ Do not replay urg ptr.*
5874	*
5875	* NOTE: interesting situation not covered by specs.
5876	* Misbehaving sender may send urg ptr, pointing to segment,
5877	* which we already have in ofo queue. We are not able to fetch
5878	* such data and will stay in TCP_URG_NOTYET until will be eaten
5879	* by recvmsg(). Seems, we are not obliged to handle such wicked
5880	* situations. But it is worth to think about possibility of some
5881	* DoSes using some hypothetical application level deadlock.
5882	*/
5883	if (before(seq1: ptr, seq2: tp->rcv_nxt))
5884	return;
5885
5886	/ Do we already have a newer (or duplicate) urgent pointer? /
5887	if (tp->urg_data && !after(ptr, tp->urg_seq))
5888	return;
5889
5890	/ Tell the world about our new urgent pointer. /
5891	sk_send_sigurg(sk);
5892
5893	/ We may be adding urgent data when the last byte read was*
5894	* urgent. To do this requires some care. We cannot just ignore
5895	* tp->copied_seq since we would read the last urgent byte again
5896	* as data, nor can we alter copied_seq until this data arrives
5897	* or we break the semantics of SIOCATMARK (and thus sockatmark())
5898	*
5899	* NOTE. Double Dutch. Rendering to plain English: author of comment
5900	* above did something sort of send("A", MSG_OOB); send("B", MSG_OOB);
5901	* and expect that both A and B disappear from stream. This is _wrong_.
5902	* Though this happens in BSD with high probability, this is occasional.
5903	* Any application relying on this is buggy. Note also, that fix "works"
5904	* only in this artificial test. Insert some normal data between A and B and we will
5905	* decline of BSD again. Verdict: it is better to remove to trap
5906	* buggy users.
5907	*/
5908	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
5909	!sock_flag(sk, flag: SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
5910	struct sk_buff *skb = skb_peek(list_: &sk->sk_receive_queue);
5911	tp->copied_seq++;
5912	if (skb && !before(seq1: tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
5913	__skb_unlink(skb, list: &sk->sk_receive_queue);
5914	__kfree_skb(skb);
5915	}
5916	}
5917
5918	WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
5919	WRITE_ONCE(tp->urg_seq, ptr);
5920
5921	/ Disable header prediction. /
5922	tp->pred_flags = `0`;
5923	}
5924
5925	/ This is the 'fast' part of urgent handling. /
5926	static void tcp_urg(struct sock sk, struct* sk_buff skb, const* struct tcphdr *th)
5927	{
5928	struct tcp_sock *tp = tcp_sk(sk);
5929
5930	/ Check if we get a new urgent pointer - normally not. /
5931	if (unlikely(th->urg))
5932	tcp_check_urg(sk, th);
5933
5934	/ Do we wait for any urgent data? - normally not... /
5935	if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
5936	u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * `4`) -
5937	th->syn;
5938
5939	/ Is the urgent pointer pointing into this packet? /
5940	if (ptr < skb->len) {
5941	u8 tmp;
5942	if (skb_copy_bits(skb, offset: ptr, to: &tmp, len: `1`))
5943	BUG();
5944	WRITE_ONCE(tp->urg_data, TCP_URG_VALID \| tmp);
5945	if (!sock_flag(sk, flag: SOCK_DEAD))
5946	sk->sk_data_ready(sk);
5947	}
5948	}
5949	}
5950
5951	/ Accept RST for rcv_nxt - 1 after a FIN.*
5952	* When tcp connections are abruptly terminated from Mac OSX (via ^C), a
5953	* FIN is sent followed by a RST packet. The RST is sent with the same
5954	* sequence number as the FIN, and thus according to RFC 5961 a challenge
5955	* ACK should be sent. However, Mac OSX rate limits replies to challenge
5956	* ACKs on the closed socket. In addition middleboxes can drop either the
5957	* challenge ACK or a subsequent RST.
5958	*/
5959	static bool tcp_reset_check(const struct sock sk, const* struct sk_buff *skb)
5960	{
5961	const struct tcp_sock *tp = tcp_sk(sk);
5962
5963	return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - `1`) &&
5964	(`1` << sk->sk_state) & (TCPF_CLOSE_WAIT \| TCPF_LAST_ACK \|
5965	TCPF_CLOSING));
5966	}
5967
5968	/ Does PAWS and seqno based validation of an incoming segment, flags will*
5969	* play significant role here.
5970	*/
5971	static bool tcp_validate_incoming(struct sock sk, struct* sk_buff *skb,
5972	const struct tcphdr th, int* syn_inerr)
5973	{
5974	struct tcp_sock *tp = tcp_sk(sk);
5975	SKB_DR(reason);
5976
5977	/ RFC1323: H1. Apply PAWS check first. /
5978	if (!tcp_fast_parse_options(net: sock_net(sk), skb, th, tp) \|\|
5979	!tp->rx_opt.saw_tstamp \|\|
5980	tcp_paws_check(rx_opt: &tp->rx_opt, TCP_PAWS_WINDOW))
5981	goto step1;
5982
5983	reason = tcp_disordered_ack_check(sk, skb);
5984	if (!reason)
5985	goto step1;
5986	/ Reset is accepted even if it did not pass PAWS. /
5987	if (th->rst)
5988	goto step1;
5989	if (unlikely(th->syn))
5990	goto syn_challenge;
5991
5992	/ Old ACK are common, increment PAWS_OLD_ACK*
5993	* and do not send a dupack.
5994	*/
5995	if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) {
5996	NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK);
5997	goto discard;
5998	}
5999	NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
6000	if (!tcp_oow_rate_limited(net: sock_net(sk), skb,
6001	mib_idx: LINUX_MIB_TCPACKSKIPPEDPAWS,
6002	last_oow_ack_time: &tp->last_oow_ack_time))
6003	tcp_send_dupack(sk, skb);
6004	goto discard;
6005
6006	step1:
6007	/ Step 1: check sequence number /
6008	reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
6009	if (reason) {
6010	/ RFC793, page 37: "In all states except SYN-SENT, all reset*
6011	* (RST) segments are validated by checking their SEQ-fields."
6012	* And page 69: "If an incoming segment is not acceptable,
6013	* an acknowledgment should be sent in reply (unless the RST
6014	* bit is set, if so drop the segment and return)".
6015	*/
6016	if (!th->rst) {
6017	if (th->syn)
6018	goto syn_challenge;
6019	if (!tcp_oow_rate_limited(net: sock_net(sk), skb,
6020	mib_idx: LINUX_MIB_TCPACKSKIPPEDSEQ,
6021	last_oow_ack_time: &tp->last_oow_ack_time))
6022	tcp_send_dupack(sk, skb);
6023	} else if (tcp_reset_check(sk, skb)) {
6024	goto reset;
6025	}
6026	goto discard;
6027	}
6028
6029	/ Step 2: check RST bit /
6030	if (th->rst) {
6031	/ RFC 5961 3.2 (extend to match against (RCV.NXT - 1) after a*
6032	* FIN and SACK too if available):
6033	* If seq num matches RCV.NXT or (RCV.NXT - 1) after a FIN, or
6034	* the right-most SACK block,
6035	* then
6036	* RESET the connection
6037	* else
6038	* Send a challenge ACK
6039	*/
6040	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt \|\|
6041	tcp_reset_check(sk, skb))
6042	goto reset;
6043
6044	if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > `0`) {
6045	struct tcp_sack_block *sp = &tp->selective_acks[`0`];
6046	int max_sack = sp[`0`].end_seq;
6047	int this_sack;
6048
6049	for (this_sack = `1`; this_sack < tp->rx_opt.num_sacks;
6050	++this_sack) {
6051	max_sack = after(sp[this_sack].end_seq,
6052	max_sack) ?
6053	sp[this_sack].end_seq : max_sack;
6054	}
6055
6056	if (TCP_SKB_CB(skb)->seq == max_sack)
6057	goto reset;
6058	}
6059
6060	/ Disable TFO if RST is out-of-order*
6061	* and no data has been received
6062	* for current active TFO socket
6063	*/
6064	if (tp->syn_fastopen && !tp->data_segs_in &&
6065	sk->sk_state == TCP_ESTABLISHED)
6066	tcp_fastopen_active_disable(sk);
6067	tcp_send_challenge_ack(sk);
6068	SKB_DR_SET(reason, TCP_RESET);
6069	goto discard;
6070	}
6071
6072	/ step 3: check security and precedence [ignored] /
6073
6074	/ step 4: Check for a SYN*
6075	* RFC 5961 4.2 : Send a challenge ack
6076	*/
6077	if (th->syn) {
6078	if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
6079	TCP_SKB_CB(skb)->seq + `1` == TCP_SKB_CB(skb)->end_seq &&
6080	TCP_SKB_CB(skb)->seq + `1` == tp->rcv_nxt &&
6081	TCP_SKB_CB(skb)->ack_seq == tp->snd_nxt)
6082	goto pass;
6083	syn_challenge:
6084	if (syn_inerr)
6085	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
6086	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
6087	tcp_send_challenge_ack(sk);
6088	SKB_DR_SET(reason, TCP_INVALID_SYN);
6089	goto discard;
6090	}
6091
6092	pass:
6093	bpf_skops_parse_hdr(sk, skb);
6094
6095	return true;
6096
6097	discard:
6098	tcp_drop_reason(sk, skb, reason);
6099	return false;
6100
6101	reset:
6102	tcp_reset(sk, skb);
6103	__kfree_skb(skb);
6104	return false;
6105	}
6106
6107	/*
6108	* TCP receive function for the ESTABLISHED state.
6109	*
6110	* It is split into a fast path and a slow path. The fast path is
6111	* disabled when:
6112	* - A zero window was announced from us - zero window probing
6113	* is only handled properly in the slow path.
6114	* - Out of order segments arrived.
6115	* - Urgent data is expected.
6116	* - There is no buffer space left
6117	* - Unexpected TCP flags/window values/header lengths are received
6118	* (detected by checking the TCP header against pred_flags)
6119	* - Data is sent in both directions. Fast path only supports pure senders
6120	* or pure receivers (this means either the sequence number or the ack
6121	* value must stay constant)
6122	* - Unexpected TCP option.
6123	*
6124	* When these conditions are not satisfied it drops into a standard
6125	* receive procedure patterned after RFC793 to handle all cases.
6126	* The first three cases are guaranteed by proper pred_flags setting,
6127	* the rest is checked inline. Fast processing is turned on in
6128	* tcp_data_queue when everything is OK.
6129	*/
6130	void tcp_rcv_established(struct sock sk, struct* sk_buff *skb)
6131	{
6132	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
6133	const struct tcphdr th = (const* struct tcphdr *)skb->data;
6134	struct tcp_sock *tp = tcp_sk(sk);
6135	unsigned int len = skb->len;
6136
6137	/ TCP congestion window tracking /
6138	trace_tcp_probe(sk, skb);
6139
6140	tcp_mstamp_refresh(tp);
6141	if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
6142	inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
6143	/*
6144	* Header prediction.
6145	* The code loosely follows the one in the famous
6146	* "30 instruction TCP receive" Van Jacobson mail.
6147	*
6148	* Van's trick is to deposit buffers into socket queue
6149	* on a device interrupt, to call tcp_recv function
6150	* on the receive process context and checksum and copy
6151	* the buffer to user space. smart...
6152	*
6153	* Our current scheme is not silly either but we take the
6154	* extra cost of the net_bh soft interrupt processing...
6155	* We do checksum and copy also but from device to kernel.
6156	*/
6157
6158	tp->rx_opt.saw_tstamp = `0`;
6159
6160	/ pred_flags is 0xS?10 << 16 + snd_wnd*
6161	* if header_prediction is to be made
6162	* 'S' will always be tp->tcp_header_len >> 2
6163	* '?' will be 0 for the fast path, otherwise pred_flags is 0 to
6164	* turn it off (when there are holes in the receive
6165	* space for instance)
6166	* PSH flag is ignored.
6167	*/
6168
6169	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
6170	TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
6171	!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6172	int tcp_header_len = tp->tcp_header_len;
6173	s32 delta = `0`;
6174	int flag = `0`;
6175
6176	/ Timestamp header prediction: tcp_header_len*
6177	* is automatically equal to th->doff*4 due to pred_flags
6178	* match.
6179	*/
6180
6181	/ Check timestamp /
6182	if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
6183	/ No? Slow path! /
6184	if (!tcp_parse_aligned_timestamp(tp, th))
6185	goto slow_path;
6186
6187	delta = tp->rx_opt.rcv_tsval -
6188	tp->rx_opt.ts_recent;
6189	/ If PAWS failed, check it more carefully in slow path /
6190	if (delta < `0`)
6191	goto slow_path;
6192
6193	/ DO NOT update ts_recent here, if checksum fails*
6194	* and timestamp was corrupted part, it will result
6195	* in a hung connection since we will drop all
6196	* future packets due to the PAWS test.
6197	*/
6198	}
6199
6200	if (len <= tcp_header_len) {
6201	/ Bulk data transfer: sender /
6202	if (len == tcp_header_len) {
6203	/ Predicted packet is in window by definition.*
6204	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
6205	* Hence, check seq<=rcv_wup reduces to:
6206	*/
6207	if (tcp_header_len ==
6208	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
6209	tp->rcv_nxt == tp->rcv_wup)
6210	flag \|= __tcp_replace_ts_recent(tp,
6211	tstamp_delta: delta);
6212
6213	/ We know that such packets are checksummed*
6214	* on entry.
6215	*/
6216	tcp_ack(sk, skb, flag);
6217	__kfree_skb(skb);
6218	tcp_data_snd_check(sk);
6219	/ When receiving pure ack in fast path, update*
6220	* last ts ecr directly instead of calling
6221	* tcp_rcv_rtt_measure_ts()
6222	*/
6223	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
6224	return;
6225	} else { / Header too small /
6226	reason = SKB_DROP_REASON_PKT_TOO_SMALL;
6227	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
6228	goto discard;
6229	}
6230	} else {
6231	int eaten = `0`;
6232	bool fragstolen = false;
6233
6234	if (tcp_checksum_complete(skb))
6235	goto csum_error;
6236
6237	if ((int)skb->truesize > sk->sk_forward_alloc)
6238	goto step5;
6239
6240	/ Predicted packet is in window by definition.*
6241	* seq == rcv_nxt and rcv_wup <= rcv_nxt.
6242	* Hence, check seq<=rcv_wup reduces to:
6243	*/
6244	if (tcp_header_len ==
6245	(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
6246	tp->rcv_nxt == tp->rcv_wup)
6247	flag \|= __tcp_replace_ts_recent(tp,
6248	tstamp_delta: delta);
6249
6250	tcp_rcv_rtt_measure_ts(sk, skb);
6251
6252	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
6253
6254	/ Bulk data transfer: receiver /
6255	tcp_cleanup_skb(skb);
6256	__skb_pull(skb, len: tcp_header_len);
6257	eaten = tcp_queue_rcv(sk, skb, fragstolen: &fragstolen);
6258
6259	tcp_event_data_recv(sk, skb);
6260
6261	if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
6262	/ Well, only one small jumplet in fast path... /
6263	tcp_ack(sk, skb, flag: flag \| FLAG_DATA);
6264	tcp_data_snd_check(sk);
6265	if (!inet_csk_ack_scheduled(sk))
6266	goto no_ack;
6267	} else {
6268	tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
6269	}
6270
6271	__tcp_ack_snd_check(sk, ofo_possible: `0`);
6272	no_ack:
6273	if (eaten)
6274	kfree_skb_partial(skb, head_stolen: fragstolen);
6275	tcp_data_ready(sk);
6276	return;
6277	}
6278	}
6279
6280	slow_path:
6281	if (len < (th->doff << `2`) \|\| tcp_checksum_complete(skb))
6282	goto csum_error;
6283
6284	if (!th->ack && !th->rst && !th->syn) {
6285	reason = SKB_DROP_REASON_TCP_FLAGS;
6286	goto discard;
6287	}
6288
6289	/*
6290	* Standard slow path.
6291	*/
6292
6293	if (!tcp_validate_incoming(sk, skb, th, syn_inerr: `1`))
6294	return;
6295
6296	step5:
6297	reason = tcp_ack(sk, skb, FLAG_SLOWPATH \| FLAG_UPDATE_TS_RECENT);
6298	if ((int)reason < `0`) {
6299	reason = -reason;
6300	goto discard;
6301	}
6302	tcp_rcv_rtt_measure_ts(sk, skb);
6303
6304	/ Process urgent data. /
6305	tcp_urg(sk, skb, th);
6306
6307	/ step 7: process the segment text /
6308	tcp_data_queue(sk, skb);
6309
6310	tcp_data_snd_check(sk);
6311	tcp_ack_snd_check(sk);
6312	return;
6313
6314	csum_error:
6315	reason = SKB_DROP_REASON_TCP_CSUM;
6316	trace_tcp_bad_csum(skb);
6317	TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
6318	TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
6319
6320	discard:
6321	tcp_drop_reason(sk, skb, reason);
6322	}
6323	EXPORT_IPV6_MOD(tcp_rcv_established);
6324
6325	void tcp_init_transfer(struct sock sk, int* bpf_op, struct sk_buff *skb)
6326	{
6327	struct inet_connection_sock *icsk = inet_csk(sk);
6328	struct tcp_sock *tp = tcp_sk(sk);
6329
6330	tcp_mtup_init(sk);
6331	icsk->icsk_af_ops->rebuild_header(sk);
6332	tcp_init_metrics(sk);
6333
6334	/ Initialize the congestion window to start the transfer.*
6335	* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
6336	* retransmitted. In light of RFC6298 more aggressive 1sec
6337	* initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
6338	* retransmission has occurred.
6339	*/
6340	if (tp->total_retrans > `1` && tp->undo_marker)
6341	tcp_snd_cwnd_set(tp, val: `1`);
6342	else
6343	tcp_snd_cwnd_set(tp, val: tcp_init_cwnd(tp, dst: __sk_dst_get(sk)));
6344	tp->snd_cwnd_stamp = tcp_jiffies32;
6345
6346	bpf_skops_established(sk, bpf_op, skb);
6347	/ Initialize congestion control unless BPF initialized it already: /
6348	if (!icsk->icsk_ca_initialized)
6349	tcp_init_congestion_control(sk);
6350	tcp_init_buffer_space(sk);
6351	}
6352
6353	void tcp_finish_connect(struct sock sk, struct* sk_buff *skb)
6354	{
6355	struct tcp_sock *tp = tcp_sk(sk);
6356	struct inet_connection_sock *icsk = inet_csk(sk);
6357
6358	tcp_ao_finish_connect(sk, skb);
6359	tcp_set_state(sk, state: TCP_ESTABLISHED);
6360	icsk->icsk_ack.lrcvtime = tcp_jiffies32;
6361
6362	if (skb) {
6363	icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
6364	security_inet_conn_established(sk, skb);
6365	sk_mark_napi_id(sk, skb);
6366	}
6367
6368	tcp_init_transfer(sk, bpf_op: BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
6369
6370	/ Prevent spurious tcp_cwnd_restart() on first data*
6371	* packet.
6372	*/
6373	tp->lsndtime = tcp_jiffies32;
6374
6375	if (sock_flag(sk, flag: SOCK_KEEPOPEN))
6376	tcp_reset_keepalive_timer(sk, timeout: keepalive_time_when(tp));
6377
6378	if (!tp->rx_opt.snd_wscale)
6379	__tcp_fast_path_on(tp, snd_wnd: tp->snd_wnd);
6380	else
6381	tp->pred_flags = `0`;
6382	}
6383
6384	static bool tcp_rcv_fastopen_synack(struct sock sk, struct* sk_buff *synack,
6385	struct tcp_fastopen_cookie *cookie)
6386	{
6387	struct tcp_sock *tp = tcp_sk(sk);
6388	struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL;
6389	u16 mss = tp->rx_opt.mss_clamp, try_exp = `0`;
6390	bool syn_drop = false;
6391
6392	if (mss == tp->rx_opt.user_mss) {
6393	struct tcp_options_received opt;
6394
6395	/ Get original SYNACK MSS value if user MSS sets mss_clamp /
6396	tcp_clear_options(rx_opt: &opt);
6397	opt.user_mss = opt.mss_clamp = `0`;
6398	tcp_parse_options(sock_net(sk), synack, &opt, `0`, NULL);
6399	mss = opt.mss_clamp;
6400	}
6401
6402	if (!tp->syn_fastopen) {
6403	/ Ignore an unsolicited cookie /
6404	cookie->len = -`1`;
6405	} else if (tp->total_retrans) {
6406	/ SYN timed out and the SYN-ACK neither has a cookie nor*
6407	* acknowledges data. Presumably the remote received only
6408	* the retransmitted (regular) SYNs: either the original
6409	* SYN-data or the corresponding SYN-ACK was dropped.
6410	*/
6411	syn_drop = (cookie->len < `0` && data);
6412	} else if (cookie->len < `0` && !tp->syn_data) {
6413	/ We requested a cookie but didn't get it. If we did not use*
6414	* the (old) exp opt format then try so next time (try_exp=1).
6415	* Otherwise we go back to use the RFC7413 opt (try_exp=2).
6416	*/
6417	try_exp = tp->syn_fastopen_exp ? `2` : `1`;
6418	}
6419
6420	tcp_fastopen_cache_set(sk, mss, cookie, syn_lost: syn_drop, try_exp);
6421
6422	if (data) { / Retransmit unacked data in SYN /
6423	if (tp->total_retrans)
6424	tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
6425	else
6426	tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
6427	skb_rbtree_walk_from(data)
6428	tcp_mark_skb_lost(sk, skb: data);
6429	tcp_non_congestion_loss_retransmit(sk);
6430	NET_INC_STATS(sock_net(sk),
6431	LINUX_MIB_TCPFASTOPENACTIVEFAIL);
6432	return true;
6433	}
6434	tp->syn_data_acked = tp->syn_data;
6435	if (tp->syn_data_acked) {
6436	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
6437	/ SYN-data is counted as two separate packets in tcp_ack() /
6438	if (tp->delivered > `1`)
6439	--tp->delivered;
6440	}
6441
6442	tcp_fastopen_add_skb(sk, skb: synack);
6443
6444	return false;
6445	}
6446
6447	static void smc_check_reset_syn(struct tcp_sock *tp)
6448	{
6449	#if IS_ENABLED(CONFIG_SMC)
6450	if (static_branch_unlikely(&tcp_have_smc)) {
6451	if (tp->syn_smc && !tp->rx_opt.smc_ok)
6452	tp->syn_smc = `0`;
6453	}
6454	#endif
6455	}
6456
6457	static void tcp_try_undo_spurious_syn(struct sock *sk)
6458	{
6459	struct tcp_sock *tp = tcp_sk(sk);
6460	u32 syn_stamp;
6461
6462	/ undo_marker is set when SYN or SYNACK times out. The timeout is*
6463	* spurious if the ACK's timestamp option echo value matches the
6464	* original SYN timestamp.
6465	*/
6466	syn_stamp = tp->retrans_stamp;
6467	if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
6468	syn_stamp == tp->rx_opt.rcv_tsecr)
6469	tp->undo_marker = `0`;
6470	}
6471
6472	static int tcp_rcv_synsent_state_process(struct sock sk, struct* sk_buff *skb,
6473	const struct tcphdr *th)
6474	{
6475	struct inet_connection_sock *icsk = inet_csk(sk);
6476	struct tcp_sock *tp = tcp_sk(sk);
6477	struct tcp_fastopen_cookie foc = { .len = -`1` };
6478	int saved_clamp = tp->rx_opt.mss_clamp;
6479	bool fastopen_fail;
6480	SKB_DR(reason);
6481
6482	tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, `0`, &foc);
6483	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
6484	tp->rx_opt.rcv_tsecr -= tp->tsoffset;
6485
6486	if (th->ack) {
6487	/ rfc793:*
6488	* "If the state is SYN-SENT then
6489	* first check the ACK bit
6490	* If the ACK bit is set
6491	* If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
6492	* a reset (unless the RST bit is set, if so drop
6493	* the segment and return)"
6494	*/
6495	if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) \|\|
6496	after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
6497	/ Previous FIN/ACK or RST/ACK might be ignored. /
6498	if (icsk->icsk_retransmits == `0`)
6499	tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
6500	TCP_TIMEOUT_MIN, pace_delay: false);
6501	SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE);
6502	goto reset_and_undo;
6503	}
6504
6505	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
6506	!between(seq1: tp->rx_opt.rcv_tsecr, seq2: tp->retrans_stamp,
6507	seq3: tcp_time_stamp_ts(tp))) {
6508	NET_INC_STATS(sock_net(sk),
6509	LINUX_MIB_PAWSACTIVEREJECTED);
6510	SKB_DR_SET(reason, TCP_RFC7323_PAWS);
6511	goto reset_and_undo;
6512	}
6513
6514	/ Now ACK is acceptable.*
6515	*
6516	* "If the RST bit is set
6517	* If the ACK was acceptable then signal the user "error:
6518	* connection reset", drop the segment, enter CLOSED state,
6519	* delete TCB, and return."
6520	*/
6521
6522	if (th->rst) {
6523	tcp_reset(sk, skb);
6524	consume:
6525	__kfree_skb(skb);
6526	return `0`;
6527	}
6528
6529	/ rfc793:*
6530	* "fifth, if neither of the SYN or RST bits is set then
6531	* drop the segment and return."
6532	*
6533	* See note below!
6534	* --ANK(990513)
6535	*/
6536	if (!th->syn) {
6537	SKB_DR_SET(reason, TCP_FLAGS);
6538	goto discard_and_undo;
6539	}
6540	/ rfc793:*
6541	* "If the SYN bit is on ...
6542	* are acceptable then ...
6543	* (our SYN has been ACKed), change the connection
6544	* state to ESTABLISHED..."
6545	*/
6546
6547	tcp_ecn_rcv_synack(tp, th);
6548
6549	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6550	tcp_try_undo_spurious_syn(sk);
6551	tcp_ack(sk, skb, FLAG_SLOWPATH);
6552
6553	/ Ok.. it's good. Set up sequence numbers and*
6554	* move to established.
6555	*/
6556	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + `1`);
6557	tp->rcv_wup = TCP_SKB_CB(skb)->seq + `1`;
6558
6559	/ RFC1323: The window in SYN & SYN/ACK segments is*
6560	* never scaled.
6561	*/
6562	tp->snd_wnd = ntohs(th->window);
6563
6564	if (!tp->rx_opt.wscale_ok) {
6565	tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = `0`;
6566	WRITE_ONCE(tp->window_clamp,
6567	min(tp->window_clamp, `65535U`));
6568	}
6569
6570	if (tp->rx_opt.saw_tstamp) {
6571	tp->rx_opt.tstamp_ok = `1`;
6572	tp->tcp_header_len =
6573	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
6574	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6575	tcp_store_ts_recent(tp);
6576	} else {
6577	tp->tcp_header_len = sizeof(struct tcphdr);
6578	}
6579
6580	tcp_sync_mss(sk, pmtu: icsk->icsk_pmtu_cookie);
6581	tcp_initialize_rcv_mss(sk);
6582
6583	/ Remember, tcp_poll() does not lock socket!*
6584	* Change state from SYN-SENT only after copied_seq
6585	* is initialized. */
6586	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6587
6588	smc_check_reset_syn(tp);
6589
6590	smp_mb();
6591
6592	tcp_finish_connect(sk, skb);
6593
6594	fastopen_fail = (tp->syn_fastopen \|\| tp->syn_data) &&
6595	tcp_rcv_fastopen_synack(sk, synack: skb, cookie: &foc);
6596
6597	if (!sock_flag(sk, flag: SOCK_DEAD)) {
6598	sk->sk_state_change(sk);
6599	sk_wake_async(sk, how: SOCK_WAKE_IO, POLL_OUT);
6600	}
6601	if (fastopen_fail)
6602	return -`1`;
6603	if (sk->sk_write_pending \|\|
6604	READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) \|\|
6605	inet_csk_in_pingpong_mode(sk)) {
6606	/ Save one ACK. Data will be ready after*
6607	* several ticks, if write_pending is set.
6608	*
6609	* It may be deleted, but with this feature tcpdumps
6610	* look so _wonderfully_ clever, that I was not able
6611	* to stand against the temptation 8) --ANK
6612	*/
6613	inet_csk_schedule_ack(sk);
6614	tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
6615	tcp_reset_xmit_timer(sk, ICSK_TIME_DACK,
6616	TCP_DELACK_MAX, pace_delay: false);
6617	goto consume;
6618	}
6619	tcp_send_ack(sk);
6620	return -`1`;
6621	}
6622
6623	/ No ACK in the segment /
6624
6625	if (th->rst) {
6626	/ rfc793:*
6627	* "If the RST bit is set
6628	*
6629	* Otherwise (no ACK) drop the segment and return."
6630	*/
6631	SKB_DR_SET(reason, TCP_RESET);
6632	goto discard_and_undo;
6633	}
6634
6635	/ PAWS check. /
6636	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
6637	tcp_paws_reject(rx_opt: &tp->rx_opt, rst: `0`)) {
6638	SKB_DR_SET(reason, TCP_RFC7323_PAWS);
6639	goto discard_and_undo;
6640	}
6641	if (th->syn) {
6642	/ We see SYN without ACK. It is attempt of*
6643	* simultaneous connect with crossed SYNs.
6644	* Particularly, it can be connect to self.
6645	*/
6646	#ifdef CONFIG_TCP_AO
6647	struct tcp_ao_info *ao;
6648
6649	ao = rcu_dereference_protected(tp->ao_info,
6650	lockdep_sock_is_held(sk));
6651	if (ao) {
6652	WRITE_ONCE(ao->risn, th->seq);
6653	ao->rcv_sne = `0`;
6654	}
6655	#endif
6656	tcp_set_state(sk, state: TCP_SYN_RECV);
6657
6658	if (tp->rx_opt.saw_tstamp) {
6659	tp->rx_opt.tstamp_ok = `1`;
6660	tcp_store_ts_recent(tp);
6661	tp->tcp_header_len =
6662	sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
6663	} else {
6664	tp->tcp_header_len = sizeof(struct tcphdr);
6665	}
6666
6667	WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + `1`);
6668	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6669	tp->rcv_wup = TCP_SKB_CB(skb)->seq + `1`;
6670
6671	/ RFC1323: The window in SYN & SYN/ACK segments is*
6672	* never scaled.
6673	*/
6674	tp->snd_wnd = ntohs(th->window);
6675	tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
6676	tp->max_window = tp->snd_wnd;
6677
6678	tcp_ecn_rcv_syn(tp, th);
6679
6680	tcp_mtup_init(sk);
6681	tcp_sync_mss(sk, pmtu: icsk->icsk_pmtu_cookie);
6682	tcp_initialize_rcv_mss(sk);
6683
6684	tcp_send_synack(sk);
6685	#if 0
6686	/ Note, we could accept data and URG from this segment.*
6687	* There are no obstacles to make this (except that we must
6688	* either change tcp_recvmsg() to prevent it from returning data
6689	* before 3WHS completes per RFC793, or employ TCP Fast Open).
6690	*
6691	* However, if we ignore data in ACKless segments sometimes,
6692	* we have no reasons to accept it sometimes.
6693	* Also, seems the code doing it in step6 of tcp_rcv_state_process
6694	* is not flawless. So, discard packet for sanity.
6695	* Uncomment this return to process the data.
6696	*/
6697	return -`1`;
6698	#else
6699	goto consume;
6700	#endif
6701	}
6702	/ "fifth, if neither of the SYN or RST bits is set then*
6703	* drop the segment and return."
6704	*/
6705
6706	discard_and_undo:
6707	tcp_clear_options(rx_opt: &tp->rx_opt);
6708	tp->rx_opt.mss_clamp = saved_clamp;
6709	tcp_drop_reason(sk, skb, reason);
6710	return `0`;
6711
6712	reset_and_undo:
6713	tcp_clear_options(rx_opt: &tp->rx_opt);
6714	tp->rx_opt.mss_clamp = saved_clamp;
6715	/ we can reuse/return @reason to its caller to handle the exception /
6716	return reason;
6717	}
6718
6719	static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
6720	{
6721	struct tcp_sock *tp = tcp_sk(sk);
6722	struct request_sock *req;
6723
6724	/ If we are still handling the SYNACK RTO, see if timestamp ECR allows*
6725	* undo. If peer SACKs triggered fast recovery, we can't undo here.
6726	*/
6727	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
6728	tcp_try_undo_recovery(sk);
6729
6730	tcp_update_rto_time(tp);
6731	inet_csk(sk)->icsk_retransmits = `0`;
6732	/ In tcp_fastopen_synack_timer() on the first SYNACK RTO we set*
6733	* retrans_stamp but don't enter CA_Loss, so in case that happened we
6734	* need to zero retrans_stamp here to prevent spurious
6735	* retransmits_timed_out(). However, if the ACK of our SYNACK caused us
6736	* to enter CA_Recovery then we need to leave retrans_stamp as it was
6737	* set entering CA_Recovery, for correct retransmits_timed_out() and
6738	* undo behavior.
6739	*/
6740	tcp_retrans_stamp_cleanup(sk);
6741
6742	/ Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,*
6743	* we no longer need req so release it.
6744	*/
6745	req = rcu_dereference_protected(tp->fastopen_rsk,
6746	lockdep_sock_is_held(sk));
6747	reqsk_fastopen_remove(sk, req, reset: false);
6748
6749	/ Re-arm the timer because data may have been sent out.*
6750	* This is similar to the regular data transmission case
6751	* when new data has just been ack'ed.
6752	*
6753	* (TFO) - we could try to be more aggressive and
6754	* retransmitting any data sooner based on when they
6755	* are sent out.
6756	*/
6757	tcp_rearm_rto(sk);
6758	}
6759
6760	/*
6761	* This function implements the receiving procedure of RFC 793 for
6762	* all states except ESTABLISHED and TIME_WAIT.
6763	* It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
6764	* address independent.
6765	*/
6766
6767	enum skb_drop_reason
6768	tcp_rcv_state_process(struct sock sk, struct* sk_buff *skb)
6769	{
6770	struct tcp_sock *tp = tcp_sk(sk);
6771	struct inet_connection_sock *icsk = inet_csk(sk);
6772	const struct tcphdr *th = tcp_hdr(skb);
6773	struct request_sock *req;
6774	int queued = `0`;
6775	SKB_DR(reason);
6776
6777	switch (sk->sk_state) {
6778	case TCP_CLOSE:
6779	SKB_DR_SET(reason, TCP_CLOSE);
6780	goto discard;
6781
6782	case TCP_LISTEN:
6783	if (th->ack)
6784	return SKB_DROP_REASON_TCP_FLAGS;
6785
6786	if (th->rst) {
6787	SKB_DR_SET(reason, TCP_RESET);
6788	goto discard;
6789	}
6790	if (th->syn) {
6791	if (th->fin) {
6792	SKB_DR_SET(reason, TCP_FLAGS);
6793	goto discard;
6794	}
6795	/ It is possible that we process SYN packets from backlog,*
6796	* so we need to make sure to disable BH and RCU right there.
6797	*/
6798	rcu_read_lock();
6799	local_bh_disable();
6800	icsk->icsk_af_ops->conn_request(sk, skb);
6801	local_bh_enable();
6802	rcu_read_unlock();
6803
6804	consume_skb(skb);
6805	return `0`;
6806	}
6807	SKB_DR_SET(reason, TCP_FLAGS);
6808	goto discard;
6809
6810	case TCP_SYN_SENT:
6811	tp->rx_opt.saw_tstamp = `0`;
6812	tcp_mstamp_refresh(tp);
6813	queued = tcp_rcv_synsent_state_process(sk, skb, th);
6814	if (queued >= `0`)
6815	return queued;
6816
6817	/ Do step6 onward by hand. /
6818	tcp_urg(sk, skb, th);
6819	__kfree_skb(skb);
6820	tcp_data_snd_check(sk);
6821	return `0`;
6822	}
6823
6824	tcp_mstamp_refresh(tp);
6825	tp->rx_opt.saw_tstamp = `0`;
6826	req = rcu_dereference_protected(tp->fastopen_rsk,
6827	lockdep_sock_is_held(sk));
6828	if (req) {
6829	bool req_stolen;
6830
6831	WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
6832	sk->sk_state != TCP_FIN_WAIT1);
6833
6834	SKB_DR_SET(reason, TCP_FASTOPEN);
6835	if (!tcp_check_req(sk, skb, req, fastopen: true, lost_race: &req_stolen, drop_reason: &reason))
6836	goto discard;
6837	}
6838
6839	if (!th->ack && !th->rst && !th->syn) {
6840	SKB_DR_SET(reason, TCP_FLAGS);
6841	goto discard;
6842	}
6843	if (!tcp_validate_incoming(sk, skb, th, syn_inerr: `0`))
6844	return `0`;
6845
6846	/ step 5: check the ACK field /
6847	reason = tcp_ack(sk, skb, FLAG_SLOWPATH \|
6848	FLAG_UPDATE_TS_RECENT \|
6849	FLAG_NO_CHALLENGE_ACK);
6850
6851	if ((int)reason <= `0`) {
6852	if (sk->sk_state == TCP_SYN_RECV) {
6853	/ send one RST /
6854	if (!reason)
6855	return SKB_DROP_REASON_TCP_OLD_ACK;
6856	return -reason;
6857	}
6858	/ accept old ack during closing /
6859	if ((int)reason < `0`) {
6860	tcp_send_challenge_ack(sk);
6861	reason = -reason;
6862	goto discard;
6863	}
6864	}
6865	SKB_DR_SET(reason, NOT_SPECIFIED);
6866	switch (sk->sk_state) {
6867	case TCP_SYN_RECV:
6868	tp->delivered++; / SYN-ACK delivery isn't tracked in tcp_ack /
6869	if (!tp->srtt_us)
6870	tcp_synack_rtt_meas(sk, req);
6871
6872	if (tp->rx_opt.tstamp_ok)
6873	tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
6874
6875	if (req) {
6876	tcp_rcv_synrecv_state_fastopen(sk);
6877	} else {
6878	tcp_try_undo_spurious_syn(sk);
6879	tp->retrans_stamp = `0`;
6880	tcp_init_transfer(sk, bpf_op: BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
6881	skb);
6882	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
6883	}
6884	tcp_ao_established(sk);
6885	smp_mb();
6886	tcp_set_state(sk, state: TCP_ESTABLISHED);
6887	sk->sk_state_change(sk);
6888
6889	/ Note, that this wakeup is only for marginal crossed SYN case.*
6890	* Passively open sockets are not waked up, because
6891	* sk->sk_sleep == NULL and sk->sk_socket == NULL.
6892	*/
6893	if (sk->sk_socket)
6894	sk_wake_async(sk, how: SOCK_WAKE_IO, POLL_OUT);
6895
6896	tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
6897	tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
6898	tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
6899
6900	if (!inet_csk(sk)->icsk_ca_ops->cong_control)
6901	tcp_update_pacing_rate(sk);
6902
6903	/ Prevent spurious tcp_cwnd_restart() on first data packet /
6904	tp->lsndtime = tcp_jiffies32;
6905
6906	tcp_initialize_rcv_mss(sk);
6907	tcp_fast_path_on(tp);
6908	if (sk->sk_shutdown & SEND_SHUTDOWN)
6909	tcp_shutdown(sk, SEND_SHUTDOWN);
6910	break;
6911
6912	case TCP_FIN_WAIT1: {
6913	int tmo;
6914
6915	if (req)
6916	tcp_rcv_synrecv_state_fastopen(sk);
6917
6918	if (tp->snd_una != tp->write_seq)
6919	break;
6920
6921	tcp_set_state(sk, state: TCP_FIN_WAIT2);
6922	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown \| SEND_SHUTDOWN);
6923
6924	sk_dst_confirm(sk);
6925
6926	if (!sock_flag(sk, flag: SOCK_DEAD)) {
6927	/ Wake up lingering close() /
6928	sk->sk_state_change(sk);
6929	break;
6930	}
6931
6932	if (READ_ONCE(tp->linger2) < `0`) {
6933	tcp_done(sk);
6934	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6935	return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
6936	}
6937	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
6938	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
6939	/ Receive out of order FIN after close() /
6940	if (tp->syn_fastopen && th->fin)
6941	tcp_fastopen_active_disable(sk);
6942	tcp_done(sk);
6943	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
6944	return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
6945	}
6946
6947	tmo = tcp_fin_time(sk);
6948	if (tmo > TCP_TIMEWAIT_LEN) {
6949	tcp_reset_keepalive_timer(sk, timeout: tmo - TCP_TIMEWAIT_LEN);
6950	} else if (th->fin \|\| sock_owned_by_user(sk)) {
6951	/ Bad case. We could lose such FIN otherwise.*
6952	* It is not a big problem, but it looks confusing
6953	* and not so rare event. We still can lose it now,
6954	* if it spins in bh_lock_sock(), but it is really
6955	* marginal case.
6956	*/
6957	tcp_reset_keepalive_timer(sk, timeout: tmo);
6958	} else {
6959	tcp_time_wait(sk, state: TCP_FIN_WAIT2, timeo: tmo);
6960	goto consume;
6961	}
6962	break;
6963	}
6964
6965	case TCP_CLOSING:
6966	if (tp->snd_una == tp->write_seq) {
6967	tcp_time_wait(sk, state: TCP_TIME_WAIT, timeo: `0`);
6968	goto consume;
6969	}
6970	break;
6971
6972	case TCP_LAST_ACK:
6973	if (tp->snd_una == tp->write_seq) {
6974	tcp_update_metrics(sk);
6975	tcp_done(sk);
6976	goto consume;
6977	}
6978	break;
6979	}
6980
6981	/ step 6: check the URG bit /
6982	tcp_urg(sk, skb, th);
6983
6984	/ step 7: process the segment text /
6985	switch (sk->sk_state) {
6986	case TCP_CLOSE_WAIT:
6987	case TCP_CLOSING:
6988	case TCP_LAST_ACK:
6989	if (!before(TCP_SKB_CB(skb)->seq, seq2: tp->rcv_nxt)) {
6990	/ If a subflow has been reset, the packet should not*
6991	* continue to be processed, drop the packet.
6992	*/
6993	if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
6994	goto discard;
6995	break;
6996	}
6997	fallthrough;
6998	case TCP_FIN_WAIT1:
6999	case TCP_FIN_WAIT2:
7000	/ RFC 793 says to queue data in these states,*
7001	* RFC 1122 says we MUST send a reset.
7002	* BSD 4.4 also does reset.
7003	*/
7004	if (sk->sk_shutdown & RCV_SHUTDOWN) {
7005	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
7006	after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
7007	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
7008	tcp_reset(sk, skb);
7009	return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
7010	}
7011	}
7012	fallthrough;
7013	case TCP_ESTABLISHED:
7014	tcp_data_queue(sk, skb);
7015	queued = `1`;
7016	break;
7017	}
7018
7019	/ tcp_data could move socket to TIME-WAIT /
7020	if (sk->sk_state != TCP_CLOSE) {
7021	tcp_data_snd_check(sk);
7022	tcp_ack_snd_check(sk);
7023	}
7024
7025	if (!queued) {
7026	discard:
7027	tcp_drop_reason(sk, skb, reason);
7028	}
7029	return `0`;
7030
7031	consume:
7032	__kfree_skb(skb);
7033	return `0`;
7034	}
7035	EXPORT_IPV6_MOD(tcp_rcv_state_process);
7036
7037	static inline void pr_drop_req(struct request_sock req, __u16 port, int* family)
7038	{
7039	struct inet_request_sock *ireq = inet_rsk(sk: req);
7040
7041	if (family == AF_INET)
7042	net_dbg_ratelimited("drop open request from %pI4/%u\n",
7043	&ireq->ir_rmt_addr, port);
7044	#if IS_ENABLED(CONFIG_IPV6)
7045	else if (family == AF_INET6)
7046	net_dbg_ratelimited("drop open request from %pI6/%u\n",
7047	&ireq->ir_v6_rmt_addr, port);
7048	#endif
7049	}
7050
7051	/ RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set*
7052	*
7053	* If we receive a SYN packet with these bits set, it means a
7054	* network is playing bad games with TOS bits. In order to
7055	* avoid possible false congestion notifications, we disable
7056	* TCP ECN negotiation.
7057	*
7058	* Exception: tcp_ca wants ECN. This is required for DCTCP
7059	* congestion control: Linux DCTCP asserts ECT on all packets,
7060	* including SYN, which is most optimal solution; however,
7061	* others, such as FreeBSD do not.
7062	*
7063	* Exception: At least one of the reserved bits of the TCP header (th->res1) is
7064	* set, indicating the use of a future TCP extension (such as AccECN). See
7065	* RFC8311 §4.3 which updates RFC3168 to allow the development of such
7066	* extensions.
7067	*/
7068	static void tcp_ecn_create_request(struct request_sock *req,
7069	const struct sk_buff *skb,
7070	const struct sock *listen_sk,
7071	const struct dst_entry *dst)
7072	{
7073	const struct tcphdr *th = tcp_hdr(skb);
7074	const struct net *net = sock_net(sk: listen_sk);
7075	bool th_ecn = th->ece && th->cwr;
7076	bool ect, ecn_ok;
7077	u32 ecn_ok_dst;
7078
7079	if (!th_ecn)
7080	return;
7081
7082	ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
7083	ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
7084	ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) \|\| ecn_ok_dst;
7085
7086	if (((!ect \|\| th->res1) && ecn_ok) \|\| tcp_ca_needs_ecn(sk: listen_sk) \|\|
7087	(ecn_ok_dst & DST_FEATURE_ECN_CA) \|\|
7088	tcp_bpf_ca_needs_ecn(sk: (struct sock *)req))
7089	inet_rsk(sk: req)->ecn_ok = `1`;
7090	}
7091
7092	static void tcp_openreq_init(struct request_sock *req,
7093	const struct tcp_options_received *rx_opt,
7094	struct sk_buff skb, const* struct sock *sk)
7095	{
7096	struct inet_request_sock *ireq = inet_rsk(sk: req);
7097
7098	req->rsk_rcv_wnd = `0`; / So that tcp_send_synack() knows! /
7099	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
7100	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + `1`;
7101	tcp_rsk(req)->snt_synack = `0`;
7102	tcp_rsk(req)->snt_tsval_first = `0`;
7103	tcp_rsk(req)->last_oow_ack_time = `0`;
7104	req->mss = rx_opt->mss_clamp;
7105	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : `0`;
7106	ireq->tstamp_ok = rx_opt->tstamp_ok;
7107	ireq->sack_ok = rx_opt->sack_ok;
7108	ireq->snd_wscale = rx_opt->snd_wscale;
7109	ireq->wscale_ok = rx_opt->wscale_ok;
7110	ireq->acked = `0`;
7111	ireq->ecn_ok = `0`;
7112	ireq->ir_rmt_port = tcp_hdr(skb)->source;
7113	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
7114	ireq->ir_mark = inet_request_mark(sk, skb);
7115	#if IS_ENABLED(CONFIG_SMC)
7116	ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
7117	tcp_sk(sk)->smc_hs_congested(sk));
7118	#endif
7119	}
7120
7121	/*
7122	* Return true if a syncookie should be sent
7123	*/
7124	static bool tcp_syn_flood_action(struct sock sk, const* char *proto)
7125	{
7126	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
7127	const char *msg = "Dropping request";
7128	struct net *net = sock_net(sk);
7129	bool want_cookie = false;
7130	u8 syncookies;
7131
7132	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
7133
7134	#ifdef CONFIG_SYN_COOKIES
7135	if (syncookies) {
7136	msg = "Sending cookies";
7137	want_cookie = true;
7138	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
7139	} else
7140	#endif
7141	__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
7142
7143	if (!READ_ONCE(queue->synflood_warned) && syncookies != `2` &&
7144	xchg(&queue->synflood_warned, `1`) == `0`) {
7145	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
7146	net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
7147	proto, inet6_rcv_saddr(sk),
7148	sk->sk_num, msg);
7149	} else {
7150	net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n",
7151	proto, &sk->sk_rcv_saddr,
7152	sk->sk_num, msg);
7153	}
7154	}
7155
7156	return want_cookie;
7157	}
7158
7159	static void tcp_reqsk_record_syn(const struct sock *sk,
7160	struct request_sock *req,
7161	const struct sk_buff *skb)
7162	{
7163	if (tcp_sk(sk)->save_syn) {
7164	u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
7165	struct saved_syn *saved_syn;
7166	u32 mac_hdrlen;
7167	void *base;
7168
7169	if (tcp_sk(sk)->save_syn == `2`) { / Save full header. /
7170	base = skb_mac_header(skb);
7171	mac_hdrlen = skb_mac_header_len(skb);
7172	len += mac_hdrlen;
7173	} else {
7174	base = skb_network_header(skb);
7175	mac_hdrlen = `0`;
7176	}
7177
7178	saved_syn = kmalloc(struct_size(saved_syn, data, len),
7179	GFP_ATOMIC);
7180	if (saved_syn) {
7181	saved_syn->mac_hdrlen = mac_hdrlen;
7182	saved_syn->network_hdrlen = skb_network_header_len(skb);
7183	saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
7184	memcpy(saved_syn->data, base, len);
7185	req->saved_syn = saved_syn;
7186	}
7187	}
7188	}
7189
7190	/ If a SYN cookie is required and supported, returns a clamped MSS value to be*
7191	* used for SYN cookie generation.
7192	*/
7193	u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
7194	const struct tcp_request_sock_ops *af_ops,
7195	struct sock sk, struct* tcphdr *th)
7196	{
7197	struct tcp_sock *tp = tcp_sk(sk);
7198	u16 mss;
7199
7200	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != `2` &&
7201	!inet_csk_reqsk_queue_is_full(sk))
7202	return `0`;
7203
7204	if (!tcp_syn_flood_action(sk, proto: rsk_ops->slab_name))
7205	return `0`;
7206
7207	if (sk_acceptq_is_full(sk)) {
7208	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
7209	return `0`;
7210	}
7211
7212	mss = tcp_parse_mss_option(th, user_mss: tp->rx_opt.user_mss);
7213	if (!mss)
7214	mss = af_ops->mss_clamp;
7215
7216	return mss;
7217	}
7218	EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss);
7219
7220	int tcp_conn_request(struct request_sock_ops *rsk_ops,
7221	const struct tcp_request_sock_ops *af_ops,
7222	struct sock sk, struct* sk_buff *skb)
7223	{
7224	struct tcp_fastopen_cookie foc = { .len = -`1` };
7225	struct tcp_options_received tmp_opt;
7226	struct tcp_sock *tp = tcp_sk(sk);
7227	struct net *net = sock_net(sk);
7228	struct sock *fastopen_sk = NULL;
7229	struct request_sock *req;
7230	bool want_cookie = false;
7231	struct dst_entry *dst;
7232	struct flowi fl;
7233	u8 syncookies;
7234	u32 isn;
7235
7236	#ifdef CONFIG_TCP_AO
7237	const struct tcp_ao_hdr *aoh;
7238	#endif
7239
7240	isn = __this_cpu_read(tcp_tw_isn);
7241	if (isn) {
7242	/ TW buckets are converted to open requests without*
7243	* limitations, they conserve resources and peer is
7244	* evidently real one.
7245	*/
7246	__this_cpu_write(tcp_tw_isn, `0`);
7247	} else {
7248	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
7249
7250	if (syncookies == `2` \|\| inet_csk_reqsk_queue_is_full(sk)) {
7251	want_cookie = tcp_syn_flood_action(sk,
7252	proto: rsk_ops->slab_name);
7253	if (!want_cookie)
7254	goto drop;
7255	}
7256	}
7257
7258	if (sk_acceptq_is_full(sk)) {
7259	NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
7260	goto drop;
7261	}
7262
7263	req = inet_reqsk_alloc(ops: rsk_ops, sk_listener: sk, attach_listener: !want_cookie);
7264	if (!req)
7265	goto drop;
7266
7267	req->syncookie = want_cookie;
7268	tcp_rsk(req)->af_specific = af_ops;
7269	tcp_rsk(req)->ts_off = `0`;
7270	tcp_rsk(req)->req_usec_ts = false;
7271	#if IS_ENABLED(CONFIG_MPTCP)
7272	tcp_rsk(req)->is_mptcp = `0`;
7273	#endif
7274
7275	tcp_clear_options(rx_opt: &tmp_opt);
7276	tmp_opt.mss_clamp = af_ops->mss_clamp;
7277	tmp_opt.user_mss = tp->rx_opt.user_mss;
7278	tcp_parse_options(sock_net(sk), skb, &tmp_opt, `0`,
7279	want_cookie ? NULL : &foc);
7280
7281	if (want_cookie && !tmp_opt.saw_tstamp)
7282	tcp_clear_options(rx_opt: &tmp_opt);
7283
7284	if (IS_ENABLED(CONFIG_SMC) && want_cookie)
7285	tmp_opt.smc_ok = `0`;
7286
7287	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
7288	tcp_openreq_init(req, rx_opt: &tmp_opt, skb, sk);
7289	inet_rsk(sk: req)->no_srccheck = inet_test_bit(TRANSPARENT, sk);
7290
7291	/ Note: tcp_v6_init_req() might override ir_iif for link locals /
7292	inet_rsk(sk: req)->ir_iif = inet_request_bound_dev_if(sk, skb);
7293
7294	dst = af_ops->route_req(sk, skb, &fl, req, isn);
7295	if (!dst)
7296	goto drop_and_free;
7297
7298	if (tmp_opt.tstamp_ok) {
7299	tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
7300	tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
7301	}
7302	if (!want_cookie && !isn) {
7303	int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
7304
7305	/ Kill the following clause, if you dislike this way. /
7306	if (!syncookies &&
7307	(max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
7308	(max_syn_backlog >> `2`)) &&
7309	!tcp_peer_is_proven(req, dst)) {
7310	/ Without syncookies last quarter of*
7311	* backlog is filled with destinations,
7312	* proven to be alive.
7313	* It means that we continue to communicate
7314	* to destinations, already remembered
7315	* to the moment of synflood.
7316	*/
7317	pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
7318	family: rsk_ops->family);
7319	goto drop_and_release;
7320	}
7321
7322	isn = af_ops->init_seq(skb);
7323	}
7324
7325	tcp_ecn_create_request(req, skb, listen_sk: sk, dst);
7326
7327	if (want_cookie) {
7328	isn = cookie_init_sequence(ops: af_ops, sk, skb, mss: &req->mss);
7329	if (!tmp_opt.tstamp_ok)
7330	inet_rsk(sk: req)->ecn_ok = `0`;
7331	}
7332
7333	#ifdef CONFIG_TCP_AO
7334	if (tcp_parse_auth_options(th: tcp_hdr(skb), NULL, aoh: &aoh))
7335	goto drop_and_release; / Invalid TCP options /
7336	if (aoh) {
7337	tcp_rsk(req)->used_tcp_ao = true;
7338	tcp_rsk(req)->ao_rcv_next = aoh->keyid;
7339	tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
7340
7341	} else {
7342	tcp_rsk(req)->used_tcp_ao = false;
7343	}
7344	#endif
7345	tcp_rsk(req)->snt_isn = isn;
7346	tcp_rsk(req)->txhash = net_tx_rndhash();
7347	tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
7348	tcp_openreq_init_rwin(req, sk_listener: sk, dst);
7349	sk_rx_queue_set(sk: req_to_sk(req), skb);
7350	if (!want_cookie) {
7351	tcp_reqsk_record_syn(sk, req, skb);
7352	fastopen_sk = tcp_try_fastopen(sk, skb, req, foc: &foc, dst);
7353	}
7354	if (fastopen_sk) {
7355	af_ops->send_synack(fastopen_sk, dst, &fl, req,
7356	&foc, TCP_SYNACK_FASTOPEN, skb);
7357	/ Add the child socket directly into the accept queue /
7358	if (!inet_csk_reqsk_queue_add(sk, req, child: fastopen_sk)) {
7359	reqsk_fastopen_remove(sk: fastopen_sk, req, reset: false);
7360	bh_unlock_sock(fastopen_sk);
7361	sock_put(sk: fastopen_sk);
7362	goto drop_and_free;
7363	}
7364	sk->sk_data_ready(sk);
7365	bh_unlock_sock(fastopen_sk);
7366	sock_put(sk: fastopen_sk);
7367	} else {
7368	tcp_rsk(req)->tfo_listener = false;
7369	if (!want_cookie) {
7370	req->timeout = tcp_timeout_init(sk: (struct sock *)req);
7371	if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req,
7372	req->timeout))) {
7373	reqsk_free(req);
7374	dst_release(dst);
7375	return `0`;
7376	}
7377
7378	}
7379	af_ops->send_synack(sk, dst, &fl, req, &foc,
7380	!want_cookie ? TCP_SYNACK_NORMAL :
7381	TCP_SYNACK_COOKIE,
7382	skb);
7383	if (want_cookie) {
7384	reqsk_free(req);
7385	return `0`;
7386	}
7387	}
7388	reqsk_put(req);
7389	return `0`;
7390
7391	drop_and_release:
7392	dst_release(dst);
7393	drop_and_free:
7394	__reqsk_free(req);
7395	drop:
7396	tcp_listendrop(sk);
7397	return `0`;
7398	}
7399	EXPORT_IPV6_MOD(tcp_conn_request);
7400

Provided by KDAB

Definitions

sysctl_tcp_max_orphans
clean_acked_data_enabled
clean_acked_data_enable
clean_acked_data_disable
clean_acked_data_flush
bpf_skops_parse_hdr
bpf_skops_established
tcp_gro_dev_warn
tcp_measure_rcv_mss
tcp_incr_quickack
tcp_enter_quickack_mode
tcp_in_quickack_mode
tcp_ecn_queue_cwr
tcp_ecn_accept_cwr
tcp_ecn_withdraw_cwr
tcp_data_ecn_check
tcp_ecn_rcv_synack
tcp_ecn_rcv_syn
tcp_ecn_rcv_ecn_echo
tcp_count_delivered_ce
tcp_count_delivered
tcp_sndbuf_expand
__tcp_grow_window
truesize_adjust
tcp_grow_window
tcp_init_buffer_space
tcp_clamp_window
tcp_initialize_rcv_mss
tcp_rcv_rtt_update
tcp_rcv_rtt_measure
tcp_rtt_tsopt_us
tcp_rcv_rtt_measure_ts
tcp_rcvbuf_grow
tcp_rcv_space_adjust
tcp_save_lrcv_flowlabel
tcp_event_data_recv
tcp_rtt_estimator
tcp_update_pacing_rate
tcp_set_rto
tcp_init_cwnd
tcp_sacktag_state
tcp_dsack_seen
tcp_check_sack_reordering
tcp_verify_retransmit_hint
tcp_notify_skb_loss_event
tcp_mark_skb_lost
tcp_is_sackblock_valid
tcp_check_dsack
tcp_match_skb_to_sack
tcp_sacktag_one
tcp_shifted_skb
tcp_skb_seglen
skb_can_shift
tcp_skb_shift
tcp_shift_skb_data
tcp_sacktag_walk
tcp_sacktag_bsearch
tcp_sacktag_skip
tcp_maybe_skipping_dsack
tcp_sack_cache_ok
tcp_sacktag_write_queue
tcp_limit_reno_sacked
tcp_check_reno_reordering
tcp_add_reno_sack
tcp_remove_reno_sacks
tcp_reset_reno_sack
tcp_clear_retrans
tcp_init_undo
tcp_is_rack
tcp_timeout_mark_lost
tcp_enter_loss
tcp_check_sack_reneging
tcp_dupack_heuristics
tcp_time_to_recover
tcp_mark_head_lost
tcp_update_scoreboard
tcp_tsopt_ecr_before
tcp_skb_spurious_retrans
tcp_packet_delayed
tcp_any_retrans_done
tcp_retrans_stamp_cleanup
DBGUNDO
tcp_undo_cwnd_reduction
tcp_may_undo
tcp_is_non_sack_preventing_reopen
tcp_try_undo_recovery
tcp_try_undo_dsack
tcp_try_undo_loss
tcp_init_cwnd_reduction
tcp_cwnd_reduction
tcp_end_cwnd_reduction
tcp_enter_cwr
tcp_try_keep_open
tcp_try_to_open
tcp_mtup_probe_failed
tcp_mtup_probe_success
tcp_non_congestion_loss_retransmit
tcp_simple_retransmit
tcp_enter_recovery
tcp_update_rto_time
tcp_process_loss
tcp_force_fast_retransmit
tcp_try_undo_partial
tcp_identify_packet_loss
tcp_fastretrans_alert
tcp_update_rtt_min
tcp_ack_update_rtt
tcp_synack_rtt_meas
tcp_cong_avoid
tcp_rearm_rto
tcp_set_xmit_timer
tcp_tso_acked
tcp_ack_tstamp
tcp_clean_rtx_queue
tcp_ack_probe
tcp_ack_is_dubious
tcp_may_raise_cwnd
tcp_cong_control
tcp_may_update_window
tcp_snd_sne_update
tcp_snd_una_update
tcp_rcv_sne_update
tcp_rcv_nxt_update
tcp_ack_update_window
__tcp_oow_rate_limited
tcp_oow_rate_limited
tcp_send_challenge_ack
tcp_store_ts_recent
__tcp_replace_ts_recent
tcp_replace_ts_recent
tcp_process_tlp_ack
tcp_in_ack_event
tcp_xmit_recovery
tcp_newly_delivered
tcp_ack
tcp_parse_fastopen_option
smc_parse_options
tcp_parse_mss_option
tcp_parse_options
tcp_parse_aligned_timestamp
tcp_fast_parse_options
tcp_do_parse_auth_options
tcp_tsval_replay
tcp_disordered_ack_check
tcp_sequence
tcp_done_with_error
tcp_reset
tcp_fin
tcp_sack_extend
tcp_dsack_set
tcp_dsack_extend
tcp_rcv_spurious_retrans
tcp_send_dupack
tcp_sack_maybe_coalesce
tcp_sack_compress_send_ack
tcp_sack_new_ofo_skb
tcp_sack_remove
tcp_try_coalesce
tcp_ooo_try_coalesce
tcp_drop_reason
tcp_ofo_queue
tcp_try_rmem_schedule
tcp_data_queue_ofo
tcp_queue_rcv
tcp_send_rcvq
tcp_data_ready
tcp_data_queue
tcp_skb_next
tcp_collapse_one
tcp_rbtree_insert
tcp_collapse
tcp_collapse_ofo_queue
tcp_prune_ofo_queue
tcp_prune_queue
tcp_should_expand_sndbuf
tcp_new_space
tcp_check_space
tcp_data_snd_check
__tcp_ack_snd_check
tcp_ack_snd_check
tcp_check_urg
tcp_urg
tcp_reset_check
tcp_validate_incoming
tcp_rcv_established
tcp_init_transfer
tcp_finish_connect
tcp_rcv_fastopen_synack
smc_check_reset_syn
tcp_try_undo_spurious_syn
tcp_rcv_synsent_state_process
tcp_rcv_synrecv_state_fastopen
tcp_rcv_state_process
pr_drop_req
tcp_ecn_create_request
tcp_openreq_init
tcp_syn_flood_action
tcp_reqsk_record_syn
tcp_get_syncookie_mss

Improve your Profiling and Debugging skills

Find out more

Definitions

source code of linux/net/ipv4/tcp_input.c