// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					request_sock handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/bottom_half.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/sched.h>

#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
#include <linux/btf_ids.h>
#include <linux/skbuff_ref.h>

#include <crypto/hash.h>
#include <linux/scatterlist.h>

#include <trace/events/tcp.h>

#ifdef CONFIG_TCP_MD5SIG
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif

struct inet_hashinfo tcp_hashinfo;

static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static DEFINE_MUTEX(tcp_exit_batch_mutex);

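/* Pick the initial sequence number for a new connection from a keyed hash
 * of the 4-tuple found in the incoming SYN (addresses and ports swapped,
 * since the ISN is for our reply direction).
 */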
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr,
			      ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest,
			      tcp_hdr(skb)->source);
}

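/* Timestamp offset for this flow, derived by secure_tcp_ts_off() from a
 * keyed hash of the address pair so raw clock values are not exposed.
 */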
static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
{
	return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}

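/* Decide whether a connect() may reuse a 4-tuple that is still held by a
 * TIME-WAIT socket.  Returns 1 (after taking a reference on the timewait
 * socket) when reuse is safe, 0 otherwise.
 */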
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
	int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
	const struct inet_timewait_sock *tw = inet_twsk(sktw);
	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
	struct tcp_sock *tp = tcp_sk(sk);
	int ts_recent_stamp;
	u32 reuse_thresh;

	if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
		reuse = 0;

	if (reuse == 2) {
		/* Still does not detect *everything* that goes through
		 * lo, since we require a loopback src or dst address
		 * or direct binding to 'lo' interface.
		 */
		bool loopback = false;

		if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
			loopback = true;
#if IS_ENABLED(CONFIG_IPV6)
		if (tw->tw_family == AF_INET6) {
			if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
			    ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
			    ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
				loopback = true;
		} else
#endif
		{
			if (ipv4_is_loopback(tw->tw_daddr) ||
			    ipv4_is_loopback(tw->tw_rcv_saddr))
				loopback = true;
		}
		if (!loopback)
			reuse = 0;
	}

	/* With PAWS, it is safe from the viewpoint
	   of data integrity. Even without PAWS it is safe provided sequence
	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.

	   Actually, the idea is close to VJ's one, only timestamp cache is
	   held not per host, but per port pair and TW bucket is used as state
	   holder.

	   If TW bucket has been already destroyed we fall back to VJ's scheme
	   and use initial timestamp retrieved from peer table.
	 */
	ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
	reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
		       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
	if (ts_recent_stamp &&
	    (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
		/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
		 * and releasing the bucket lock.
		 */
		if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
			return 0;

		/* In case of repair and re-using TIME-WAIT sockets we still
		 * want to be sure that it is safe as above but honor the
		 * sequence numbers and time stamps set as part of the repair
		 * process.
		 *
		 * Without this check re-using a TIME-WAIT socket with TCP
		 * repair would accumulate a -1 on the repair assigned
		 * sequence number. The first time it is reused the sequence
		 * is -1, the second time -2, etc. This fixes that issue
		 * without appearing to create any others.
		 */
		if (likely(!tp->repair)) {
			u32 seq = tcptw->tw_snd_nxt + 65535 + 2;

			if (!seq)
				seq = 1;
			WRITE_ONCE(tp->write_seq, seq);
			tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
			tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
		}

		return 1;
	}

	return 0;
}
EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);

static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
			      int addr_len)
{
	/* This check is replicated from tcp_v4_connect() and intended to
	 * prevent BPF program called below from accessing bytes that are out
	 * of the bound specified by user in addr_len.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	sock_owned_by_me(sk);

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}

/* This will initiate an outgoing connection. */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_timewait_death_row *tcp_death_row;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct ip_options_rcu *inet_opt;
	struct net *net = sock_net(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
			      orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;

	if (!inet->inet_saddr) {
		err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
		if (err) {
			ip_rt_put(rt);
			return err;
		}
	} else {
		sk_rcv_saddr_set(sk, inet->inet_saddr);
	}

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			WRITE_ONCE(tp->write_seq, 0);
	}

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and, without releasing the socket
	 * lock, select a source port, enter ourselves into the hash tables
	 * and complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
	/* OK, now commit destination to socket. */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);
	rt = NULL;

	if (likely(!tp->repair)) {
		if (!tp->write_seq)
			WRITE_ONCE(tp->write_seq,
				   secure_tcp_seq(inet->inet_saddr,
						  inet->inet_daddr,
						  inet->inet_sport,
						  usin->sin_port));
		WRITE_ONCE(tp->tsoffset,
			   secure_tcp_ts_off(net, inet->inet_saddr,
					     inet->inet_daddr));
	}

	atomic_set(&inet->inet_id, get_random_u16());

	if (tcp_fastopen_defer_connect(sk, &err))
		return err;
	if (err)
		goto failure;

	err = tcp_connect(sk);

	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	inet_bhash2_reset_saddr(sk);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}
EXPORT_IPV6_MOD(tcp_v4_connect);

/*
 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 * It can be called through tcp_release_cb() if socket was owned by user
 * at the time tcp_v4_err() was called to handle ICMP message.
 */
void tcp_v4_mtu_reduced(struct sock *sk)
{
	struct inet_sock *inet = inet_sk(sk);
	struct dst_entry *dst;
	u32 mtu;

	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
		return;
	mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
	dst = inet_csk_update_pmtu(sk, mtu);
	if (!dst)
		return;

	/* Something is about to be wrong... Remember soft error
	 * in case this connection is not able to recover.
	 */
	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
		WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);

	mtu = dst_mtu(dst);

	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
	    ip_sk_accept_pmtu(sk) &&
	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
		tcp_sync_mss(sk, mtu);

		/* Resend the TCP packet because it's
		 * clear that the old packet has been
		 * dropped. This is the new "fast" path mtu
		 * discovery.
		 */
		tcp_simple_retransmit(sk);
	} /* else let the usual retransmit timer handle it */
}
EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);

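/* Have the routing layer update the socket's cached route in response to
 * an ICMP redirect, provided the cached dst is still valid.
 */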
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct dst_entry *dst = __sk_dst_check(sk, 0);

	if (dst)
		dst->ops->redirect(dst, sk, skb);
}


/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
	struct request_sock *req = inet_reqsk(sk);
	struct net *net = sock_net(sk);

	/* ICMPs are not backlogged, hence we cannot get
	 * an established socket here.
	 */
	if (seq != tcp_rsk(req)->snt_isn) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
	} else if (abort) {
		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(req->rsk_listener, req);
		tcp_listendrop(req->rsk_listener);
	}
	reqsk_put(req);
}
EXPORT_IPV6_MOD(tcp_req_err);

/* TCP-LD (RFC 6069) logic */
void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	s32 remaining;
	u32 delta_us;

	if (sock_owned_by_user(sk))
		return;

	if (seq != tp->snd_una || !icsk->icsk_retransmits ||
	    !icsk->icsk_backoff)
		return;

	skb = tcp_rtx_queue_head(sk);
	if (WARN_ON_ONCE(!skb))
		return;

	icsk->icsk_backoff--;
	icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
	icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk));

	tcp_mstamp_refresh(tp);
	delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
	remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);

	if (remaining > 0) {
		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
	} else {
		/* RTO revert clocked out retransmission.
		 * Will retransmit now.
		 */
		tcp_retransmit_timer(sk);
	}
}
EXPORT_IPV6_MOD(tcp_ld_RTO_revert);

/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.  After adjustment
 * header points to the first 8 bytes of the tcp header.  We need
 * to find the appropriate port.
 *
 * The locking strategy used here is very "optimistic". When
 * someone else accesses the socket the ICMP is just dropped
 * and for some paths there is no check at all.
 * A more general error queue to queue errors for later handling
 * is probably better.
 *
 */

int tcp_v4_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
	struct net *net = dev_net_rcu(skb->dev);
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct request_sock *fastopen;
	struct tcp_sock *tp;
	u32 seq, snd_una;
	struct sock *sk;
	int err;

	sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
				       iph->daddr, th->dest, iph->saddr,
				       ntohs(th->source), inet_iif(skb), 0);
	if (!sk) {
		__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
		return -ENOENT;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		/* To increase the counter of ignored icmps for TCP-AO */
		tcp_ao_ignore_icmp(sk, AF_INET, type, code);
		inet_twsk_put(inet_twsk(sk));
		return 0;
	}
	seq = ntohl(th->seq);
	if (sk->sk_state == TCP_NEW_SYN_RECV) {
		tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
				     type == ICMP_TIME_EXCEEDED ||
				     (type == ICMP_DEST_UNREACH &&
				      (code == ICMP_NET_UNREACH ||
				       code == ICMP_HOST_UNREACH)));
		return 0;
	}

	if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
		sock_put(sk);
		return 0;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 * We do take care of PMTU discovery (RFC1191) special case :
	 * we can receive locally generated ICMP messages while socket is held.
	 */
	if (sock_owned_by_user(sk)) {
		if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
			__NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
	}
	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (static_branch_unlikely(&ip4_min_ttl)) {
		/* min_ttl can be changed concurrently from do_ip_setsockopt() */
		if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
			__NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
			goto out;
		}
	}

	tp = tcp_sk(sk);
	/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
	fastopen = rcu_dereference(tp->fastopen_rsk);
	snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, snd_una, tp->snd_nxt)) {
		__NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_REDIRECT:
		if (!sock_owned_by_user(sk))
			do_redirect(skb, sk);
		goto out;
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			/* We are not interested in TCP_LISTEN and open_requests
			 * (SYN-ACKs sent out by Linux are always < 576 bytes so
			 * they should go through unfragmented).
			 */
			if (sk->sk_state == TCP_LISTEN)
				goto out;

			WRITE_ONCE(tp->mtu_info, info);
			if (!sock_owned_by_user(sk)) {
				tcp_v4_mtu_reduced(sk);
			} else {
				if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
					sock_hold(sk);
			}
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if this ICMP message allows revert of backoff.
		 * (see RFC 6069)
		 */
		if (!fastopen &&
		    (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
			tcp_ld_RTO_revert(sk, seq);
		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
		/* Only in fast or simultaneous open. If a fast open socket is
		 * already accepted it is treated as a connected one below.
		 */
		if (fastopen && !fastopen->sk)
			break;

		ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);

		if (!sock_owned_by_user(sk))
			tcp_done_with_error(sk, err);
		else
			WRITE_ONCE(sk->sk_err_soft, err);
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even these two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 * --ANK (980905)
	 */

	if (!sock_owned_by_user(sk) &&
	    inet_test_bit(RECVERR, sk)) {
		WRITE_ONCE(sk->sk_err, err);
		sk_error_report(sk);
	} else {	/* Only an error on timeout */
		WRITE_ONCE(sk->sk_err_soft, err);
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
	return 0;
}

void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = offsetof(struct tcphdr, check);
}

/* This routine computes an IPv4 TCP checksum. */
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
	const struct inet_sock *inet = inet_sk(sk);

	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
EXPORT_IPV6_MOD(tcp_v4_send_check);

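/* Maximum number of 32-bit words of TCP options a locally built reply
 * (RST/ACK) may carry.
 */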
#define REPLY_OPTIONS_LEN	(MAX_TCP_OPTION_SPACE / sizeof(__be32))

static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
				 const struct tcp_ao_hdr *aoh,
				 struct ip_reply_arg *arg, struct tcphdr *reply,
				 __be32 reply_options[REPLY_OPTIONS_LEN])
{
#ifdef CONFIG_TCP_AO
	int sdif = tcp_v4_sdif(skb);
	int dif = inet_iif(skb);
	int l3index = sdif ? dif : 0;
	bool allocated_traffic_key;
	struct tcp_ao_key *key;
	char *traffic_key;
	bool drop = true;
	u32 ao_sne = 0;
	u8 keyid;

	rcu_read_lock();
	if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
				 &key, &traffic_key, &allocated_traffic_key,
				 &keyid, &ao_sne))
		goto out;

	reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) |
				 (aoh->rnext_keyid << 8) | keyid);
	arg->iov[0].iov_len += tcp_ao_len_aligned(key);
	reply->doff = arg->iov[0].iov_len / 4;

	if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1],
			    key, traffic_key,
			    (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
			    (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
			    reply, ao_sne))
		goto out;
	drop = false;
out:
	rcu_read_unlock();
	if (allocated_traffic_key)
		kfree(traffic_key);
	return drop;
#else
	return true;
#endif
}

/*
 * This routine will send an RST to the other tcp.
 *
 * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 *		 for reset.
 * Answer: if a packet caused RST, it is not for a socket
 *	   existing in our system, if it is matched to a socket,
 *	   it is just a duplicate segment or a bug in the other side's TCP.
 *	   So we build the reply based only on the parameters
 *	   that arrived with the segment.
 * Exception: precedence violation. We do not implement it in any case.
 */

static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
			      enum sk_rst_reason reason)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[REPLY_OPTIONS_LEN];
	} rep;
	const __u8 *md5_hash_location = NULL;
	const struct tcp_ao_hdr *aoh;
	struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
	struct tcp_md5sig_key *key = NULL;
	unsigned char newhash[16];
	struct sock *sk1 = NULL;
	int genhash;
#endif
	u64 transmit_time = 0;
	struct sock *ctl_sk;
	struct net *net;
	u32 txhash = 0;

	/* Never send a reset in response to a reset. */
	if (th->rst)
		return;

	/* If sk not NULL, it means we did a successful lookup and incoming
	 * route had to be correct. prequeue might have dropped our dst.
	 */
	if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
		return;

	/* Swap the send and the receive. */
	memset(&rep, 0, sizeof(rep));
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = sizeof(struct tcphdr) / 4;
	rep.th.rst = 1;

	if (th->ack) {
		rep.th.seq = th->ack_seq;
	} else {
		rep.th.ack = 1;
		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
				       skb->len - (th->doff << 2));
	}

	memset(&arg, 0, sizeof(arg));
	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);

	net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev);

	/* Invalid TCP option size or twice included auth */
	if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
		return;

	if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
		return;

#ifdef CONFIG_TCP_MD5SIG
	rcu_read_lock();
	if (sk && sk_fullsock(sk)) {
		const union tcp_md5_addr *addr;
		int l3index;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and inet_iif is set to it.
		 */
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
	} else if (md5_hash_location) {
		const union tcp_md5_addr *addr;
		int sdif = tcp_v4_sdif(skb);
		int dif = inet_iif(skb);
		int l3index;

		/*
		 * The active side is lost. Try to find the listening socket
		 * through the source port, and then find the md5 key through
		 * the listening socket. We do not lose security here:
		 * the incoming packet is checked against the md5 hash of the
		 * key we find, and no RST is generated if the hash does not
		 * match.
		 */
		sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
					     NULL, 0, ip_hdr(skb)->saddr,
					     th->source, ip_hdr(skb)->daddr,
					     ntohs(th->source), dif, sdif);
		/* don't send rst if it can't find key */
		if (!sk1)
			goto out;

		/* sdif set, means packet ingressed via a device
		 * in an L3 domain and dif is set to it.
		 */
		l3index = sdif ? dif : 0;
		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
		if (!key)
			goto out;

		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
		if (genhash || memcmp(md5_hash_location, newhash, 16) != 0)
			goto out;

	}

	if (key) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
				   (TCPOPT_NOP << 16) |
				   (TCPOPT_MD5SIG << 8) |
				   TCPOLEN_MD5SIG);
		/* Update length and the length the header thinks exists */
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
				    key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
	/* Can't co-exist with TCPMD5, hence check rep.opt[0] */
	if (rep.opt[0] == 0) {
		__be32 mrst = mptcp_reset_option(skb);

		if (mrst) {
			rep.opt[0] = mrst;
			arg.iov[0].iov_len += sizeof(mrst);
			rep.th.doff = arg.iov[0].iov_len / 4;
		}
	}

	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;

	/* When socket is gone, all binding information is lost.
	 * routing might fail in this case. No choice here, if we choose to force
	 * input interface, we will misroute in case of asymmetric route.
	 */
	if (sk)
		arg.bound_dev_if = sk->sk_bound_dev_if;

	trace_tcp_send_reset(sk, skb, reason);

	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));

	/* ECN bits of TW reset are cleared */
	arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
	arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);

	sock_net_set(ctl_sk, net);
	if (sk) {
		ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
		ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
				   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
		transmit_time = tcp_transmit_time(sk);
		xfrm_sk_clone_policy(ctl_sk, sk);
		txhash = (sk->sk_state == TCP_TIME_WAIT) ?
			 inet_twsk(sk)->tw_txhash : sk->sk_txhash;
	} else {
		ctl_sk->sk_mark = 0;
		ctl_sk->sk_priority = 0;
	}
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	xfrm_sk_free_policy(ctl_sk);
	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();

#ifdef CONFIG_TCP_MD5SIG
out:
	rcu_read_unlock();
#endif
}

/* Sending ACKs in SYN-RECV and TIME-WAIT states outside socket context,
 * as the code below does, is certainly ugly. What can I do?
 */

static void tcp_v4_send_ack(const struct sock *sk,
			    struct sk_buff *skb, u32 seq, u32 ack,
			    u32 win, u32 tsval, u32 tsecr, int oif,
			    struct tcp_key *key,
			    int reply_flags, u8 tos, u32 txhash)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct {
		struct tcphdr th;
		__be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
	} rep;
	struct net *net = sock_net(sk);
	struct ip_reply_arg arg;
	struct sock *ctl_sk;
	u64 transmit_time;

	memset(&rep.th, 0, sizeof(struct tcphdr));
	memset(&arg, 0, sizeof(arg));

	arg.iov[0].iov_base = (unsigned char *)&rep;
	arg.iov[0].iov_len  = sizeof(rep.th);
	if (tsecr) {
		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
				   (TCPOPT_TIMESTAMP << 8) |
				   TCPOLEN_TIMESTAMP);
		rep.opt[1] = htonl(tsval);
		rep.opt[2] = htonl(tsecr);
		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
	}

	/* Swap the send and the receive. */
	rep.th.dest = th->source;
	rep.th.source = th->dest;
	rep.th.doff = arg.iov[0].iov_len / 4;
	rep.th.seq = htonl(seq);
	rep.th.ack_seq = htonl(ack);
	rep.th.ack = 1;
	rep.th.window = htons(win);

#ifdef CONFIG_TCP_MD5SIG
	if (tcp_key_is_md5(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
					  (TCPOPT_NOP << 16) |
					  (TCPOPT_MD5SIG << 8) |
					  TCPOLEN_MD5SIG);
		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
				    key->md5_key, ip_hdr(skb)->saddr,
				    ip_hdr(skb)->daddr, &rep.th);
	}
#endif
#ifdef CONFIG_TCP_AO
	if (tcp_key_is_ao(key)) {
		int offset = (tsecr) ? 3 : 0;

		rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
					  (tcp_ao_len(key->ao_key) << 16) |
					  (key->ao_key->sndid << 8) |
					  key->rcv_next);
		arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
		rep.th.doff = arg.iov[0].iov_len / 4;

		tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
				key->ao_key, key->traffic_key,
				(union tcp_ao_addr *)&ip_hdr(skb)->saddr,
				(union tcp_ao_addr *)&ip_hdr(skb)->daddr,
				&rep.th, key->sne);
	}
#endif
	arg.flags = reply_flags;
	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
				      ip_hdr(skb)->saddr, /* XXX */
				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
	if (oif)
		arg.bound_dev_if = oif;
	arg.tos = tos;
	arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
	local_bh_disable();
	local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
	ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
	sock_net_set(ctl_sk, net);
	ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
	ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
			   inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
	transmit_time = tcp_transmit_time(sk);
	ip_send_unicast_reply(ctl_sk, sk,
			      skb, &TCP_SKB_CB(skb)->header.h4.opt,
			      ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
			      &arg, arg.iov[0].iov_len,
			      transmit_time, txhash);

	sock_net_set(ctl_sk, &init_net);
	__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
	local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
	local_bh_enable();
}

static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
				enum tcp_tw_status tw_status)
{
	struct inet_timewait_sock *tw = inet_twsk(sk);
	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
	struct tcp_key key = {};
	u8 tos = tw->tw_tos;

	/* Clear only the ECN bits of TW ACKs for out-of-window data or
	 * PAWS-rejected segments, while leaving the ECN bits of other TW ACKs
	 * untouched so those ACKs are not placed in a different service queue
	 * (Classic rather than L4S).
	 */
	if (tw_status == TCP_TW_ACK_OOW)
		tos &= ~INET_ECN_MASK;

#ifdef CONFIG_TCP_AO
	struct tcp_ao_info *ao_info;

	if (static_branch_unlikely(&tcp_ao_needed.key)) {
		/* FIXME: the segment to-be-acked is not verified yet */
		ao_info = rcu_dereference(tcptw->ao_info);
		if (ao_info) {
			const struct tcp_ao_hdr *aoh;

			if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
				inet_twsk_put(tw);
				return;
			}

			if (aoh)
				key.ao_key = tcp_ao_established_key(sk, ao_info,
								    aoh->rnext_keyid, -1);
		}
	}
	if (key.ao_key) {
		struct tcp_ao_key *rnext_key;

		key.traffic_key = snd_other_key(key.ao_key);
		key.sne = READ_ONCE(ao_info->snd_sne);
		rnext_key = READ_ONCE(ao_info->rnext_key);
		key.rcv_next = rnext_key->rcvid;
		key.type = TCP_KEY_AO;
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		key.md5_key = tcp_twsk_md5_key(tcptw);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	tcp_v4_send_ack(sk, skb,
			tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
			tcp_tw_tsval(tcptw),
			READ_ONCE(tcptw->tw_ts_recent),
			tw->tw_bound_dev_if, &key,
			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
			tos,
			tw->tw_txhash);

	inet_twsk_put(tw);
}

static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req)
{
	struct tcp_key key = {};

	/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
	 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
	 */
	u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
						 tcp_sk(sk)->snd_nxt;

#ifdef CONFIG_TCP_AO
	if (static_branch_unlikely(&tcp_ao_needed.key) &&
	    tcp_rsk_used_ao(req)) {
		const union tcp_md5_addr *addr;
		const struct tcp_ao_hdr *aoh;
		int l3index;

		/* Invalid TCP option size or twice included auth */
		if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
			return;
		if (!aoh)
			return;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
					      aoh->rnext_keyid, -1);
		if (unlikely(!key.ao_key)) {
			/* Send ACK with any matching MKT for the peer */
			key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
			/* Matching key disappeared (user removed the key?)
			 * let the handshake timeout.
			 */
			if (!key.ao_key) {
				net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
						     addr,
						     ntohs(tcp_hdr(skb)->source),
						     &ip_hdr(skb)->daddr,
						     ntohs(tcp_hdr(skb)->dest));
				return;
			}
		}
		key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
		if (!key.traffic_key)
			return;

		key.type = TCP_KEY_AO;
		key.rcv_next = aoh->keyid;
		tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
#else
	if (0) {
#endif
	} else if (static_branch_tcp_md5()) {
		const union tcp_md5_addr *addr;
		int l3index;

		addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
		l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
		key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
		if (key.md5_key)
			key.type = TCP_KEY_MD5;
	}

	/* Cleaning ECN bits of TW ACKs of oow data or is paws_reject */
	tcp_v4_send_ack(sk, skb, seq,
			tcp_rsk(req)->rcv_nxt,
			tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
			tcp_rsk_tsval(tcp_rsk(req)),
			req->ts_recent,
			0, &key,
			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
			ip_hdr(skb)->tos & ~INET_ECN_MASK,
			READ_ONCE(tcp_rsk(req)->txhash));
	if (tcp_key_is_ao(&key))
		kfree(key.traffic_key);
}

/*
 * Send a SYN-ACK after having received a SYN.
 * This still operates on a request_sock only, not on a big
 * socket.
 */
static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
			      struct flowi *fl,
			      struct request_sock *req,
			      struct tcp_fastopen_cookie *foc,
			      enum tcp_synack_type synack_type,
			      struct sk_buff *syn_skb)
{
	const struct inet_request_sock *ireq = inet_rsk(req);
	struct flowi4 fl4;
	int err = -1;
	struct sk_buff *skb;
	u8 tos;

	/* First, grab a route. */
	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
		return -1;

	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);

	if (skb) {
		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);

		tos = READ_ONCE(inet_sk(sk)->tos);

		if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
			tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
			      (tos & INET_ECN_MASK);

		if (!INET_ECN_is_capable(tos) &&
		    tcp_bpf_ca_needs_ecn((struct sock *)req))
			tos |= INET_ECN_ECT_0;

		rcu_read_lock();
		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
					    ireq->ir_rmt_addr,
					    rcu_dereference(ireq->ireq_opt),
					    tos);
		rcu_read_unlock();
		err = net_xmit_eval(err);
	}

	return err;
}

/*
 * IPv4 request_sock destructor.
 */
static void tcp_v4_reqsk_destructor(struct request_sock *req)
{
	kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
}

#ifdef CONFIG_TCP_MD5SIG
/*
 * RFC2385 MD5 checksumming requires a mapping of
 * IP address->MD5 Key.
 * We need to maintain these in the sk structure.
 */

DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
EXPORT_IPV6_MOD(tcp_md5_needed);

static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
{
	if (!old)
		return true;

	/* l3index always overrides non-l3index */
	if (old->l3index && new->l3index == 0)
		return false;
	if (old->l3index == 0 && new->l3index)
		return true;

	return old->prefixlen < new->prefixlen;
}

/* Find the Key structure for an address.  */
struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
					   const union tcp_md5_addr *addr,
					   int family, bool any_l3index)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	const struct tcp_md5sig_info *md5sig;
	__be32 mask;
	struct tcp_md5sig_key *best_match = NULL;
	bool match;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;

	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
		    key->l3index != l3index)
			continue;
		if (family == AF_INET) {
			mask = inet_make_mask(key->prefixlen);
			match = (key->addr.a4.s_addr & mask) ==
				(addr->a4.s_addr & mask);
#if IS_ENABLED(CONFIG_IPV6)
		} else if (family == AF_INET6) {
			match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
						  key->prefixlen);
#endif
		} else {
			match = false;
		}

		if (match && better_md5_match(best_match, key))
			best_match = key;
	}
	return best_match;
}
EXPORT_IPV6_MOD(__tcp_md5_do_lookup);

static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
						      const union tcp_md5_addr *addr,
						      int family, u8 prefixlen,
						      int l3index, u8 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	unsigned int size = sizeof(struct in_addr);
	const struct tcp_md5sig_info *md5sig;

	/* caller either holds rcu_read_lock() or socket lock */
	md5sig = rcu_dereference_check(tp->md5sig_info,
				       lockdep_sock_is_held(sk));
	if (!md5sig)
		return NULL;
#if IS_ENABLED(CONFIG_IPV6)
	if (family == AF_INET6)
		size = sizeof(struct in6_addr);
#endif
	hlist_for_each_entry_rcu(key, &md5sig->head, node,
				 lockdep_sock_is_held(sk)) {
		if (key->family != family)
			continue;
		if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
			continue;
		if (key->l3index != l3index)
			continue;
		if (!memcmp(&key->addr, addr, size) &&
		    key->prefixlen == prefixlen)
			return key;
	}
	return NULL;
}

struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
					 const struct sock *addr_sk)
{
	const union tcp_md5_addr *addr;
	int l3index;

	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
						 addr_sk->sk_bound_dev_if);
	addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
	return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
}
EXPORT_IPV6_MOD(tcp_v4_md5_lookup);

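/* Allocate and publish the per-socket container for MD5 keys.  GSO is
 * disabled because segmentation offload cannot produce the per-segment
 * MD5 signatures.
 */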
static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	md5sig = kmalloc(sizeof(*md5sig), gfp);
	if (!md5sig)
		return -ENOMEM;

	sk_gso_disable(sk);
	INIT_HLIST_HEAD(&md5sig->head);
	rcu_assign_pointer(tp->md5sig_info, md5sig);
	return 0;
}

/* This can be called on a newly created socket, from other files */
static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
			    int family, u8 prefixlen, int l3index, u8 flags,
			    const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
	/* Add Key to the list */
	struct tcp_md5sig_key *key;
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_info *md5sig;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (key) {
		/* Pre-existing entry - just update that one.
		 * Note that the key might be used concurrently.
		 * data_race() is telling kcsan that we do not care of
		 * key mismatches, since changing MD5 key on live flows
		 * can lead to packet drops.
		 */
		data_race(memcpy(key->key, newkey, newkeylen));

		/* Pairs with READ_ONCE() in tcp_md5_hash_key().
		 * Also note that a reader could catch new key->keylen value
		 * but old key->key[], this is the reason we use __GFP_ZERO
		 * at sock_kmalloc() time below these lines.
		 */
		WRITE_ONCE(key->keylen, newkeylen);

		return 0;
	}

	md5sig = rcu_dereference_protected(tp->md5sig_info,
					   lockdep_sock_is_held(sk));

	key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
	if (!key)
		return -ENOMEM;

	memcpy(key->key, newkey, newkeylen);
	key->keylen = newkeylen;
	key->family = family;
	key->prefixlen = prefixlen;
	key->l3index = l3index;
	key->flags = flags;
	memcpy(&key->addr, addr,
	       (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
								 sizeof(struct in_addr));
	hlist_add_head_rcu(&key->node, &md5sig->head);
	return 0;
}

int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
		   int family, u8 prefixlen, int l3index, u8 flags,
		   const u8 *newkey, u8 newkeylen)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		if (tcp_md5_alloc_sigpool())
			return -ENOMEM;

		if (tcp_md5sig_info_add(sk, GFP_KERNEL)) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_branch_inc(&tcp_md5_needed.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
				newkey, newkeylen, GFP_KERNEL);
}
EXPORT_IPV6_MOD(tcp_md5_do_add);

int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
		     int family, u8 prefixlen, int l3index,
		     struct tcp_md5sig_key *key)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
		tcp_md5_add_sigpool();

		if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) {
			tcp_md5_release_sigpool();
			return -ENOMEM;
		}

		if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
			struct tcp_md5sig_info *md5sig;

			md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
			net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
			rcu_assign_pointer(tp->md5sig_info, NULL);
			kfree_rcu(md5sig, rcu);
			tcp_md5_release_sigpool();
			return -EUSERS;
		}
	}

	return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
				key->flags, key->key, key->keylen,
				sk_gfp_mask(sk, GFP_ATOMIC));
}
EXPORT_IPV6_MOD(tcp_md5_key_copy);

int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
		   u8 prefixlen, int l3index, u8 flags)
{
	struct tcp_md5sig_key *key;

	key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
	if (!key)
		return -ENOENT;
	hlist_del_rcu(&key->node);
	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
	kfree_rcu(key, rcu);
	return 0;
}
EXPORT_IPV6_MOD(tcp_md5_do_del);

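/* Release every configured MD5 key when the socket is torn down. */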
void tcp_clear_md5_list(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *key;
	struct hlist_node *n;
	struct tcp_md5sig_info *md5sig;

	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);

	hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
		hlist_del_rcu(&key->node);
		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
		kfree_rcu(key, rcu);
	}
}

static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
				 sockptr_t optval, int optlen)
{
	struct tcp_md5sig cmd;
	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
	const union tcp_md5_addr *addr;
	u8 prefixlen = 32;
	int l3index = 0;
	bool l3flag;
	u8 flags;

	if (optlen < sizeof(cmd))
		return -EINVAL;

	if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
		return -EFAULT;

	if (sin->sin_family != AF_INET)
		return -EINVAL;

	flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
	l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;

	if (optname == TCP_MD5SIG_EXT &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
		prefixlen = cmd.tcpm_prefixlen;
		if (prefixlen > 32)
			return -EINVAL;
	}

	if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
	    cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
		if (dev && netif_is_l3_master(dev))
			l3index = dev->ifindex;

		rcu_read_unlock();

		/* ok to reference set/not set outside of rcu;
		 * right now device MUST be an L3 master
		 */
		if (!dev || !l3index)
			return -EINVAL;
	}

	addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;

	if (!cmd.tcpm_keylen)
		return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);

	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
		return -EINVAL;

	/* Don't allow keys for peers that have a matching TCP-AO key.
	 * See the comment in tcp_ao_add_cmd()
	 */
	if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
		return -EKEYREJECTED;

	return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
			      cmd.tcpm_key, cmd.tcpm_keylen);
}

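/* Hash the MD5 pseudo-header (addresses, protocol, TCP length) followed by
 * the TCP header itself with its checksum field zeroed.
 */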
static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp,
				   __be32 daddr, __be32 saddr,
				   const struct tcphdr *th, int nbytes)
{
	struct tcp4_pseudohdr *bp;
	struct scatterlist sg;
	struct tcphdr *_th;

	bp = hp->scratch;
	bp->saddr = saddr;
	bp->daddr = daddr;
	bp->pad = 0;
	bp->protocol = IPPROTO_TCP;
	bp->len = cpu_to_be16(nbytes);

	_th = (struct tcphdr *)(bp + 1);
	memcpy(_th, th, sizeof(*th));
	_th->check = 0;

	sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
	ahash_request_set_crypt(hp->req, &sg, NULL,
				sizeof(*bp) + sizeof(*th));
	return crypto_ahash_update(hp->req);
}

static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
	struct tcp_sigpool hp;

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;
	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}

int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
			const struct sock *sk,
			const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	struct tcp_sigpool hp;
	__be32 saddr, daddr;

	if (sk) { /* valid for establish/request sockets */
		saddr = sk->sk_rcv_saddr;
		daddr = sk->sk_daddr;
	} else {
		const struct iphdr *iph = ip_hdr(skb);

		saddr = iph->saddr;
		daddr = iph->daddr;
	}

	if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp))
		goto clear_hash_nostart;

	if (crypto_ahash_init(hp.req))
		goto clear_hash;

	if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len))
		goto clear_hash;
	if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
		goto clear_hash;
	if (tcp_md5_hash_key(&hp, key))
		goto clear_hash;
	ahash_request_set_crypt(hp.req, NULL, md5_hash, 0);
	if (crypto_ahash_final(hp.req))
		goto clear_hash;

	tcp_sigpool_end(&hp);
	return 0;

clear_hash:
	tcp_sigpool_end(&hp);
clear_hash_nostart:
	memset(md5_hash, 0, 16);
	return 1;
}
EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);

#endif

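/* Fill in the IPv4 addresses and IP options of a freshly minted request sock
 * from the incoming SYN.
 */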
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	struct net *net = sock_net(sk_listener);

	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
	RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
}

static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
					  struct sk_buff *skb,
					  struct flowi *fl,
					  struct request_sock *req,
					  u32 tw_isn)
{
	tcp_v4_init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		return NULL;

	return inet_csk_route_req(sk, &fl->u.ip4, req);
}

struct request_sock_ops tcp_request_sock_ops __read_mostly = {
	.family		=	PF_INET,
	.obj_size	=	sizeof(struct tcp_request_sock),
	.rtx_syn_ack	=	tcp_rtx_synack,
	.send_ack	=	tcp_v4_reqsk_send_ack,
	.destructor	=	tcp_v4_reqsk_destructor,
	.send_reset	=	tcp_v4_send_reset,
	.syn_ack_timeout =	tcp_syn_ack_timeout,
};

const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.mss_clamp	=	TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
	.req_md5_lookup	=	tcp_v4_md5_lookup,
	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
#endif
#ifdef CONFIG_TCP_AO
	.ao_lookup	=	tcp_v4_ao_lookup_rsk,
	.ao_calc_key	=	tcp_v4_ao_calc_key_rsk,
	.ao_synack_hash	=	tcp_v4_ao_synack_hash,
#endif
#ifdef CONFIG_SYN_COOKIES
	.cookie_init_seq =	cookie_v4_init_sequence,
#endif
	.route_req	=	tcp_v4_route_req,
	.init_seq	=	tcp_v4_init_seq,
	.init_ts_off	=	tcp_v4_init_ts_off,
	.send_synack	=	tcp_v4_send_synack,
};

int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&tcp_request_sock_ops,
				&tcp_request_sock_ipv4_ops, sk, skb);

drop:
	tcp_listendrop(sk);
	return 0;
}
EXPORT_IPV6_MOD(tcp_v4_conn_request);
1747
1748
1749/*
1750 * The three way handshake has completed - we got a valid synack -
1751 * now create the new socket.
1752 */
1753struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1754 struct request_sock *req,
1755 struct dst_entry *dst,
1756 struct request_sock *req_unhash,
1757 bool *own_req)
1758{
1759 struct inet_request_sock *ireq;
1760 bool found_dup_sk = false;
1761 struct inet_sock *newinet;
1762 struct tcp_sock *newtp;
1763 struct sock *newsk;
1764#ifdef CONFIG_TCP_MD5SIG
1765 const union tcp_md5_addr *addr;
1766 struct tcp_md5sig_key *key;
1767 int l3index;
1768#endif
1769 struct ip_options_rcu *inet_opt;
1770
1771 if (sk_acceptq_is_full(sk))
1772 goto exit_overflow;
1773
1774 newsk = tcp_create_openreq_child(sk, req, skb);
1775 if (!newsk)
1776 goto exit_nonewsk;
1777
1778 newsk->sk_gso_type = SKB_GSO_TCPV4;
1779 inet_sk_rx_dst_set(sk: newsk, skb);
1780
1781 newtp = tcp_sk(newsk);
1782 newinet = inet_sk(newsk);
1783 ireq = inet_rsk(sk: req);
1784 inet_opt = rcu_dereference(ireq->ireq_opt);
1785 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1786 newinet->mc_index = inet_iif(skb);
1787 newinet->mc_ttl = ip_hdr(skb)->ttl;
1788 newinet->rcv_tos = ip_hdr(skb)->tos;
1789 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1790 if (inet_opt)
1791 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1792 atomic_set(v: &newinet->inet_id, i: get_random_u16());
1793
1794 /* Set ToS of the new socket based upon the value of incoming SYN.
1795 * ECT bits are set later in tcp_init_transfer().
1796 */
1797 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1798 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1799
1800 if (!dst) {
1801 dst = inet_csk_route_child_sock(sk, newsk, req);
1802 if (!dst)
1803 goto put_and_exit;
1804 } else {
1805 /* syncookie case : see end of cookie_v4_check() */
1806 }
1807 sk_setup_caps(sk: newsk, dst);
1808
1809 tcp_ca_openreq_child(sk: newsk, dst);
1810
1811 tcp_sync_mss(sk: newsk, pmtu: dst_mtu(dst));
1812 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), mss: dst_metric_advmss(dst));
1813
1814 tcp_initialize_rcv_mss(sk: newsk);
1815
1816#ifdef CONFIG_TCP_MD5SIG
1817 l3index = l3mdev_master_ifindex_by_index(net: sock_net(sk), ifindex: ireq->ir_iif);
1818 /* Copy over the MD5 key from the original socket */
1819 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1820 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1821 if (key && !tcp_rsk_used_ao(req)) {
1822 if (tcp_md5_key_copy(sk: newsk, addr, AF_INET, prefixlen: 32, l3index, key))
1823 goto put_and_exit;
1824 sk_gso_disable(sk: newsk);
1825 }
1826#endif
1827#ifdef CONFIG_TCP_AO
1828 if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
1829 goto put_and_exit; /* OOM, release back memory */
1830#endif
1831
1832 if (__inet_inherit_port(sk, child: newsk) < 0)
1833 goto put_and_exit;
1834 *own_req = inet_ehash_nolisten(sk: newsk, osk: req_to_sk(req: req_unhash),
1835 found_dup_sk: &found_dup_sk);
1836 if (likely(*own_req)) {
1837 tcp_move_syn(tp: newtp, req);
1838 ireq->ireq_opt = NULL;
1839 } else {
1840 newinet->inet_opt = NULL;
1841
1842 if (!req_unhash && found_dup_sk) {
1843 /* This code path should only be executed in the
1844 * syncookie case
1845 */
1846 bh_unlock_sock(newsk);
1847 sock_put(sk: newsk);
1848 newsk = NULL;
1849 }
1850 }
1851 return newsk;
1852
1853exit_overflow:
1854 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1855exit_nonewsk:
1856 dst_release(dst);
1857exit:
1858 tcp_listendrop(sk);
1859 return NULL;
1860put_and_exit:
1861 newinet->inet_opt = NULL;
1862 inet_csk_prepare_forced_close(sk: newsk);
1863 tcp_done(sk: newsk);
1864 goto exit;
1865}
1866EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
1867
1868static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1869{
1870#ifdef CONFIG_SYN_COOKIES
1871 const struct tcphdr *th = tcp_hdr(skb);
1872
1873 if (!th->syn)
1874 sk = cookie_v4_check(sk, skb);
1875#endif
1876 return sk;
1877}
1878
1879u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1880 struct tcphdr *th, u32 *cookie)
1881{
1882 u16 mss = 0;
1883#ifdef CONFIG_SYN_COOKIES
1884 mss = tcp_get_syncookie_mss(rsk_ops: &tcp_request_sock_ops,
1885 af_ops: &tcp_request_sock_ipv4_ops, sk, th);
1886 if (mss) {
1887 *cookie = __cookie_v4_init_sequence(iph, th, mssp: &mss);
1888 tcp_synq_overflow(sk);
1889 }
1890#endif
1891 return mss;
1892}
1893
1894INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1895 u32));
1896/* The socket must have its spinlock held when we get
1897 * here, unless it is a TCP_LISTEN socket.
1898 *
1899 * We have a potential double-lock case here, so even when
1900 * doing backlog processing we use the BH locking scheme.
1901 * This is because we cannot sleep with the original spinlock
1902 * held.
1903 */
1904int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1905{
1906 enum skb_drop_reason reason;
1907 struct sock *rsk;
1908
1909 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1910 struct dst_entry *dst;
1911
1912 dst = rcu_dereference_protected(sk->sk_rx_dst,
1913 lockdep_sock_is_held(sk));
1914
1915 sock_rps_save_rxhash(sk, skb);
1916 sk_mark_napi_id(sk, skb);
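	/* If the cached rx route was learned on a different interface than
	 * the one this packet arrived on, or ipv4_dst_check() no longer
	 * validates it, drop the cached dst; it can be repopulated later
	 * via the ->sk_rx_dst_set() hook (inet_sk_rx_dst_set() below).
	 */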
1917 if (dst) {
1918 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1919 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1920 dst, 0)) {
1921 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1922 dst_release(dst);
1923 }
1924 }
1925 tcp_rcv_established(sk, skb);
1926 return 0;
1927 }
1928
1929 if (tcp_checksum_complete(skb))
1930 goto csum_err;
1931
1932 if (sk->sk_state == TCP_LISTEN) {
1933 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1934
1935 if (!nsk)
1936 return 0;
1937 if (nsk != sk) {
1938 reason = tcp_child_process(parent: sk, child: nsk, skb);
1939 if (reason) {
1940 rsk = nsk;
1941 goto reset;
1942 }
1943 return 0;
1944 }
1945 } else
1946 sock_rps_save_rxhash(sk, skb);
1947
1948 reason = tcp_rcv_state_process(sk, skb);
1949 if (reason) {
1950 rsk = sk;
1951 goto reset;
1952 }
1953 return 0;
1954
1955reset:
1956 tcp_v4_send_reset(sk: rsk, skb, reason: sk_rst_convert_drop_reason(reason));
1957discard:
1958 sk_skb_reason_drop(sk, skb, reason);
1959 /* Be careful here. If this function gets more complicated and
1960 * gcc suffers from register pressure on the x86, sk (in %ebx)
1961 * might be destroyed here. This current version compiles correctly,
1962 * but you have been warned.
1963 */
1964 return 0;
1965
1966csum_err:
1967 reason = SKB_DROP_REASON_TCP_CSUM;
1968 trace_tcp_bad_csum(skb);
1969 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1970 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1971 goto discard;
1972}
1973EXPORT_SYMBOL(tcp_v4_do_rcv);
1974
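/* Early demux: at IP receive time, look up an established socket for this
 * segment and attach it (and, when still valid for the incoming interface,
 * its cached rx dst) to the skb, so that later receive processing can reuse
 * them instead of repeating the lookups.
 */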
1975int tcp_v4_early_demux(struct sk_buff *skb)
1976{
1977 struct net *net = dev_net_rcu(dev: skb->dev);
1978 const struct iphdr *iph;
1979 const struct tcphdr *th;
1980 struct sock *sk;
1981
1982 if (skb->pkt_type != PACKET_HOST)
1983 return 0;
1984
1985 if (!pskb_may_pull(skb, len: skb_transport_offset(skb) + sizeof(struct tcphdr)))
1986 return 0;
1987
1988 iph = ip_hdr(skb);
1989 th = tcp_hdr(skb);
1990
1991 if (th->doff < sizeof(struct tcphdr) / 4)
1992 return 0;
1993
1994 sk = __inet_lookup_established(net, hashinfo: net->ipv4.tcp_death_row.hashinfo,
1995 saddr: iph->saddr, sport: th->source,
1996 daddr: iph->daddr, ntohs(th->dest),
1997 dif: skb->skb_iif, sdif: inet_sdif(skb));
1998 if (sk) {
1999 skb->sk = sk;
2000 skb->destructor = sock_edemux;
2001 if (sk_fullsock(sk)) {
2002 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
2003
2004 if (dst)
2005 dst = dst_check(dst, cookie: 0);
2006 if (dst &&
2007 sk->sk_rx_dst_ifindex == skb->skb_iif)
2008 skb_dst_set_noref(skb, dst);
2009 }
2010 }
2011 return 0;
2012}
2013
2014bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
2015 enum skb_drop_reason *reason)
2016{
2017 u32 tail_gso_size, tail_gso_segs;
2018 struct skb_shared_info *shinfo;
2019 const struct tcphdr *th;
2020 struct tcphdr *thtail;
2021 struct sk_buff *tail;
2022 unsigned int hdrlen;
2023 bool fragstolen;
2024 u32 gso_segs;
2025 u32 gso_size;
2026 u64 limit;
2027 int delta;
2028
2029 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
2030 * we can fix skb->truesize to its real value to avoid future drops.
2031 * This is valid because skb is not yet charged to the socket.
2032 * It has been noticed that pure SACK packets were sometimes dropped
2033 * (if cooked by drivers without the copybreak feature).
2034 */
2035 skb_condense(skb);
2036
2037 tcp_cleanup_skb(skb);
2038
2039 if (unlikely(tcp_checksum_complete(skb))) {
2040 bh_unlock_sock(sk);
2041 trace_tcp_bad_csum(skb);
2042 *reason = SKB_DROP_REASON_TCP_CSUM;
2043 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
2044 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
2045 return true;
2046 }
2047
2048 /* Attempt coalescing to last skb in backlog, even if we are
2049 * above the limits.
2050 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
2051 */
2052 th = (const struct tcphdr *)skb->data;
2053 hdrlen = th->doff * 4;
2054
2055 tail = sk->sk_backlog.tail;
2056 if (!tail)
2057 goto no_coalesce;
2058 thtail = (struct tcphdr *)tail->data;
2059
2060 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
2061 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
2062 ((TCP_SKB_CB(tail)->tcp_flags |
2063 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
2064 !((TCP_SKB_CB(tail)->tcp_flags &
2065 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
2066 ((TCP_SKB_CB(tail)->tcp_flags ^
2067 TCP_SKB_CB(skb)->tcp_flags) &
2068 (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
2069 !tcp_skb_can_collapse_rx(to: tail, from: skb) ||
2070 thtail->doff != th->doff ||
2071 memcmp(p: thtail + 1, q: th + 1, size: hdrlen - sizeof(*th)))
2072 goto no_coalesce;
2073
2074 __skb_pull(skb, len: hdrlen);
2075
2076 shinfo = skb_shinfo(skb);
2077 gso_size = shinfo->gso_size ?: skb->len;
2078 gso_segs = shinfo->gso_segs ?: 1;
2079
2080 shinfo = skb_shinfo(tail);
2081 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
2082 tail_gso_segs = shinfo->gso_segs ?: 1;
2083
2084 if (skb_try_coalesce(to: tail, from: skb, fragstolen: &fragstolen, delta_truesize: &delta)) {
2085 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
2086
2087 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
2088 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
2089 thtail->window = th->window;
2090 }
2091
2092 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
2093 * thtail->fin, so that the fast path in tcp_rcv_established()
2094 * is not entered if we append a packet with a FIN.
2095 * SYN, RST, URG are not present.
2096 * ACK is set on both packets.
2097 * PSH : we do not really care in TCP stack,
2098 * at least for 'GRO' packets.
2099 */
2100 thtail->fin |= th->fin;
2101 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
2102
2103 if (TCP_SKB_CB(skb)->has_rxtstamp) {
2104 TCP_SKB_CB(tail)->has_rxtstamp = true;
2105 tail->tstamp = skb->tstamp;
2106 skb_hwtstamps(skb: tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
2107 }
2108
2109 /* Not as strict as GRO. We only need to carry mss max value */
2110 shinfo->gso_size = max(gso_size, tail_gso_size);
2111 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
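		/* Note: the clamp to 0xFFFF above keeps the accumulated
		 * segment count within the 16-bit gso_segs field.
		 */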
2112
2113 sk->sk_backlog.len += delta;
2114 __NET_INC_STATS(sock_net(sk),
2115 LINUX_MIB_TCPBACKLOGCOALESCE);
2116 kfree_skb_partial(skb, head_stolen: fragstolen);
2117 return false;
2118 }
2119 __skb_push(skb, len: hdrlen);
2120
2121no_coalesce:
2122 /* sk->sk_backlog.len is reset only at the end of __release_sock().
2123 * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
2124 * sk_rcvbuf in normal conditions.
2125 */
2126 limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
2127
2128 limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
2129
2130 /* Only the socket owner can try to collapse/prune rx queues
2131 * to reduce memory overhead, so add a little headroom here.
2132 * Only a few socket backlogs are likely to be non-empty at any time.
2133 */
2134 limit += 64 * 1024;
2135
2136 limit = min_t(u64, limit, UINT_MAX);
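	/* Rough example, assuming the typical initial buffer sizes
	 * (sk_rcvbuf = 131072 from tcp_rmem[1], sk_sndbuf = 16384 from
	 * tcp_wmem[1]): limit = 2 * 131072 + 16384 / 2 + 64 * 1024 = 335872
	 * bytes, well below the UINT_MAX cap.
	 */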
2137
2138 if (unlikely(sk_add_backlog(sk, skb, limit))) {
2139 bh_unlock_sock(sk);
2140 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
2141 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
2142 return true;
2143 }
2144 return false;
2145}
2146EXPORT_IPV6_MOD(tcp_add_backlog);
2147
2148int tcp_filter(struct sock *sk, struct sk_buff *skb)
2149{
2150 struct tcphdr *th = (struct tcphdr *)skb->data;
2151
2152 return sk_filter_trim_cap(sk, skb, cap: th->doff * 4);
2153}
2154EXPORT_IPV6_MOD(tcp_filter);
2155
2156static void tcp_v4_restore_cb(struct sk_buff *skb)
2157{
2158 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
2159 sizeof(struct inet_skb_parm));
2160}
2161
2162static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
2163 const struct tcphdr *th)
2164{
2165 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
2166 * barrier() makes sure the compiler won't play fool^Waliasing games.
2167 */
2168 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
2169 sizeof(struct inet_skb_parm));
2170 barrier();
2171
2172 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
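	/* end_seq accounts for the SYN and FIN flags, which each consume
	 * one sequence number, in addition to the payload length.
	 */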
2173 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
2174 skb->len - th->doff * 4);
2175 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
2176 TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
2177 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
2178 TCP_SKB_CB(skb)->sacked = 0;
2179 TCP_SKB_CB(skb)->has_rxtstamp =
2180 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
2181}
2182
2183/*
2184 * From tcp_input.c
2185 */
2186
2187int tcp_v4_rcv(struct sk_buff *skb)
2188{
2189 struct net *net = dev_net_rcu(dev: skb->dev);
2190 enum skb_drop_reason drop_reason;
2191 enum tcp_tw_status tw_status;
2192 int sdif = inet_sdif(skb);
2193 int dif = inet_iif(skb);
2194 const struct iphdr *iph;
2195 const struct tcphdr *th;
2196 struct sock *sk = NULL;
2197 bool refcounted;
2198 int ret;
2199 u32 isn;
2200
2201 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
2202 if (skb->pkt_type != PACKET_HOST)
2203 goto discard_it;
2204
2205 /* Count it even if it's bad */
2206 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2207
2208 if (!pskb_may_pull(skb, len: sizeof(struct tcphdr)))
2209 goto discard_it;
2210
2211 th = (const struct tcphdr *)skb->data;
2212
2213 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2214 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2215 goto bad_packet;
2216 }
2217 if (!pskb_may_pull(skb, len: th->doff * 4))
2218 goto discard_it;
2219
2220 /* An explanation is required here, I think.
2221 * Packet length and doff are validated by header prediction,
2222 * provided the case of th->doff==0 is eliminated.
2223 * So, we defer the checks. */
2224
2225 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2226 goto csum_error;
2227
2228 th = (const struct tcphdr *)skb->data;
2229 iph = ip_hdr(skb);
2230lookup:
2231 sk = __inet_lookup_skb(hashinfo: net->ipv4.tcp_death_row.hashinfo,
2232 skb, doff: __tcp_hdrlen(th), sport: th->source,
2233 dport: th->dest, sdif, refcounted: &refcounted);
2234 if (!sk)
2235 goto no_tcp_socket;
2236
2237 if (sk->sk_state == TCP_TIME_WAIT)
2238 goto do_time_wait;
2239
2240 if (sk->sk_state == TCP_NEW_SYN_RECV) {
2241 struct request_sock *req = inet_reqsk(sk);
2242 bool req_stolen = false;
2243 struct sock *nsk;
2244
2245 sk = req->rsk_listener;
2246 if (!xfrm4_policy_check(sk, dir: XFRM_POLICY_IN, skb))
2247 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2248 else
2249 drop_reason = tcp_inbound_hash(sk, req, skb,
2250 saddr: &iph->saddr, daddr: &iph->daddr,
2251 AF_INET, dif, sdif);
2252 if (unlikely(drop_reason)) {
2253 sk_drops_add(sk, skb);
2254 reqsk_put(req);
2255 goto discard_it;
2256 }
2257 if (tcp_checksum_complete(skb)) {
2258 reqsk_put(req);
2259 goto csum_error;
2260 }
2261 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2262 nsk = reuseport_migrate_sock(sk, migrating_sk: req_to_sk(req), skb);
2263 if (!nsk) {
2264 inet_csk_reqsk_queue_drop_and_put(sk, req);
2265 goto lookup;
2266 }
2267 sk = nsk;
2268 /* reuseport_migrate_sock() has already taken one sk_refcnt
2269 * reference before returning.
2270 */
2271 } else {
2272 /* We own a reference on the listener, increase it again
2273 * as we might lose it too soon.
2274 */
2275 sock_hold(sk);
2276 }
2277 refcounted = true;
2278 nsk = NULL;
2279 if (!tcp_filter(sk, skb)) {
2280 th = (const struct tcphdr *)skb->data;
2281 iph = ip_hdr(skb);
2282 tcp_v4_fill_cb(skb, iph, th);
2283 nsk = tcp_check_req(sk, skb, req, fastopen: false, lost_race: &req_stolen,
2284 drop_reason: &drop_reason);
2285 } else {
2286 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2287 }
2288 if (!nsk) {
2289 reqsk_put(req);
2290 if (req_stolen) {
2291 /* Another cpu got exclusive access to req
2292 * and created a full-blown socket.
2293 * Try to feed this packet to that socket
2294 * instead of discarding it.
2295 */
2296 tcp_v4_restore_cb(skb);
2297 sock_put(sk);
2298 goto lookup;
2299 }
2300 goto discard_and_relse;
2301 }
2302 nf_reset_ct(skb);
2303 if (nsk == sk) {
2304 reqsk_put(req);
2305 tcp_v4_restore_cb(skb);
2306 } else {
2307 drop_reason = tcp_child_process(parent: sk, child: nsk, skb);
2308 if (drop_reason) {
2309 enum sk_rst_reason rst_reason;
2310
2311 rst_reason = sk_rst_convert_drop_reason(reason: drop_reason);
2312 tcp_v4_send_reset(sk: nsk, skb, reason: rst_reason);
2313 goto discard_and_relse;
2314 }
2315 sock_put(sk);
2316 return 0;
2317 }
2318 }
2319
2320process:
2321 if (static_branch_unlikely(&ip4_min_ttl)) {
2322 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2323 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2324 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2325 drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2326 goto discard_and_relse;
2327 }
2328 }
2329
2330 if (!xfrm4_policy_check(sk, dir: XFRM_POLICY_IN, skb)) {
2331 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2332 goto discard_and_relse;
2333 }
2334
2335 drop_reason = tcp_inbound_hash(sk, NULL, skb, saddr: &iph->saddr, daddr: &iph->daddr,
2336 AF_INET, dif, sdif);
2337 if (drop_reason)
2338 goto discard_and_relse;
2339
2340 nf_reset_ct(skb);
2341
2342 if (tcp_filter(sk, skb)) {
2343 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2344 goto discard_and_relse;
2345 }
2346 th = (const struct tcphdr *)skb->data;
2347 iph = ip_hdr(skb);
2348 tcp_v4_fill_cb(skb, iph, th);
2349
2350 skb->dev = NULL;
2351
2352 if (sk->sk_state == TCP_LISTEN) {
2353 ret = tcp_v4_do_rcv(sk, skb);
2354 goto put_and_return;
2355 }
2356
2357 sk_incoming_cpu_update(sk);
2358
2359 bh_lock_sock_nested(sk);
2360 tcp_segs_in(tcp_sk(sk), skb);
2361 ret = 0;
2362 if (!sock_owned_by_user(sk)) {
2363 ret = tcp_v4_do_rcv(sk, skb);
2364 } else {
2365 if (tcp_add_backlog(sk, skb, reason: &drop_reason))
2366 goto discard_and_relse;
2367 }
2368 bh_unlock_sock(sk);
2369
2370put_and_return:
2371 if (refcounted)
2372 sock_put(sk);
2373
2374 return ret;
2375
2376no_tcp_socket:
2377 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2378 if (!xfrm4_policy_check(NULL, dir: XFRM_POLICY_IN, skb))
2379 goto discard_it;
2380
2381 tcp_v4_fill_cb(skb, iph, th);
2382
2383 if (tcp_checksum_complete(skb)) {
2384csum_error:
2385 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2386 trace_tcp_bad_csum(skb);
2387 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2388bad_packet:
2389 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2390 } else {
2391 tcp_v4_send_reset(NULL, skb, reason: sk_rst_convert_drop_reason(reason: drop_reason));
2392 }
2393
2394discard_it:
2395 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2396 /* Discard frame. */
2397 sk_skb_reason_drop(sk, skb, reason: drop_reason);
2398 return 0;
2399
2400discard_and_relse:
2401 sk_drops_add(sk, skb);
2402 if (refcounted)
2403 sock_put(sk);
2404 goto discard_it;
2405
2406do_time_wait:
2407 if (!xfrm4_policy_check(NULL, dir: XFRM_POLICY_IN, skb)) {
2408 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2409 inet_twsk_put(tw: inet_twsk(sk));
2410 goto discard_it;
2411 }
2412
2413 tcp_v4_fill_cb(skb, iph, th);
2414
2415 if (tcp_checksum_complete(skb)) {
2416 inet_twsk_put(tw: inet_twsk(sk));
2417 goto csum_error;
2418 }
2419
2420 tw_status = tcp_timewait_state_process(tw: inet_twsk(sk), skb, th, tw_isn: &isn,
2421 drop_reason: &drop_reason);
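	/* tcp_timewait_state_process() decides what to do with the segment:
	 * TCP_TW_SYN may let a new connection reuse this tuple if a matching
	 * listener exists (the timewait socket is then removed and the SYN
	 * re-processed), TCP_TW_ACK/TCP_TW_ACK_OOW answers with an ACK,
	 * TCP_TW_RST answers with a reset, and TCP_TW_SUCCESS just drops
	 * the segment.
	 */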
2422 switch (tw_status) {
2423 case TCP_TW_SYN: {
2424 struct sock *sk2 = inet_lookup_listener(net,
2425 hashinfo: net->ipv4.tcp_death_row.hashinfo,
2426 skb, doff: __tcp_hdrlen(th),
2427 saddr: iph->saddr, sport: th->source,
2428 daddr: iph->daddr, dport: th->dest,
2429 dif: inet_iif(skb),
2430 sdif);
2431 if (sk2) {
2432 inet_twsk_deschedule_put(tw: inet_twsk(sk));
2433 sk = sk2;
2434 tcp_v4_restore_cb(skb);
2435 refcounted = false;
2436 __this_cpu_write(tcp_tw_isn, isn);
2437 goto process;
2438 }
2439 }
2440 /* to ACK */
2441 fallthrough;
2442 case TCP_TW_ACK:
2443 case TCP_TW_ACK_OOW:
2444 tcp_v4_timewait_ack(sk, skb, tw_status);
2445 break;
2446 case TCP_TW_RST:
2447 tcp_v4_send_reset(sk, skb, reason: SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
2448 inet_twsk_deschedule_put(tw: inet_twsk(sk));
2449 goto discard_it;
2450 case TCP_TW_SUCCESS:;
2451 }
2452 goto discard_it;
2453}
2454
2455static struct timewait_sock_ops tcp_timewait_sock_ops = {
2456 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2457 .twsk_destructor= tcp_twsk_destructor,
2458};
2459
2460void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2461{
2462 struct dst_entry *dst = skb_dst(skb);
2463
2464 if (dst && dst_hold_safe(dst)) {
2465 rcu_assign_pointer(sk->sk_rx_dst, dst);
2466 sk->sk_rx_dst_ifindex = skb->skb_iif;
2467 }
2468}
2469EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
2470
2471const struct inet_connection_sock_af_ops ipv4_specific = {
2472 .queue_xmit = ip_queue_xmit,
2473 .send_check = tcp_v4_send_check,
2474 .rebuild_header = inet_sk_rebuild_header,
2475 .sk_rx_dst_set = inet_sk_rx_dst_set,
2476 .conn_request = tcp_v4_conn_request,
2477 .syn_recv_sock = tcp_v4_syn_recv_sock,
2478 .net_header_len = sizeof(struct iphdr),
2479 .setsockopt = ip_setsockopt,
2480 .getsockopt = ip_getsockopt,
2481 .mtu_reduced = tcp_v4_mtu_reduced,
2482};
2483EXPORT_IPV6_MOD(ipv4_specific);
2484
2485#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2486static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2487#ifdef CONFIG_TCP_MD5SIG
2488 .md5_lookup = tcp_v4_md5_lookup,
2489 .calc_md5_hash = tcp_v4_md5_hash_skb,
2490 .md5_parse = tcp_v4_parse_md5_keys,
2491#endif
2492#ifdef CONFIG_TCP_AO
2493 .ao_lookup = tcp_v4_ao_lookup,
2494 .calc_ao_hash = tcp_v4_ao_hash_skb,
2495 .ao_parse = tcp_v4_parse_ao,
2496 .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
2497#endif
2498};
2499#endif
2500
2501/* NOTE: A lot of things are set to zero explicitly by the call to
2502 * sk_alloc(), so they need not be done here.
2503 */
2504static int tcp_v4_init_sock(struct sock *sk)
2505{
2506 struct inet_connection_sock *icsk = inet_csk(sk);
2507
2508 tcp_init_sock(sk);
2509
2510 icsk->icsk_af_ops = &ipv4_specific;
2511
2512#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
2513 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2514#endif
2515
2516 return 0;
2517}
2518
2519#ifdef CONFIG_TCP_MD5SIG
2520static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
2521{
2522 struct tcp_md5sig_info *md5sig;
2523
2524 md5sig = container_of(head, struct tcp_md5sig_info, rcu);
2525 kfree(objp: md5sig);
2526 static_branch_slow_dec_deferred(&tcp_md5_needed);
2527 tcp_md5_release_sigpool();
2528}
2529#endif
2530
2531static void tcp_release_user_frags(struct sock *sk)
2532{
2533#ifdef CONFIG_PAGE_POOL
2534 unsigned long index;
2535 void *netmem;
2536
2537 xa_for_each(&sk->sk_user_frags, index, netmem)
2538 WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
2539#endif
2540}
2541
2542void tcp_v4_destroy_sock(struct sock *sk)
2543{
2544 struct tcp_sock *tp = tcp_sk(sk);
2545
2546 tcp_release_user_frags(sk);
2547
2548 xa_destroy(&sk->sk_user_frags);
2549
2550 trace_tcp_destroy_sock(sk);
2551
2552 tcp_clear_xmit_timers(sk);
2553
2554 tcp_cleanup_congestion_control(sk);
2555
2556 tcp_cleanup_ulp(sk);
2557
2558 /* Clean up the write buffer. */
2559 tcp_write_queue_purge(sk);
2560
2561 /* Check if we want to disable active TFO */
2562 tcp_fastopen_active_disable_ofo_check(sk);
2563
2564 /* Cleans up our, hopefully empty, out_of_order_queue. */
2565 skb_rbtree_purge(root: &tp->out_of_order_queue);
2566
2567#ifdef CONFIG_TCP_MD5SIG
2568 /* Clean up the MD5 key list, if any */
2569 if (tp->md5sig_info) {
2570 struct tcp_md5sig_info *md5sig;
2571
2572 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
2573 tcp_clear_md5_list(sk);
2574 call_rcu(head: &md5sig->rcu, func: tcp_md5sig_info_free_rcu);
2575 rcu_assign_pointer(tp->md5sig_info, NULL);
2576 }
2577#endif
2578 tcp_ao_destroy_sock(sk, twsk: false);
2579
2580 /* Clean up a referenced TCP bind bucket. */
2581 if (inet_csk(sk)->icsk_bind_hash)
2582 inet_put_port(sk);
2583
2584 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2585
2586 /* If socket is aborted during connect operation */
2587 tcp_free_fastopen_req(tp);
2588 tcp_fastopen_destroy_cipher(sk);
2589 tcp_saved_syn_free(tp);
2590
2591 sk_sockets_allocated_dec(sk);
2592}
2593EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
2594
2595#ifdef CONFIG_PROC_FS
2596/* Proc filesystem TCP sock list dumping. */
2597
2598static unsigned short seq_file_family(const struct seq_file *seq);
2599
2600static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2601{
2602 unsigned short family = seq_file_family(seq);
2603
2604 /* AF_UNSPEC is used as a match all */
2605 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2606 net_eq(net1: sock_net(sk), net2: seq_file_net(seq)));
2607}
2608
2609/* Find a non-empty bucket (starting from st->bucket)
2610 * and return the first sk from it.
2611 */
2612static void *listening_get_first(struct seq_file *seq)
2613{
2614 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2615 struct tcp_iter_state *st = seq->private;
2616
2617 st->offset = 0;
2618 for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2619 struct inet_listen_hashbucket *ilb2;
2620 struct hlist_nulls_node *node;
2621 struct sock *sk;
2622
2623 ilb2 = &hinfo->lhash2[st->bucket];
2624 if (hlist_nulls_empty(h: &ilb2->nulls_head))
2625 continue;
2626
2627 spin_lock(lock: &ilb2->lock);
2628 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2629 if (seq_sk_match(seq, sk))
2630 return sk;
2631 }
2632 spin_unlock(lock: &ilb2->lock);
2633 }
2634
2635 return NULL;
2636}
2637
2638/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2639 * If "cur" is the last one in st->bucket,
2640 * call listening_get_first() to return the first sk of the next
2641 * non-empty bucket.
2642 */
2643static void *listening_get_next(struct seq_file *seq, void *cur)
2644{
2645 struct tcp_iter_state *st = seq->private;
2646 struct inet_listen_hashbucket *ilb2;
2647 struct hlist_nulls_node *node;
2648 struct inet_hashinfo *hinfo;
2649 struct sock *sk = cur;
2650
2651 ++st->num;
2652 ++st->offset;
2653
2654 sk = sk_nulls_next(sk);
2655 sk_nulls_for_each_from(sk, node) {
2656 if (seq_sk_match(seq, sk))
2657 return sk;
2658 }
2659
2660 hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2661 ilb2 = &hinfo->lhash2[st->bucket];
2662 spin_unlock(lock: &ilb2->lock);
2663 ++st->bucket;
2664 return listening_get_first(seq);
2665}
2666
2667static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2668{
2669 struct tcp_iter_state *st = seq->private;
2670 void *rc;
2671
2672 st->bucket = 0;
2673 st->offset = 0;
2674 rc = listening_get_first(seq);
2675
2676 while (rc && *pos) {
2677 rc = listening_get_next(seq, cur: rc);
2678 --*pos;
2679 }
2680 return rc;
2681}
2682
2683static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2684 const struct tcp_iter_state *st)
2685{
2686 return hlist_nulls_empty(h: &hinfo->ehash[st->bucket].chain);
2687}
2688
2689/*
2690 * Get first established socket starting from bucket given in st->bucket.
2691 * If st->bucket is zero, the very first socket in the hash is returned.
2692 */
2693static void *established_get_first(struct seq_file *seq)
2694{
2695 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2696 struct tcp_iter_state *st = seq->private;
2697
2698 st->offset = 0;
2699 for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2700 struct sock *sk;
2701 struct hlist_nulls_node *node;
2702 spinlock_t *lock = inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket);
2703
2704 cond_resched();
2705
2706 /* Lockless fast path for the common case of empty buckets */
2707 if (empty_bucket(hinfo, st))
2708 continue;
2709
2710 spin_lock_bh(lock);
2711 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2712 if (seq_sk_match(seq, sk))
2713 return sk;
2714 }
2715 spin_unlock_bh(lock);
2716 }
2717
2718 return NULL;
2719}
2720
2721static void *established_get_next(struct seq_file *seq, void *cur)
2722{
2723 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2724 struct tcp_iter_state *st = seq->private;
2725 struct hlist_nulls_node *node;
2726 struct sock *sk = cur;
2727
2728 ++st->num;
2729 ++st->offset;
2730
2731 sk = sk_nulls_next(sk);
2732
2733 sk_nulls_for_each_from(sk, node) {
2734 if (seq_sk_match(seq, sk))
2735 return sk;
2736 }
2737
2738 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
2739 ++st->bucket;
2740 return established_get_first(seq);
2741}
2742
2743static void *established_get_idx(struct seq_file *seq, loff_t pos)
2744{
2745 struct tcp_iter_state *st = seq->private;
2746 void *rc;
2747
2748 st->bucket = 0;
2749 rc = established_get_first(seq);
2750
2751 while (rc && pos) {
2752 rc = established_get_next(seq, cur: rc);
2753 --pos;
2754 }
2755 return rc;
2756}
2757
2758static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2759{
2760 void *rc;
2761 struct tcp_iter_state *st = seq->private;
2762
2763 st->state = TCP_SEQ_STATE_LISTENING;
2764 rc = listening_get_idx(seq, pos: &pos);
2765
2766 if (!rc) {
2767 st->state = TCP_SEQ_STATE_ESTABLISHED;
2768 rc = established_get_idx(seq, pos);
2769 }
2770
2771 return rc;
2772}
2773
2774static void *tcp_seek_last_pos(struct seq_file *seq)
2775{
2776 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2777 struct tcp_iter_state *st = seq->private;
2778 int bucket = st->bucket;
2779 int offset = st->offset;
2780 int orig_num = st->num;
2781 void *rc = NULL;
2782
2783 switch (st->state) {
2784 case TCP_SEQ_STATE_LISTENING:
2785 if (st->bucket > hinfo->lhash2_mask)
2786 break;
2787 rc = listening_get_first(seq);
2788 while (offset-- && rc && bucket == st->bucket)
2789 rc = listening_get_next(seq, cur: rc);
2790 if (rc)
2791 break;
2792 st->bucket = 0;
2793 st->state = TCP_SEQ_STATE_ESTABLISHED;
2794 fallthrough;
2795 case TCP_SEQ_STATE_ESTABLISHED:
2796 if (st->bucket > hinfo->ehash_mask)
2797 break;
2798 rc = established_get_first(seq);
2799 while (offset-- && rc && bucket == st->bucket)
2800 rc = established_get_next(seq, cur: rc);
2801 }
2802
2803 st->num = orig_num;
2804
2805 return rc;
2806}
2807
2808void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2809{
2810 struct tcp_iter_state *st = seq->private;
2811 void *rc;
2812
2813 if (*pos && *pos == st->last_pos) {
2814 rc = tcp_seek_last_pos(seq);
2815 if (rc)
2816 goto out;
2817 }
2818
2819 st->state = TCP_SEQ_STATE_LISTENING;
2820 st->num = 0;
2821 st->bucket = 0;
2822 st->offset = 0;
2823 rc = *pos ? tcp_get_idx(seq, pos: *pos - 1) : SEQ_START_TOKEN;
2824
2825out:
2826 st->last_pos = *pos;
2827 return rc;
2828}
2829EXPORT_IPV6_MOD(tcp_seq_start);
2830
2831void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2832{
2833 struct tcp_iter_state *st = seq->private;
2834 void *rc = NULL;
2835
2836 if (v == SEQ_START_TOKEN) {
2837 rc = tcp_get_idx(seq, pos: 0);
2838 goto out;
2839 }
2840
2841 switch (st->state) {
2842 case TCP_SEQ_STATE_LISTENING:
2843 rc = listening_get_next(seq, cur: v);
2844 if (!rc) {
2845 st->state = TCP_SEQ_STATE_ESTABLISHED;
2846 st->bucket = 0;
2847 st->offset = 0;
2848 rc = established_get_first(seq);
2849 }
2850 break;
2851 case TCP_SEQ_STATE_ESTABLISHED:
2852 rc = established_get_next(seq, cur: v);
2853 break;
2854 }
2855out:
2856 ++*pos;
2857 st->last_pos = *pos;
2858 return rc;
2859}
2860EXPORT_IPV6_MOD(tcp_seq_next);
2861
2862void tcp_seq_stop(struct seq_file *seq, void *v)
2863{
2864 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2865 struct tcp_iter_state *st = seq->private;
2866
2867 switch (st->state) {
2868 case TCP_SEQ_STATE_LISTENING:
2869 if (v != SEQ_START_TOKEN)
2870 spin_unlock(lock: &hinfo->lhash2[st->bucket].lock);
2871 break;
2872 case TCP_SEQ_STATE_ESTABLISHED:
2873 if (v)
2874 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
2875 break;
2876 }
2877}
2878EXPORT_IPV6_MOD(tcp_seq_stop);
2879
2880static void get_openreq4(const struct request_sock *req,
2881 struct seq_file *f, int i)
2882{
2883 const struct inet_request_sock *ireq = inet_rsk(sk: req);
2884 long delta = req->rsk_timer.expires - jiffies;
2885
2886 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X"
2887 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2888 i,
2889 ireq->ir_loc_addr,
2890 ireq->ir_num,
2891 ireq->ir_rmt_addr,
2892 ntohs(ireq->ir_rmt_port),
2893 TCP_SYN_RECV,
2894 0, 0, /* could print option size, but that is af dependent. */
2895 1, /* timers active (only the expire timer) */
2896 jiffies_delta_to_clock_t(delta),
2897 req->num_timeout,
2898 from_kuid_munged(to: seq_user_ns(seq: f),
2899 uid: sock_i_uid(sk: req->rsk_listener)),
2900 0, /* non standard timer */
2901 0, /* open_requests have no inode */
2902 0,
2903 req);
2904}
2905
2906static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2907{
2908 int timer_active;
2909 unsigned long timer_expires;
2910 const struct tcp_sock *tp = tcp_sk(sk);
2911 const struct inet_connection_sock *icsk = inet_csk(sk);
2912 const struct inet_sock *inet = inet_sk(sk);
2913 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2914 __be32 dest = inet->inet_daddr;
2915 __be32 src = inet->inet_rcv_saddr;
2916 __u16 destp = ntohs(inet->inet_dport);
2917 __u16 srcp = ntohs(inet->inet_sport);
2918 u8 icsk_pending;
2919 int rx_queue;
2920 int state;
2921
2922 icsk_pending = smp_load_acquire(&icsk->icsk_pending);
2923 if (icsk_pending == ICSK_TIME_RETRANS ||
2924 icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2925 icsk_pending == ICSK_TIME_LOSS_PROBE) {
2926 timer_active = 1;
2927 timer_expires = icsk_timeout(icsk);
2928 } else if (icsk_pending == ICSK_TIME_PROBE0) {
2929 timer_active = 4;
2930 timer_expires = icsk_timeout(icsk);
2931 } else if (timer_pending(timer: &sk->sk_timer)) {
2932 timer_active = 2;
2933 timer_expires = sk->sk_timer.expires;
2934 } else {
2935 timer_active = 0;
2936 timer_expires = jiffies;
2937 }
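	/* timer_active encodes the "tr" column of /proc/net/tcp:
	 * 1 retransmit/reorder/loss-probe timers, 4 zero window probe timer,
	 * 2 keepalive timer (sk_timer), 0 no timer pending; TIME_WAIT
	 * entries are printed with 3 by get_timewait4_sock() below.
	 */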
2938
2939 state = inet_sk_state_load(sk);
2940 if (state == TCP_LISTEN)
2941 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2942 else
2943 /* Because we don't lock the socket,
2944 * we might find a transient negative value.
2945 */
2946 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2947 READ_ONCE(tp->copied_seq), 0);
2948
2949 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2950 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2951 i, src, srcp, dest, destp, state,
2952 READ_ONCE(tp->write_seq) - tp->snd_una,
2953 rx_queue,
2954 timer_active,
2955 jiffies_delta_to_clock_t(delta: timer_expires - jiffies),
2956 icsk->icsk_retransmits,
2957 from_kuid_munged(to: seq_user_ns(seq: f), uid: sock_i_uid(sk)),
2958 icsk->icsk_probes_out,
2959 sock_i_ino(sk),
2960 refcount_read(r: &sk->sk_refcnt), sk,
2961 jiffies_to_clock_t(x: icsk->icsk_rto),
2962 jiffies_to_clock_t(x: icsk->icsk_ack.ato),
2963 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2964 tcp_snd_cwnd(tp),
2965 state == TCP_LISTEN ?
2966 fastopenq->max_qlen :
2967 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2968}
2969
2970static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2971 struct seq_file *f, int i)
2972{
2973 long delta = tw->tw_timer.expires - jiffies;
2974 __be32 dest, src;
2975 __u16 destp, srcp;
2976
2977 dest = tw->tw_daddr;
2978 src = tw->tw_rcv_saddr;
2979 destp = ntohs(tw->tw_dport);
2980 srcp = ntohs(tw->tw_sport);
2981
2982 seq_printf(m: f, fmt: "%4d: %08X:%04X %08X:%04X"
2983 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2984 i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
2985 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2986 refcount_read(r: &tw->tw_refcnt), tw);
2987}
2988
2989#define TMPSZ 150
2990
2991static int tcp4_seq_show(struct seq_file *seq, void *v)
2992{
2993 struct tcp_iter_state *st;
2994 struct sock *sk = v;
2995
2996 seq_setwidth(m: seq, TMPSZ - 1);
2997 if (v == SEQ_START_TOKEN) {
2998 seq_puts(m: seq, s: " sl local_address rem_address st tx_queue "
2999 "rx_queue tr tm->when retrnsmt uid timeout "
3000 "inode");
3001 goto out;
3002 }
3003 st = seq->private;
3004
3005 if (sk->sk_state == TCP_TIME_WAIT)
3006 get_timewait4_sock(tw: v, f: seq, i: st->num);
3007 else if (sk->sk_state == TCP_NEW_SYN_RECV)
3008 get_openreq4(req: v, f: seq, i: st->num);
3009 else
3010 get_tcp4_sock(sk: v, f: seq, i: st->num);
3011out:
3012 seq_pad(m: seq, c: '\n');
3013 return 0;
3014}
3015
3016#ifdef CONFIG_BPF_SYSCALL
3017struct bpf_tcp_iter_state {
3018 struct tcp_iter_state state;
3019 unsigned int cur_sk;
3020 unsigned int end_sk;
3021 unsigned int max_sk;
3022 struct sock **batch;
3023 bool st_bucket_done;
3024};
3025
3026struct bpf_iter__tcp {
3027 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3028 __bpf_md_ptr(struct sock_common *, sk_common);
3029 uid_t uid __aligned(8);
3030};
3031
3032static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3033 struct sock_common *sk_common, uid_t uid)
3034{
3035 struct bpf_iter__tcp ctx;
3036
3037 meta->seq_num--; /* skip SEQ_START_TOKEN */
3038 ctx.meta = meta;
3039 ctx.sk_common = sk_common;
3040 ctx.uid = uid;
3041 return bpf_iter_run_prog(prog, ctx: &ctx);
3042}
3043
3044static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
3045{
3046 while (iter->cur_sk < iter->end_sk)
3047 sock_gen_put(sk: iter->batch[iter->cur_sk++]);
3048}
3049
3050static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
3051 unsigned int new_batch_sz)
3052{
3053 struct sock **new_batch;
3054
3055 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
3056 GFP_USER | __GFP_NOWARN);
3057 if (!new_batch)
3058 return -ENOMEM;
3059
3060 bpf_iter_tcp_put_batch(iter);
3061 kvfree(addr: iter->batch);
3062 iter->batch = new_batch;
3063 iter->max_sk = new_batch_sz;
3064
3065 return 0;
3066}
3067
3068static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
3069 struct sock *start_sk)
3070{
3071 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3072 struct bpf_tcp_iter_state *iter = seq->private;
3073 struct tcp_iter_state *st = &iter->state;
3074 struct hlist_nulls_node *node;
3075 unsigned int expected = 1;
3076 struct sock *sk;
3077
3078 sock_hold(sk: start_sk);
3079 iter->batch[iter->end_sk++] = start_sk;
3080
3081 sk = sk_nulls_next(sk: start_sk);
3082 sk_nulls_for_each_from(sk, node) {
3083 if (seq_sk_match(seq, sk)) {
3084 if (iter->end_sk < iter->max_sk) {
3085 sock_hold(sk);
3086 iter->batch[iter->end_sk++] = sk;
3087 }
3088 expected++;
3089 }
3090 }
3091 spin_unlock(lock: &hinfo->lhash2[st->bucket].lock);
3092
3093 return expected;
3094}
3095
3096static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
3097 struct sock *start_sk)
3098{
3099 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3100 struct bpf_tcp_iter_state *iter = seq->private;
3101 struct tcp_iter_state *st = &iter->state;
3102 struct hlist_nulls_node *node;
3103 unsigned int expected = 1;
3104 struct sock *sk;
3105
3106 sock_hold(sk: start_sk);
3107 iter->batch[iter->end_sk++] = start_sk;
3108
3109 sk = sk_nulls_next(sk: start_sk);
3110 sk_nulls_for_each_from(sk, node) {
3111 if (seq_sk_match(seq, sk)) {
3112 if (iter->end_sk < iter->max_sk) {
3113 sock_hold(sk);
3114 iter->batch[iter->end_sk++] = sk;
3115 }
3116 expected++;
3117 }
3118 }
3119 spin_unlock_bh(lock: inet_ehash_lockp(hashinfo: hinfo, hash: st->bucket));
3120
3121 return expected;
3122}
3123
3124static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
3125{
3126 struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
3127 struct bpf_tcp_iter_state *iter = seq->private;
3128 struct tcp_iter_state *st = &iter->state;
3129 unsigned int expected;
3130 bool resized = false;
3131 struct sock *sk;
3132
3133 /* The st->bucket is done. Directly advance to the next
3134 * bucket instead of having tcp_seek_last_pos() skip entries
3135 * one by one in the current bucket, only to find out
3136 * it has to advance to the next bucket anyway.
3137 */
3138 if (iter->st_bucket_done) {
3139 st->offset = 0;
3140 st->bucket++;
3141 if (st->state == TCP_SEQ_STATE_LISTENING &&
3142 st->bucket > hinfo->lhash2_mask) {
3143 st->state = TCP_SEQ_STATE_ESTABLISHED;
3144 st->bucket = 0;
3145 }
3146 }
3147
3148again:
3149 /* Get a new batch */
3150 iter->cur_sk = 0;
3151 iter->end_sk = 0;
3152 iter->st_bucket_done = false;
3153
3154 sk = tcp_seek_last_pos(seq);
3155 if (!sk)
3156 return NULL; /* Done */
3157
3158 if (st->state == TCP_SEQ_STATE_LISTENING)
3159 expected = bpf_iter_tcp_listening_batch(seq, start_sk: sk);
3160 else
3161 expected = bpf_iter_tcp_established_batch(seq, start_sk: sk);
3162
3163 if (iter->end_sk == expected) {
3164 iter->st_bucket_done = true;
3165 return sk;
3166 }
3167
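	/* iter->end_sk != expected means the batch array was too small for
	 * this bucket: grow it to roughly 1.5x the observed bucket size and
	 * retry once.  If it still does not fit, return the partial batch
	 * with st_bucket_done left false.
	 */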
3168 if (!resized && !bpf_iter_tcp_realloc_batch(iter, new_batch_sz: expected * 3 / 2)) {
3169 resized = true;
3170 goto again;
3171 }
3172
3173 return sk;
3174}
3175
3176static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
3177{
3178 /* bpf iter does not support lseek, so it always
3179 * continues from where it was stop()-ped.
3180 */
3181 if (*pos)
3182 return bpf_iter_tcp_batch(seq);
3183
3184 return SEQ_START_TOKEN;
3185}
3186
3187static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3188{
3189 struct bpf_tcp_iter_state *iter = seq->private;
3190 struct tcp_iter_state *st = &iter->state;
3191 struct sock *sk;
3192
3193 /* Whenever seq_next() is called, the sk at iter->cur_sk has
3194 * already been shown by seq_show(), so advance to the next sk in
3195 * the batch.
3196 */
3197 if (iter->cur_sk < iter->end_sk) {
3198 /* Keeping st->num consistent in tcp_iter_state.
3199 * bpf_iter_tcp does not use st->num.
3200 * meta.seq_num is used instead.
3201 */
3202 st->num++;
3203 /* Move st->offset to the next sk in the bucket such that
3204 * the future start() will resume at st->offset in
3205 * st->bucket. See tcp_seek_last_pos().
3206 */
3207 st->offset++;
3208 sock_gen_put(sk: iter->batch[iter->cur_sk++]);
3209 }
3210
3211 if (iter->cur_sk < iter->end_sk)
3212 sk = iter->batch[iter->cur_sk];
3213 else
3214 sk = bpf_iter_tcp_batch(seq);
3215
3216 ++*pos;
3217 /* Keeping st->last_pos consistent in tcp_iter_state.
3218 * bpf iter does not do lseek, so st->last_pos always equals *pos.
3219 */
3220 st->last_pos = *pos;
3221 return sk;
3222}
3223
3224static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
3225{
3226 struct bpf_iter_meta meta;
3227 struct bpf_prog *prog;
3228 struct sock *sk = v;
3229 uid_t uid;
3230 int ret;
3231
3232 if (v == SEQ_START_TOKEN)
3233 return 0;
3234
3235 if (sk_fullsock(sk))
3236 lock_sock(sk);
3237
3238 if (unlikely(sk_unhashed(sk))) {
3239 ret = SEQ_SKIP;
3240 goto unlock;
3241 }
3242
3243 if (sk->sk_state == TCP_TIME_WAIT) {
3244 uid = 0;
3245 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
3246 const struct request_sock *req = v;
3247
3248 uid = from_kuid_munged(to: seq_user_ns(seq),
3249 uid: sock_i_uid(sk: req->rsk_listener));
3250 } else {
3251 uid = from_kuid_munged(to: seq_user_ns(seq), uid: sock_i_uid(sk));
3252 }
3253
3254 meta.seq = seq;
3255 prog = bpf_iter_get_info(meta: &meta, in_stop: false);
3256 ret = tcp_prog_seq_show(prog, meta: &meta, sk_common: v, uid);
3257
3258unlock:
3259 if (sk_fullsock(sk))
3260 release_sock(sk);
3261 return ret;
3262
3263}
3264
3265static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3266{
3267 struct bpf_tcp_iter_state *iter = seq->private;
3268 struct bpf_iter_meta meta;
3269 struct bpf_prog *prog;
3270
3271 if (!v) {
3272 meta.seq = seq;
3273 prog = bpf_iter_get_info(meta: &meta, in_stop: true);
3274 if (prog)
3275 (void)tcp_prog_seq_show(prog, meta: &meta, sk_common: v, uid: 0);
3276 }
3277
3278 if (iter->cur_sk < iter->end_sk) {
3279 bpf_iter_tcp_put_batch(iter);
3280 iter->st_bucket_done = false;
3281 }
3282}
3283
3284static const struct seq_operations bpf_iter_tcp_seq_ops = {
3285 .show = bpf_iter_tcp_seq_show,
3286 .start = bpf_iter_tcp_seq_start,
3287 .next = bpf_iter_tcp_seq_next,
3288 .stop = bpf_iter_tcp_seq_stop,
3289};
3290#endif
3291static unsigned short seq_file_family(const struct seq_file *seq)
3292{
3293 const struct tcp_seq_afinfo *afinfo;
3294
3295#ifdef CONFIG_BPF_SYSCALL
3296 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
3297 if (seq->op == &bpf_iter_tcp_seq_ops)
3298 return AF_UNSPEC;
3299#endif
3300
3301 /* Iterated from proc fs */
3302 afinfo = pde_data(inode: file_inode(f: seq->file));
3303 return afinfo->family;
3304}
3305
3306static const struct seq_operations tcp4_seq_ops = {
3307 .show = tcp4_seq_show,
3308 .start = tcp_seq_start,
3309 .next = tcp_seq_next,
3310 .stop = tcp_seq_stop,
3311};
3312
3313static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3314 .family = AF_INET,
3315};
3316
3317static int __net_init tcp4_proc_init_net(struct net *net)
3318{
3319 if (!proc_create_net_data(name: "tcp", mode: 0444, parent: net->proc_net, ops: &tcp4_seq_ops,
3320 state_size: sizeof(struct tcp_iter_state), data: &tcp4_seq_afinfo))
3321 return -ENOMEM;
3322 return 0;
3323}
3324
3325static void __net_exit tcp4_proc_exit_net(struct net *net)
3326{
3327 remove_proc_entry("tcp", net->proc_net);
3328}
3329
3330static struct pernet_operations tcp4_net_ops = {
3331 .init = tcp4_proc_init_net,
3332 .exit = tcp4_proc_exit_net,
3333};
3334
3335int __init tcp4_proc_init(void)
3336{
3337 return register_pernet_subsys(&tcp4_net_ops);
3338}
3339
3340void tcp4_proc_exit(void)
3341{
3342 unregister_pernet_subsys(&tcp4_net_ops);
3343}
3344#endif /* CONFIG_PROC_FS */
3345
3346/* @wake is one when sk_stream_write_space() calls us.
3347 * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3348 * This mimics the strategy used in sock_def_write_space().
3349 */
3350bool tcp_stream_memory_free(const struct sock *sk, int wake)
3351{
3352 const struct tcp_sock *tp = tcp_sk(sk);
3353 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3354 READ_ONCE(tp->snd_nxt);
3355
3356 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3357}
3358EXPORT_SYMBOL(tcp_stream_memory_free);
3359
3360struct proto tcp_prot = {
3361 .name = "TCP",
3362 .owner = THIS_MODULE,
3363 .close = tcp_close,
3364 .pre_connect = tcp_v4_pre_connect,
3365 .connect = tcp_v4_connect,
3366 .disconnect = tcp_disconnect,
3367 .accept = inet_csk_accept,
3368 .ioctl = tcp_ioctl,
3369 .init = tcp_v4_init_sock,
3370 .destroy = tcp_v4_destroy_sock,
3371 .shutdown = tcp_shutdown,
3372 .setsockopt = tcp_setsockopt,
3373 .getsockopt = tcp_getsockopt,
3374 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3375 .keepalive = tcp_set_keepalive,
3376 .recvmsg = tcp_recvmsg,
3377 .sendmsg = tcp_sendmsg,
3378 .splice_eof = tcp_splice_eof,
3379 .backlog_rcv = tcp_v4_do_rcv,
3380 .release_cb = tcp_release_cb,
3381 .hash = inet_hash,
3382 .unhash = inet_unhash,
3383 .get_port = inet_csk_get_port,
3384 .put_port = inet_put_port,
3385#ifdef CONFIG_BPF_SYSCALL
3386 .psock_update_sk_prot = tcp_bpf_update_proto,
3387#endif
3388 .enter_memory_pressure = tcp_enter_memory_pressure,
3389 .leave_memory_pressure = tcp_leave_memory_pressure,
3390 .stream_memory_free = tcp_stream_memory_free,
3391 .sockets_allocated = &tcp_sockets_allocated,
3392 .orphan_count = &tcp_orphan_count,
3393
3394 .memory_allocated = &tcp_memory_allocated,
3395 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3396
3397 .memory_pressure = &tcp_memory_pressure,
3398 .sysctl_mem = sysctl_tcp_mem,
3399 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3400 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3401 .max_header = MAX_TCP_HEADER,
3402 .obj_size = sizeof(struct tcp_sock),
3403 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3404 .twsk_prot = &tcp_timewait_sock_ops,
3405 .rsk_prot = &tcp_request_sock_ops,
3406 .h.hashinfo = NULL,
3407 .no_autobind = true,
3408 .diag_destroy = tcp_abort,
3409};
3410EXPORT_SYMBOL(tcp_prot);
3411
3412static void __net_exit tcp_sk_exit(struct net *net)
3413{
3414 if (net->ipv4.tcp_congestion_control)
3415 bpf_module_put(data: net->ipv4.tcp_congestion_control,
3416 owner: net->ipv4.tcp_congestion_control->owner);
3417}
3418
3419static void __net_init tcp_set_hashinfo(struct net *net)
3420{
3421 struct inet_hashinfo *hinfo;
3422 unsigned int ehash_entries;
3423 struct net *old_net;
3424
3425 if (net_eq(net1: net, net2: &init_net))
3426 goto fallback;
3427
3428 old_net = current->nsproxy->net_ns;
3429 ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3430 if (!ehash_entries)
3431 goto fallback;
3432
3433 ehash_entries = roundup_pow_of_two(ehash_entries);
3434 hinfo = inet_pernet_hashinfo_alloc(hashinfo: &tcp_hashinfo, ehash_entries);
3435 if (!hinfo) {
3436 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3437 "for a netns, fallback to the global one\n",
3438 ehash_entries);
3439fallback:
3440 hinfo = &tcp_hashinfo;
3441 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3442 }
3443
3444 net->ipv4.tcp_death_row.hashinfo = hinfo;
3445 net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3446 net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
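	/* Hypothetical example: sysctl_tcp_child_ehash_entries = 1000 is
	 * rounded up to 1024 ehash slots, which yields max_tw_buckets = 512
	 * and max_syn_backlog = max(128, 8) = 128 for the child netns.
	 */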
3447}
3448
3449static int __net_init tcp_sk_init(struct net *net)
3450{
3451 net->ipv4.sysctl_tcp_ecn = 2;
3452 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3453
3454 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3455 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3456 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3457 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3458 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3459
3460 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3461 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3462 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3463
3464 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3465 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3466 net->ipv4.sysctl_tcp_syncookies = 1;
3467 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3468 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3469 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3470 net->ipv4.sysctl_tcp_orphan_retries = 0;
3471 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3472 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3473 net->ipv4.sysctl_tcp_tw_reuse = 2;
3474 net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
3475 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3476
3477 refcount_set(r: &net->ipv4.tcp_death_row.tw_refcount, n: 1);
3478 tcp_set_hashinfo(net);
3479
3480 net->ipv4.sysctl_tcp_sack = 1;
3481 net->ipv4.sysctl_tcp_window_scaling = 1;
3482 net->ipv4.sysctl_tcp_timestamps = 1;
3483 net->ipv4.sysctl_tcp_early_retrans = 3;
3484 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3485 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3486 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3487 net->ipv4.sysctl_tcp_max_reordering = 300;
3488 net->ipv4.sysctl_tcp_dsack = 1;
3489 net->ipv4.sysctl_tcp_app_win = 31;
3490 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3491 net->ipv4.sysctl_tcp_frto = 2;
3492 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3493 /* This limits the percentage of the congestion window which we
3494 * will allow a single TSO frame to consume. Building TSO frames
3495 * which are too large can cause TCP streams to be bursty.
3496 */
3497 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3498 /* Default TSQ limit of 4 MB */
3499 net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
3500
3501 /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3502 net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3503
3504 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3505 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3506 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3507 net->ipv4.sysctl_tcp_autocorking = 1;
3508 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3509 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3510 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3511 if (net != &init_net) {
3512 memcpy(net->ipv4.sysctl_tcp_rmem,
3513 init_net.ipv4.sysctl_tcp_rmem,
3514 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3515 memcpy(net->ipv4.sysctl_tcp_wmem,
3516 init_net.ipv4.sysctl_tcp_wmem,
3517 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3518 }
3519 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3520 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3521 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3522 net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
3523 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3524 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3525 atomic_set(v: &net->ipv4.tfo_active_disable_times, i: 0);
3526
3527 /* Set default values for PLB */
3528 net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3529 net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3530 net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3531 net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3532 /* Default congestion threshold for PLB to mark a round is 50% */
3533 net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3534
3535 /* Reno is always built in */
3536 if (!net_eq(net1: net, net2: &init_net) &&
3537 bpf_try_module_get(data: init_net.ipv4.tcp_congestion_control,
3538 owner: init_net.ipv4.tcp_congestion_control->owner))
3539 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3540 else
3541 net->ipv4.tcp_congestion_control = &tcp_reno;
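	/* A child netns inherits init_net's congestion control algorithm
	 * when a reference on the owning module can be taken; otherwise it
	 * falls back to the always-built-in reno.
	 */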
3542
3543 net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3544 net->ipv4.sysctl_tcp_shrink_window = 0;
3545
3546 net->ipv4.sysctl_tcp_pingpong_thresh = 1;
3547 net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
3548 net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
3549
3550 return 0;
3551}
3552
3553static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3554{
3555 struct net *net;
3556
3557 /* Make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
3558 * and the failed setup_net error unwinding path are serialized.
3559 *
3560 * tcp_twsk_purge() handles twsk in any dead netns, not just those in
3561 * net_exit_list, so the thread that dismantles a particular twsk must
3562 * do so without another thread progressing to refcount_dec_and_test() of
3563 * tcp_death_row.tw_refcount.
3564 */
3565 mutex_lock(&tcp_exit_batch_mutex);
3566
3567 tcp_twsk_purge(net_exit_list);
3568
3569 list_for_each_entry(net, net_exit_list, exit_list) {
3570 inet_pernet_hashinfo_free(hashinfo: net->ipv4.tcp_death_row.hashinfo);
3571 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3572 tcp_fastopen_ctx_destroy(net);
3573 }
3574
3575 mutex_unlock(lock: &tcp_exit_batch_mutex);
3576}
3577
3578static struct pernet_operations __net_initdata tcp_sk_ops = {
3579 .init = tcp_sk_init,
3580 .exit = tcp_sk_exit,
3581 .exit_batch = tcp_sk_exit_batch,
3582};
3583
3584#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3585DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3586 struct sock_common *sk_common, uid_t uid)
3587
3588#define INIT_BATCH_SZ 16
3589
3590static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3591{
3592 struct bpf_tcp_iter_state *iter = priv_data;
3593 int err;
3594
3595 err = bpf_iter_init_seq_net(priv_data, aux);
3596 if (err)
3597 return err;
3598
3599 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3600 if (err) {
3601 bpf_iter_fini_seq_net(priv_data);
3602 return err;
3603 }
3604
3605 return 0;
3606}
3607
3608static void bpf_iter_fini_tcp(void *priv_data)
3609{
3610 struct bpf_tcp_iter_state *iter = priv_data;
3611
3612 bpf_iter_fini_seq_net(priv_data);
3613 kvfree(addr: iter->batch);
3614}
3615
3616static const struct bpf_iter_seq_info tcp_seq_info = {
3617 .seq_ops = &bpf_iter_tcp_seq_ops,
3618 .init_seq_private = bpf_iter_init_tcp,
3619 .fini_seq_private = bpf_iter_fini_tcp,
3620 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3621};
3622
3623static const struct bpf_func_proto *
3624bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3625 const struct bpf_prog *prog)
3626{
3627 switch (func_id) {
3628 case BPF_FUNC_setsockopt:
3629 return &bpf_sk_setsockopt_proto;
3630 case BPF_FUNC_getsockopt:
3631 return &bpf_sk_getsockopt_proto;
3632 default:
3633 return NULL;
3634 }
3635}
3636
3637static struct bpf_iter_reg tcp_reg_info = {
3638 .target = "tcp",
3639 .ctx_arg_info_size = 1,
3640 .ctx_arg_info = {
3641 { offsetof(struct bpf_iter__tcp, sk_common),
3642 PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3643 },
3644 .get_func_proto = bpf_iter_tcp_get_func_proto,
3645 .seq_info = &tcp_seq_info,
3646};
3647
3648static void __init bpf_iter_register(void)
3649{
3650 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3651 if (bpf_iter_reg_target(reg_info: &tcp_reg_info))
3652 pr_warn("Warning: could not register bpf iterator tcp\n");
3653}
3654
3655#endif
3656
3657void __init tcp_v4_init(void)
3658{
3659 int cpu, res;
3660
3661 for_each_possible_cpu(cpu) {
3662 struct sock *sk;
3663
3664 res = inet_ctl_sock_create(sk: &sk, PF_INET, type: SOCK_RAW,
3665 IPPROTO_TCP, net: &init_net);
3666 if (res)
3667 panic(fmt: "Failed to create the TCP control socket.\n");
3668 sock_set_flag(sk, flag: SOCK_USE_WRITE_QUEUE);
3669
3670 /* Please enforce IP_DF and IPID==0 for RST and
3671 * ACK sent in SYN-RECV and TIME-WAIT state.
3672 */
3673 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3674
3675 sk->sk_clockid = CLOCK_MONOTONIC;
3676
3677 per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
3678 }
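	/* These per-cpu control sockets are used for replies that have no
	 * full socket of their own (e.g. the RSTs and ACKs mentioned above),
	 * avoiding contention on a single shared socket.
	 */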
3679 if (register_pernet_subsys(&tcp_sk_ops))
3680 panic(fmt: "Failed to create the TCP control socket.\n");
3681
3682#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3683 bpf_iter_register();
3684#endif
3685}
3686

/* source code of linux/net/ipv4/tcp_ipv4.c */