1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19/*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48#define pr_fmt(fmt) "TCP: " fmt
49
50#include <linux/bottom_half.h>
51#include <linux/types.h>
52#include <linux/fcntl.h>
53#include <linux/module.h>
54#include <linux/random.h>
55#include <linux/cache.h>
56#include <linux/jhash.h>
57#include <linux/init.h>
58#include <linux/times.h>
59#include <linux/slab.h>
60
61#include <net/net_namespace.h>
62#include <net/icmp.h>
63#include <net/inet_hashtables.h>
64#include <net/tcp.h>
65#include <net/transp_v6.h>
66#include <net/ipv6.h>
67#include <net/inet_common.h>
68#include <net/timewait_sock.h>
69#include <net/xfrm.h>
70#include <net/secure_seq.h>
71#include <net/busy_poll.h>
72
73#include <linux/inet.h>
74#include <linux/ipv6.h>
75#include <linux/stddef.h>
76#include <linux/proc_fs.h>
77#include <linux/seq_file.h>
78#include <linux/inetdevice.h>
79#include <linux/btf_ids.h>
80
81#include <crypto/hash.h>
82#include <linux/scatterlist.h>
83
84#include <trace/events/tcp.h>
85
86#ifdef CONFIG_TCP_MD5SIG
87static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89#endif
90
91struct inet_hashinfo tcp_hashinfo;
92EXPORT_SYMBOL(tcp_hashinfo);
93
94static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
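/* Note: initial sequence numbers and timestamp offsets are derived from the
 * connection 4-tuple (plus a boot-time secret) via secure_tcp_seq() and
 * secure_tcp_ts_off(), so off-path attackers cannot easily predict them.
 */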
96static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97{
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
102}
103
104static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105{
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107}
108
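/* Called from the connect() path when the chosen 4-tuple is still held by a
 * TIME-WAIT socket: decide whether that TIME-WAIT socket can safely be
 * reused for the new connection. Returning 1 lets the caller take over the
 * port pair.
 */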
109int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110{
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 if (reuse == 2) {
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
120 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124#if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 loopback = true;
131 } else
132#endif
133 {
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
136 loopback = true;
137 }
138 if (!loopback)
139 reuse = 0;
140 }
141
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
145
146 Actually, the idea is close to VJ's: only the timestamp cache is
147 held not per host but per port pair, and the TW bucket is used as the
148 state holder.
149
150 If the TW bucket has already been destroyed we fall back to VJ's scheme
151 and use the initial timestamp retrieved from the peer table.
152 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
159 * process.
160 *
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
166 */
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170 if (!seq)
171 seq = 1;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 }
176 sock_hold(sktw);
177 return 1;
178 }
179
180 return 0;
181}
182EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 int addr_len)
186{
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent the BPF program called below from accessing bytes that are
189 * outside the bounds specified by the user in addr_len.
190 */
191 if (addr_len < sizeof(struct sockaddr_in))
192 return -EINVAL;
193
194 sock_owned_by_me(sk);
195
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197}
198
199/* This will initiate an outgoing connection. */
200int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201{
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
207 struct flowi4 *fl4;
208 struct rtable *rt;
209 int err;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212
213 if (addr_len < sizeof(struct sockaddr_in))
214 return -EINVAL;
215
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
218
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
223 if (!daddr)
224 return -EINVAL;
225 nexthop = inet_opt->opt.faddr;
226 }
227
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233 orig_dport, sk);
234 if (IS_ERR(rt)) {
235 err = PTR_ERR(rt);
236 if (err == -ENETUNREACH)
237 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 return err;
239 }
240
241 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 ip_rt_put(rt);
243 return -ENETUNREACH;
244 }
245
246 if (!inet_opt || !inet_opt->opt.srr)
247 daddr = fl4->daddr;
248
249 if (!inet->inet_saddr)
250 inet->inet_saddr = fl4->saddr;
251 sk_rcv_saddr_set(sk, inet->inet_saddr);
252
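	/* If this socket was previously connected to a different destination,
	 * drop the timestamp and sequence state inherited from that connection
	 * so the new one starts from a clean slate (unless TCP repair is active).
	 */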
253 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 /* Reset inherited state */
255 tp->rx_opt.ts_recent = 0;
256 tp->rx_opt.ts_recent_stamp = 0;
257 if (likely(!tp->repair))
258 WRITE_ONCE(tp->write_seq, 0);
259 }
260
261 inet->inet_dport = usin->sin_port;
262 sk_daddr_set(sk, daddr);
263
264 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 if (inet_opt)
266 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267
268 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269
270 /* Socket identity is still unknown (sport may be zero).
271 * However, we set the state to SYN-SENT and, without releasing the
272 * socket lock, select a source port, enter ourselves into the hash
273 * tables and complete initialization after this.
274 */
275 tcp_set_state(sk, TCP_SYN_SENT);
276 err = inet_hash_connect(tcp_death_row, sk);
277 if (err)
278 goto failure;
279
280 sk_set_txhash(sk);
281
282 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 inet->inet_sport, inet->inet_dport, sk);
284 if (IS_ERR(rt)) {
285 err = PTR_ERR(rt);
286 rt = NULL;
287 goto failure;
288 }
289 /* OK, now commit destination to socket. */
290 sk->sk_gso_type = SKB_GSO_TCPV4;
291 sk_setup_caps(sk, &rt->dst);
292 rt = NULL;
293
294 if (likely(!tp->repair)) {
295 if (!tp->write_seq)
296 WRITE_ONCE(tp->write_seq,
297 secure_tcp_seq(inet->inet_saddr,
298 inet->inet_daddr,
299 inet->inet_sport,
300 usin->sin_port));
301 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 inet->inet_saddr,
303 inet->inet_daddr);
304 }
305
306 inet->inet_id = prandom_u32();
307
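	/* With the TCP_FASTOPEN_CONNECT socket option, the SYN (possibly
	 * carrying data) is deferred until the first write;
	 * tcp_fastopen_defer_connect() returns true in that case so
	 * connect() returns early.
	 */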
308 if (tcp_fastopen_defer_connect(sk, &err))
309 return err;
310 if (err)
311 goto failure;
312
313 err = tcp_connect(sk);
314
315 if (err)
316 goto failure;
317
318 return 0;
319
320failure:
321 /*
322 * This unhashes the socket and releases the local port,
323 * if necessary.
324 */
325 tcp_set_state(sk, TCP_CLOSE);
326 ip_rt_put(rt);
327 sk->sk_route_caps = 0;
328 inet->inet_dport = 0;
329 return err;
330}
331EXPORT_SYMBOL(tcp_v4_connect);
332
333/*
334 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC 1191.
335 * It can be called through tcp_release_cb() if socket was owned by user
336 * at the time tcp_v4_err() was called to handle ICMP message.
337 */
338void tcp_v4_mtu_reduced(struct sock *sk)
339{
340 struct inet_sock *inet = inet_sk(sk);
341 struct dst_entry *dst;
342 u32 mtu;
343
344 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 return;
346 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347 dst = inet_csk_update_pmtu(sk, mtu);
348 if (!dst)
349 return;
350
351 /* Something is about to go wrong... Remember the soft error
352 * in case this connection is not able to recover.
353 */
354 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355 sk->sk_err_soft = EMSGSIZE;
356
357 mtu = dst_mtu(dst);
358
359 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360 ip_sk_accept_pmtu(sk) &&
361 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362 tcp_sync_mss(sk, mtu);
363
364 /* Resend the TCP packet because it's
365 * clear that the old packet has been
366 * dropped. This is the new "fast" path mtu
367 * discovery.
368 */
369 tcp_simple_retransmit(sk);
370 } /* else let the usual retransmit timer handle it */
371}
372EXPORT_SYMBOL(tcp_v4_mtu_reduced);
373
374static void do_redirect(struct sk_buff *skb, struct sock *sk)
375{
376 struct dst_entry *dst = __sk_dst_check(sk, 0);
377
378 if (dst)
379 dst->ops->redirect(dst, sk, skb);
380}
381
382
383/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385{
386 struct request_sock *req = inet_reqsk(sk);
387 struct net *net = sock_net(sk);
388
389 /* ICMPs are not backlogged, hence we cannot get
390 * an established socket here.
391 */
392 if (seq != tcp_rsk(req)->snt_isn) {
393 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 } else if (abort) {
395 /*
396 * Still in SYN_RECV, just remove it silently.
397 * There is no good way to pass the error to the newly
398 * created socket, and POSIX does not want network
399 * errors returned from accept().
400 */
401 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402 tcp_listendrop(req->rsk_listener);
403 }
404 reqsk_put(req);
405}
406EXPORT_SYMBOL(tcp_req_err);
407
408/* TCP-LD (RFC 6069) logic */
409void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410{
411 struct inet_connection_sock *icsk = inet_csk(sk);
412 struct tcp_sock *tp = tcp_sk(sk);
413 struct sk_buff *skb;
414 s32 remaining;
415 u32 delta_us;
416
417 if (sock_owned_by_user(sk))
418 return;
419
420 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
421 !icsk->icsk_backoff)
422 return;
423
424 skb = tcp_rtx_queue_head(sk);
425 if (WARN_ON_ONCE(!skb))
426 return;
427
428 icsk->icsk_backoff--;
429 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431
432 tcp_mstamp_refresh(tp);
433 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435
436 if (remaining > 0) {
437 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 remaining, TCP_RTO_MAX);
439 } else {
440 /* RTO revert clocked out retransmission.
441 * Will retransmit now.
442 */
443 tcp_retransmit_timer(sk);
444 }
445}
446EXPORT_SYMBOL(tcp_ld_RTO_revert);
447
448/*
449 * This routine is called by the ICMP module when it gets some
450 * sort of error condition. If err < 0 then the socket should
451 * be closed and the error returned to the user. If err > 0
452 * it's just the icmp type << 8 | icmp code. After adjustment
453 * header points to the first 8 bytes of the tcp header. We need
454 * to find the appropriate port.
455 *
456 * The locking strategy used here is very "optimistic". When
457 * someone else accesses the socket the ICMP is just dropped
458 * and for some paths there is no check at all.
459 * A more general error queue, used to queue errors for later
460 * handling, would probably be better.
461 *
462 */
463
464int tcp_v4_err(struct sk_buff *skb, u32 info)
465{
466 const struct iphdr *iph = (const struct iphdr *)skb->data;
467 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct tcp_sock *tp;
469 struct inet_sock *inet;
470 const int type = icmp_hdr(skb)->type;
471 const int code = icmp_hdr(skb)->code;
472 struct sock *sk;
473 struct request_sock *fastopen;
474 u32 seq, snd_una;
475 int err;
476 struct net *net = dev_net(skb->dev);
477
478 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479 th->dest, iph->saddr, ntohs(th->source),
480 inet_iif(skb), 0);
481 if (!sk) {
482 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483 return -ENOENT;
484 }
485 if (sk->sk_state == TCP_TIME_WAIT) {
486 inet_twsk_put(inet_twsk(sk));
487 return 0;
488 }
489 seq = ntohl(th->seq);
490 if (sk->sk_state == TCP_NEW_SYN_RECV) {
491 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492 type == ICMP_TIME_EXCEEDED ||
493 (type == ICMP_DEST_UNREACH &&
494 (code == ICMP_NET_UNREACH ||
495 code == ICMP_HOST_UNREACH)));
496 return 0;
497 }
498
499 bh_lock_sock(sk);
500 /* If too many ICMPs get dropped on busy
501 * servers this needs to be solved differently.
502 * We do take care of PMTU discovery (RFC 1191) special case:
503 * we can receive locally generated ICMP messages while socket is held.
504 */
505 if (sock_owned_by_user(sk)) {
506 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 }
509 if (sk->sk_state == TCP_CLOSE)
510 goto out;
511
512 if (static_branch_unlikely(&ip4_min_ttl)) {
513 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
514 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516 goto out;
517 }
518 }
519
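	/* Only act on the ICMP error if the quoted sequence number falls
	 * inside the current send window [snd_una, snd_nxt]; for a Fast Open
	 * server still in SYN-RECV, snd_una is taken from the request's
	 * initial sequence number.
	 */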
520 tp = tcp_sk(sk);
521 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 goto out;
528 }
529
530 switch (type) {
531 case ICMP_REDIRECT:
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
534 goto out;
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
537 goto out;
538 case ICMP_PARAMETERPROB:
539 err = EPROTO;
540 break;
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
543 goto out;
544
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 /* We are not interested in TCP_LISTEN and open_requests
547 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
548 * they should go through unfragmented).
549 */
550 if (sk->sk_state == TCP_LISTEN)
551 goto out;
552
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
556 } else {
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 sock_hold(sk);
559 }
560 goto out;
561 }
562
563 err = icmp_err_convert[code].errno;
564 /* check if this ICMP message allows revert of backoff.
565 * (see RFC 6069)
566 */
567 if (!fastopen &&
568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
570 break;
571 case ICMP_TIME_EXCEEDED:
572 err = EHOSTUNREACH;
573 break;
574 default:
575 goto out;
576 }
577
578 switch (sk->sk_state) {
579 case TCP_SYN_SENT:
580 case TCP_SYN_RECV:
581 /* Only in fast or simultaneous open. If a fast open socket is
582 * already accepted it is treated as a connected one below.
583 */
584 if (fastopen && !fastopen->sk)
585 break;
586
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
600
601 /* If we've already connected we will keep trying
602 * until we time out, or the user gives up.
603 *
604 * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
605 * to be considered hard errors (well, FRAG_FAILED too,
606 * but it is obsoleted by PMTU discovery).
607 *
608 * Note that in the modern internet, where routing is unreliable
609 * and broken firewalls sit in every dark corner sending random
610 * errors on behalf of their masters, even these two messages have
611 * lost their original meaning (even Linux sends invalid PORT_UNREACHs).
612 *
613 * Now we are in compliance with RFCs.
614 * --ANK (980905)
615 */
616
617 inet = inet_sk(sk);
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk_error_report(sk);
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
623 }
624
625out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629}
630
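/* Fill in the TCP pseudo-header checksum and point skb->csum_start /
 * skb->csum_offset at the TCP checksum field, so the device (or software
 * fallback) can finish the checksum, as in the CHECKSUM_PARTIAL scheme.
 */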
631void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632{
633 struct tcphdr *th = tcp_hdr(skb);
634
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
638}
639
640/* This routine computes an IPv4 TCP checksum. */
641void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642{
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646}
647EXPORT_SYMBOL(tcp_v4_send_check);
648
649/*
650 * This routine will send an RST to the other tcp.
651 *
652 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
653 * for the reset?
654 * Answer: if a packet caused the RST, it is not for a socket
655 * existing in our system; if it is matched to a socket,
656 * it is just a duplicate segment or a bug in the other side's TCP.
657 * So we build the reply based only on the parameters
658 * that arrived with the segment.
659 * Exception: precedence violation. We do not implement it in any case.
660 */
661
662#ifdef CONFIG_TCP_MD5SIG
663#define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664#else
665#define OPTION_BYTES sizeof(__be32)
666#endif
667
668static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669{
670 const struct tcphdr *th = tcp_hdr(skb);
671 struct {
672 struct tcphdr th;
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674 } rep;
675 struct ip_reply_arg arg;
676#ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
680 int genhash;
681 struct sock *sk1 = NULL;
682#endif
683 u64 transmit_time = 0;
684 struct sock *ctl_sk;
685 struct net *net;
686
687 /* Never send a reset in response to a reset. */
688 if (th->rst)
689 return;
690
691 /* If sk is not NULL, it means we did a successful lookup and the
692 * incoming route had to be correct. prequeue might have dropped our dst.
693 */
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 return;
696
697 /* Swap the send and the receive. */
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
702 rep.th.rst = 1;
703
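	/* Per RFC 793 reset generation: if the incoming segment carried an
	 * ACK, the RST takes its sequence number from that ack_seq; otherwise
	 * the RST itself carries an ACK covering everything the offending
	 * segment occupied in sequence space.
	 */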
704 if (th->ack) {
705 rep.th.seq = th->ack_seq;
706 } else {
707 rep.th.ack = 1;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
710 }
711
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
715
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717#ifdef CONFIG_TCP_MD5SIG
718 rcu_read_lock();
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
722 int l3index;
723
724 /* sdif set, means packet ingressed via a device
725 * in an L3 domain and inet_iif is set to it.
726 */
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
734 int l3index;
735
736 /*
737 * The active side is lost. Try to find the listening socket via the
738 * source port, and then find the md5 key via that listening socket.
739 * We do not lose any security here:
740 * the incoming packet is checked against the md5 hash computed with the
741 * key we find; no RST is generated if the hashes don't match.
742 */
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 ip_hdr(skb)->saddr,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747 /* don't send rst if it can't find key */
748 if (!sk1)
749 goto out;
750
751 /* sdif set, means packet ingressed via a device
752 * in an L3 domain and dif is set to it.
753 */
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 if (!key)
758 goto out;
759
760
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 goto out;
764
765 }
766
767 if (key) {
768 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 (TCPOPT_NOP << 16) |
770 (TCPOPT_MD5SIG << 8) |
771 TCPOLEN_MD5SIG);
772 /* Update length and the length the header thinks exists */
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
775
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
779 }
780#endif
781 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
784
785 if (mrst) {
786 rep.opt[0] = mrst;
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
789 }
790 }
791
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr, /* XXX */
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
798 /* When the socket is gone, all binding information is lost and
799 * routing might fail. No choice here: if we force the input
800 * interface, we will misroute in case of an asymmetric route.
801 */
802 if (sk) {
803 arg.bound_dev_if = sk->sk_bound_dev_if;
804 if (sk_fullsock(sk))
805 trace_tcp_send_reset(sk, skb);
806 }
807
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 local_bh_disable();
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
816 if (sk) {
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
822 xfrm_sk_clone_policy(ctl_sk, sk);
823 }
824 ip_send_unicast_reply(ctl_sk,
825 skb, &TCP_SKB_CB(skb)->header.h4.opt,
826 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
827 &arg, arg.iov[0].iov_len,
828 transmit_time);
829
830 ctl_sk->sk_mark = 0;
831 xfrm_sk_free_policy(ctl_sk);
832 sock_net_set(ctl_sk, &init_net);
833 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
834 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
835 local_bh_enable();
836
837#ifdef CONFIG_TCP_MD5SIG
838out:
839 rcu_read_unlock();
840#endif
841}
842
843/* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
844 outside of socket context, is certainly ugly. What can I do?
845 */
846
847static void tcp_v4_send_ack(const struct sock *sk,
848 struct sk_buff *skb, u32 seq, u32 ack,
849 u32 win, u32 tsval, u32 tsecr, int oif,
850 struct tcp_md5sig_key *key,
851 int reply_flags, u8 tos)
852{
853 const struct tcphdr *th = tcp_hdr(skb);
854 struct {
855 struct tcphdr th;
856 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
857#ifdef CONFIG_TCP_MD5SIG
858 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
859#endif
860 ];
861 } rep;
862 struct net *net = sock_net(sk);
863 struct ip_reply_arg arg;
864 struct sock *ctl_sk;
865 u64 transmit_time;
866
867 memset(&rep.th, 0, sizeof(struct tcphdr));
868 memset(&arg, 0, sizeof(arg));
869
870 arg.iov[0].iov_base = (unsigned char *)&rep;
871 arg.iov[0].iov_len = sizeof(rep.th);
872 if (tsecr) {
873 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
874 (TCPOPT_TIMESTAMP << 8) |
875 TCPOLEN_TIMESTAMP);
876 rep.opt[1] = htonl(tsval);
877 rep.opt[2] = htonl(tsecr);
878 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
879 }
880
881 /* Swap the send and the receive. */
882 rep.th.dest = th->source;
883 rep.th.source = th->dest;
884 rep.th.doff = arg.iov[0].iov_len / 4;
885 rep.th.seq = htonl(seq);
886 rep.th.ack_seq = htonl(ack);
887 rep.th.ack = 1;
888 rep.th.window = htons(win);
889
890#ifdef CONFIG_TCP_MD5SIG
891 if (key) {
892 int offset = (tsecr) ? 3 : 0;
893
894 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
895 (TCPOPT_NOP << 16) |
896 (TCPOPT_MD5SIG << 8) |
897 TCPOLEN_MD5SIG);
898 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
899 rep.th.doff = arg.iov[0].iov_len/4;
900
901 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
902 key, ip_hdr(skb)->saddr,
903 ip_hdr(skb)->daddr, &rep.th);
904 }
905#endif
906 arg.flags = reply_flags;
907 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
908 ip_hdr(skb)->saddr, /* XXX */
909 arg.iov[0].iov_len, IPPROTO_TCP, 0);
910 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
911 if (oif)
912 arg.bound_dev_if = oif;
913 arg.tos = tos;
914 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
915 local_bh_disable();
916 ctl_sk = this_cpu_read(ipv4_tcp_sk);
917 sock_net_set(ctl_sk, net);
918 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
919 inet_twsk(sk)->tw_mark : sk->sk_mark;
920 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
921 inet_twsk(sk)->tw_priority : sk->sk_priority;
922 transmit_time = tcp_transmit_time(sk);
923 ip_send_unicast_reply(ctl_sk,
924 skb, &TCP_SKB_CB(skb)->header.h4.opt,
925 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
926 &arg, arg.iov[0].iov_len,
927 transmit_time);
928
929 ctl_sk->sk_mark = 0;
930 sock_net_set(ctl_sk, &init_net);
931 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
932 local_bh_enable();
933}
934
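/* Reply on behalf of a TIME-WAIT socket: the ACK is built from the stored
 * timewait state (tw_snd_nxt, tw_rcv_nxt, saved timestamps) rather than
 * from a full socket.
 */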
935static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
936{
937 struct inet_timewait_sock *tw = inet_twsk(sk);
938 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
939
940 tcp_v4_send_ack(sk, skb,
941 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
942 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
943 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
944 tcptw->tw_ts_recent,
945 tw->tw_bound_dev_if,
946 tcp_twsk_md5_key(tcptw),
947 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
948 tw->tw_tos
949 );
950
951 inet_twsk_put(tw);
952}
953
954static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
955 struct request_sock *req)
956{
957 const union tcp_md5_addr *addr;
958 int l3index;
959
960 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
961 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
962 */
963 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
964 tcp_sk(sk)->snd_nxt;
965
966 /* RFC 7323 2.3
967 * The window field (SEG.WND) of every outgoing segment, with the
968 * exception of <SYN> segments, MUST be right-shifted by
969 * Rcv.Wind.Shift bits:
970 */
971 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
972 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
973 tcp_v4_send_ack(sk, skb, seq,
974 tcp_rsk(req)->rcv_nxt,
975 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
976 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
977 req->ts_recent,
978 0,
979 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
980 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
981 ip_hdr(skb)->tos);
982}
983
984/*
985 * Send a SYN-ACK after having received a SYN.
986 * This still operates on a request_sock only, not on a big
987 * socket.
988 */
989static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
990 struct flowi *fl,
991 struct request_sock *req,
992 struct tcp_fastopen_cookie *foc,
993 enum tcp_synack_type synack_type,
994 struct sk_buff *syn_skb)
995{
996 const struct inet_request_sock *ireq = inet_rsk(req);
997 struct flowi4 fl4;
998 int err = -1;
999 struct sk_buff *skb;
1000 u8 tos;
1001
1002 /* First, grab a route. */
1003 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1004 return -1;
1005
1006 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1007
1008 if (skb) {
1009 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1010
1011 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1012 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1013 (inet_sk(sk)->tos & INET_ECN_MASK) :
1014 inet_sk(sk)->tos;
1015
1016 if (!INET_ECN_is_capable(tos) &&
1017 tcp_bpf_ca_needs_ecn((struct sock *)req))
1018 tos |= INET_ECN_ECT_0;
1019
1020 rcu_read_lock();
1021 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1022 ireq->ir_rmt_addr,
1023 rcu_dereference(ireq->ireq_opt),
1024 tos);
1025 rcu_read_unlock();
1026 err = net_xmit_eval(err);
1027 }
1028
1029 return err;
1030}
1031
1032/*
1033 * IPv4 request_sock destructor.
1034 */
1035static void tcp_v4_reqsk_destructor(struct request_sock *req)
1036{
1037 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1038}
1039
1040#ifdef CONFIG_TCP_MD5SIG
1041/*
1042 * RFC2385 MD5 checksumming requires a mapping of
1043 * IP address->MD5 Key.
1044 * We need to maintain these in the sk structure.
1045 */
1046
1047DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1048EXPORT_SYMBOL(tcp_md5_needed);
1049
1050static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1051{
1052 if (!old)
1053 return true;
1054
1055 /* l3index always overrides non-l3index */
1056 if (old->l3index && new->l3index == 0)
1057 return false;
1058 if (old->l3index == 0 && new->l3index)
1059 return true;
1060
1061 return old->prefixlen < new->prefixlen;
1062}
1063
1064/* Find the Key structure for an address. */
1065struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1066 const union tcp_md5_addr *addr,
1067 int family)
1068{
1069 const struct tcp_sock *tp = tcp_sk(sk);
1070 struct tcp_md5sig_key *key;
1071 const struct tcp_md5sig_info *md5sig;
1072 __be32 mask;
1073 struct tcp_md5sig_key *best_match = NULL;
1074 bool match;
1075
1076 /* caller either holds rcu_read_lock() or socket lock */
1077 md5sig = rcu_dereference_check(tp->md5sig_info,
1078 lockdep_sock_is_held(sk));
1079 if (!md5sig)
1080 return NULL;
1081
1082 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1083 lockdep_sock_is_held(sk)) {
1084 if (key->family != family)
1085 continue;
1086 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1087 continue;
1088 if (family == AF_INET) {
1089 mask = inet_make_mask(key->prefixlen);
1090 match = (key->addr.a4.s_addr & mask) ==
1091 (addr->a4.s_addr & mask);
1092#if IS_ENABLED(CONFIG_IPV6)
1093 } else if (family == AF_INET6) {
1094 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1095 key->prefixlen);
1096#endif
1097 } else {
1098 match = false;
1099 }
1100
1101 if (match && better_md5_match(best_match, key))
1102 best_match = key;
1103 }
1104 return best_match;
1105}
1106EXPORT_SYMBOL(__tcp_md5_do_lookup);
1107
1108static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1109 const union tcp_md5_addr *addr,
1110 int family, u8 prefixlen,
1111 int l3index, u8 flags)
1112{
1113 const struct tcp_sock *tp = tcp_sk(sk);
1114 struct tcp_md5sig_key *key;
1115 unsigned int size = sizeof(struct in_addr);
1116 const struct tcp_md5sig_info *md5sig;
1117
1118 /* caller either holds rcu_read_lock() or socket lock */
1119 md5sig = rcu_dereference_check(tp->md5sig_info,
1120 lockdep_sock_is_held(sk));
1121 if (!md5sig)
1122 return NULL;
1123#if IS_ENABLED(CONFIG_IPV6)
1124 if (family == AF_INET6)
1125 size = sizeof(struct in6_addr);
1126#endif
1127 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1128 lockdep_sock_is_held(sk)) {
1129 if (key->family != family)
1130 continue;
1131 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1132 continue;
1133 if (key->l3index != l3index)
1134 continue;
1135 if (!memcmp(&key->addr, addr, size) &&
1136 key->prefixlen == prefixlen)
1137 return key;
1138 }
1139 return NULL;
1140}
1141
1142struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1143 const struct sock *addr_sk)
1144{
1145 const union tcp_md5_addr *addr;
1146 int l3index;
1147
1148 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1149 addr_sk->sk_bound_dev_if);
1150 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1151 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1152}
1153EXPORT_SYMBOL(tcp_v4_md5_lookup);
1154
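/* MD5 key lists are read under rcu_read_lock() in the packet paths, while
 * the additions and removals below run under the socket lock; hence the
 * _rcu list helpers and kfree_rcu() used throughout.
 */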
1155/* This can be called on a newly created socket, from other files */
1156int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1157 int family, u8 prefixlen, int l3index, u8 flags,
1158 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1159{
1160 /* Add Key to the list */
1161 struct tcp_md5sig_key *key;
1162 struct tcp_sock *tp = tcp_sk(sk);
1163 struct tcp_md5sig_info *md5sig;
1164
1165 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1166 if (key) {
1167 /* Pre-existing entry - just update that one.
1168 * Note that the key might be used concurrently.
1169 * data_race() tells KCSAN that we do not care about
1170 * key mismatches, since changing the MD5 key on live flows
1171 * can lead to packet drops.
1172 */
1173 data_race(memcpy(key->key, newkey, newkeylen));
1174
1175 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1176 * Also note that a reader could catch new key->keylen value
1177 * but old key->key[], this is the reason we use __GFP_ZERO
1178 * at sock_kmalloc() time below these lines.
1179 */
1180 WRITE_ONCE(key->keylen, newkeylen);
1181
1182 return 0;
1183 }
1184
1185 md5sig = rcu_dereference_protected(tp->md5sig_info,
1186 lockdep_sock_is_held(sk));
1187 if (!md5sig) {
1188 md5sig = kmalloc(sizeof(*md5sig), gfp);
1189 if (!md5sig)
1190 return -ENOMEM;
1191
1192 sk_gso_disable(sk);
1193 INIT_HLIST_HEAD(&md5sig->head);
1194 rcu_assign_pointer(tp->md5sig_info, md5sig);
1195 }
1196
1197 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1198 if (!key)
1199 return -ENOMEM;
1200 if (!tcp_alloc_md5sig_pool()) {
1201 sock_kfree_s(sk, key, sizeof(*key));
1202 return -ENOMEM;
1203 }
1204
1205 memcpy(key->key, newkey, newkeylen);
1206 key->keylen = newkeylen;
1207 key->family = family;
1208 key->prefixlen = prefixlen;
1209 key->l3index = l3index;
1210 key->flags = flags;
1211 memcpy(&key->addr, addr,
1212 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1213 sizeof(struct in_addr));
1214 hlist_add_head_rcu(&key->node, &md5sig->head);
1215 return 0;
1216}
1217EXPORT_SYMBOL(tcp_md5_do_add);
1218
1219int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1220 u8 prefixlen, int l3index, u8 flags)
1221{
1222 struct tcp_md5sig_key *key;
1223
1224 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1225 if (!key)
1226 return -ENOENT;
1227 hlist_del_rcu(&key->node);
1228 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1229 kfree_rcu(key, rcu);
1230 return 0;
1231}
1232EXPORT_SYMBOL(tcp_md5_do_del);
1233
1234static void tcp_clear_md5_list(struct sock *sk)
1235{
1236 struct tcp_sock *tp = tcp_sk(sk);
1237 struct tcp_md5sig_key *key;
1238 struct hlist_node *n;
1239 struct tcp_md5sig_info *md5sig;
1240
1241 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1242
1243 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1244 hlist_del_rcu(&key->node);
1245 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1246 kfree_rcu(key, rcu);
1247 }
1248}
1249
1250static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1251 sockptr_t optval, int optlen)
1252{
1253 struct tcp_md5sig cmd;
1254 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1255 const union tcp_md5_addr *addr;
1256 u8 prefixlen = 32;
1257 int l3index = 0;
1258 u8 flags;
1259
1260 if (optlen < sizeof(cmd))
1261 return -EINVAL;
1262
1263 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1264 return -EFAULT;
1265
1266 if (sin->sin_family != AF_INET)
1267 return -EINVAL;
1268
1269 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1270
1271 if (optname == TCP_MD5SIG_EXT &&
1272 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1273 prefixlen = cmd.tcpm_prefixlen;
1274 if (prefixlen > 32)
1275 return -EINVAL;
1276 }
1277
1278 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1279 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1280 struct net_device *dev;
1281
1282 rcu_read_lock();
1283 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1284 if (dev && netif_is_l3_master(dev))
1285 l3index = dev->ifindex;
1286
1287 rcu_read_unlock();
1288
1289 /* ok to reference set/not set outside of rcu;
1290 * right now device MUST be an L3 master
1291 */
1292 if (!dev || !l3index)
1293 return -EINVAL;
1294 }
1295
1296 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1297
1298 if (!cmd.tcpm_keylen)
1299 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1300
1301 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1302 return -EINVAL;
1303
1304 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1305 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1306}
1307
1308static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1309 __be32 daddr, __be32 saddr,
1310 const struct tcphdr *th, int nbytes)
1311{
1312 struct tcp4_pseudohdr *bp;
1313 struct scatterlist sg;
1314 struct tcphdr *_th;
1315
1316 bp = hp->scratch;
1317 bp->saddr = saddr;
1318 bp->daddr = daddr;
1319 bp->pad = 0;
1320 bp->protocol = IPPROTO_TCP;
1321 bp->len = cpu_to_be16(nbytes);
1322
1323 _th = (struct tcphdr *)(bp + 1);
1324 memcpy(_th, th, sizeof(*th));
1325 _th->check = 0;
1326
1327 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1328 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1329 sizeof(*bp) + sizeof(*th));
1330 return crypto_ahash_update(hp->md5_req);
1331}
1332
1333static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1334 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1335{
1336 struct tcp_md5sig_pool *hp;
1337 struct ahash_request *req;
1338
1339 hp = tcp_get_md5sig_pool();
1340 if (!hp)
1341 goto clear_hash_noput;
1342 req = hp->md5_req;
1343
1344 if (crypto_ahash_init(req))
1345 goto clear_hash;
1346 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1347 goto clear_hash;
1348 if (tcp_md5_hash_key(hp, key))
1349 goto clear_hash;
1350 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1351 if (crypto_ahash_final(req))
1352 goto clear_hash;
1353
1354 tcp_put_md5sig_pool();
1355 return 0;
1356
1357clear_hash:
1358 tcp_put_md5sig_pool();
1359clear_hash_noput:
1360 memset(md5_hash, 0, 16);
1361 return 1;
1362}
1363
1364int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1365 const struct sock *sk,
1366 const struct sk_buff *skb)
1367{
1368 struct tcp_md5sig_pool *hp;
1369 struct ahash_request *req;
1370 const struct tcphdr *th = tcp_hdr(skb);
1371 __be32 saddr, daddr;
1372
1373 if (sk) { /* valid for establish/request sockets */
1374 saddr = sk->sk_rcv_saddr;
1375 daddr = sk->sk_daddr;
1376 } else {
1377 const struct iphdr *iph = ip_hdr(skb);
1378 saddr = iph->saddr;
1379 daddr = iph->daddr;
1380 }
1381
1382 hp = tcp_get_md5sig_pool();
1383 if (!hp)
1384 goto clear_hash_noput;
1385 req = hp->md5_req;
1386
1387 if (crypto_ahash_init(req))
1388 goto clear_hash;
1389
1390 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1391 goto clear_hash;
1392 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1393 goto clear_hash;
1394 if (tcp_md5_hash_key(hp, key))
1395 goto clear_hash;
1396 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1397 if (crypto_ahash_final(req))
1398 goto clear_hash;
1399
1400 tcp_put_md5sig_pool();
1401 return 0;
1402
1403clear_hash:
1404 tcp_put_md5sig_pool();
1405clear_hash_noput:
1406 memset(md5_hash, 0, 16);
1407 return 1;
1408}
1409EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1410
1411#endif
1412
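/* The request_sock ops below plug the IPv4-specific pieces (routing,
 * ISN/timestamp generation, SYN-ACK transmission, MD5 lookup) into the
 * address-family independent tcp_conn_request() path.
 */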
1413static void tcp_v4_init_req(struct request_sock *req,
1414 const struct sock *sk_listener,
1415 struct sk_buff *skb)
1416{
1417 struct inet_request_sock *ireq = inet_rsk(req);
1418 struct net *net = sock_net(sk_listener);
1419
1420 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1421 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1422 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1423}
1424
1425static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1426 struct sk_buff *skb,
1427 struct flowi *fl,
1428 struct request_sock *req)
1429{
1430 tcp_v4_init_req(req, sk, skb);
1431
1432 if (security_inet_conn_request(sk, skb, req))
1433 return NULL;
1434
1435 return inet_csk_route_req(sk, &fl->u.ip4, req);
1436}
1437
1438struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1439 .family = PF_INET,
1440 .obj_size = sizeof(struct tcp_request_sock),
1441 .rtx_syn_ack = tcp_rtx_synack,
1442 .send_ack = tcp_v4_reqsk_send_ack,
1443 .destructor = tcp_v4_reqsk_destructor,
1444 .send_reset = tcp_v4_send_reset,
1445 .syn_ack_timeout = tcp_syn_ack_timeout,
1446};
1447
1448const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1449 .mss_clamp = TCP_MSS_DEFAULT,
1450#ifdef CONFIG_TCP_MD5SIG
1451 .req_md5_lookup = tcp_v4_md5_lookup,
1452 .calc_md5_hash = tcp_v4_md5_hash_skb,
1453#endif
1454#ifdef CONFIG_SYN_COOKIES
1455 .cookie_init_seq = cookie_v4_init_sequence,
1456#endif
1457 .route_req = tcp_v4_route_req,
1458 .init_seq = tcp_v4_init_seq,
1459 .init_ts_off = tcp_v4_init_ts_off,
1460 .send_synack = tcp_v4_send_synack,
1461};
1462
1463int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1464{
1465 /* Never answer SYNs sent to broadcast or multicast */
1466 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1467 goto drop;
1468
1469 return tcp_conn_request(&tcp_request_sock_ops,
1470 &tcp_request_sock_ipv4_ops, sk, skb);
1471
1472drop:
1473 tcp_listendrop(sk);
1474 return 0;
1475}
1476EXPORT_SYMBOL(tcp_v4_conn_request);
1477
1478
1479/*
1480 * The three way handshake has completed - we got a valid synack -
1481 * now create the new socket.
1482 */
1483struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1484 struct request_sock *req,
1485 struct dst_entry *dst,
1486 struct request_sock *req_unhash,
1487 bool *own_req)
1488{
1489 struct inet_request_sock *ireq;
1490 bool found_dup_sk = false;
1491 struct inet_sock *newinet;
1492 struct tcp_sock *newtp;
1493 struct sock *newsk;
1494#ifdef CONFIG_TCP_MD5SIG
1495 const union tcp_md5_addr *addr;
1496 struct tcp_md5sig_key *key;
1497 int l3index;
1498#endif
1499 struct ip_options_rcu *inet_opt;
1500
1501 if (sk_acceptq_is_full(sk))
1502 goto exit_overflow;
1503
1504 newsk = tcp_create_openreq_child(sk, req, skb);
1505 if (!newsk)
1506 goto exit_nonewsk;
1507
1508 newsk->sk_gso_type = SKB_GSO_TCPV4;
1509 inet_sk_rx_dst_set(newsk, skb);
1510
1511 newtp = tcp_sk(newsk);
1512 newinet = inet_sk(newsk);
1513 ireq = inet_rsk(req);
1514 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1515 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1516 newsk->sk_bound_dev_if = ireq->ir_iif;
1517 newinet->inet_saddr = ireq->ir_loc_addr;
1518 inet_opt = rcu_dereference(ireq->ireq_opt);
1519 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1520 newinet->mc_index = inet_iif(skb);
1521 newinet->mc_ttl = ip_hdr(skb)->ttl;
1522 newinet->rcv_tos = ip_hdr(skb)->tos;
1523 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1524 if (inet_opt)
1525 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1526 newinet->inet_id = prandom_u32();
1527
1528 /* Set ToS of the new socket based upon the value of incoming SYN.
1529 * ECT bits are set later in tcp_init_transfer().
1530 */
1531 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1532 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1533
1534 if (!dst) {
1535 dst = inet_csk_route_child_sock(sk, newsk, req);
1536 if (!dst)
1537 goto put_and_exit;
1538 } else {
1539 /* syncookie case : see end of cookie_v4_check() */
1540 }
1541 sk_setup_caps(newsk, dst);
1542
1543 tcp_ca_openreq_child(newsk, dst);
1544
1545 tcp_sync_mss(newsk, dst_mtu(dst));
1546 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1547
1548 tcp_initialize_rcv_mss(newsk);
1549
1550#ifdef CONFIG_TCP_MD5SIG
1551 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1552 /* Copy over the MD5 key from the original socket */
1553 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1554 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1555 if (key) {
1556 /*
1557 * We're using one, so create a matching key
1558 * on the newsk structure. If we fail to get
1559 * memory, then we end up not copying the key
1560 * across. Shucks.
1561 */
1562 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1563 key->key, key->keylen, GFP_ATOMIC);
1564 sk_gso_disable(newsk);
1565 }
1566#endif
1567
1568 if (__inet_inherit_port(sk, newsk) < 0)
1569 goto put_and_exit;
1570 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1571 &found_dup_sk);
1572 if (likely(*own_req)) {
1573 tcp_move_syn(newtp, req);
1574 ireq->ireq_opt = NULL;
1575 } else {
1576 newinet->inet_opt = NULL;
1577
1578 if (!req_unhash && found_dup_sk) {
1579 /* This code path should be executed only in the
1580 * syncookie case
1581 */
1582 bh_unlock_sock(newsk);
1583 sock_put(newsk);
1584 newsk = NULL;
1585 }
1586 }
1587 return newsk;
1588
1589exit_overflow:
1590 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1591exit_nonewsk:
1592 dst_release(dst);
1593exit:
1594 tcp_listendrop(sk);
1595 return NULL;
1596put_and_exit:
1597 newinet->inet_opt = NULL;
1598 inet_csk_prepare_forced_close(newsk);
1599 tcp_done(newsk);
1600 goto exit;
1601}
1602EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1603
1604static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1605{
1606#ifdef CONFIG_SYN_COOKIES
1607 const struct tcphdr *th = tcp_hdr(skb);
1608
1609 if (!th->syn)
1610 sk = cookie_v4_check(sk, skb);
1611#endif
1612 return sk;
1613}
1614
1615u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1616 struct tcphdr *th, u32 *cookie)
1617{
1618 u16 mss = 0;
1619#ifdef CONFIG_SYN_COOKIES
1620 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1621 &tcp_request_sock_ipv4_ops, sk, th);
1622 if (mss) {
1623 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1624 tcp_synq_overflow(sk);
1625 }
1626#endif
1627 return mss;
1628}
1629
1630INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1631 u32));
1632/* The socket must have its spinlock held when we get
1633 * here, unless it is a TCP_LISTEN socket.
1634 *
1635 * We have a potential double-lock case here, so even when
1636 * doing backlog processing we use the BH locking scheme.
1637 * This is because we cannot sleep with the original spinlock
1638 * held.
1639 */
1640int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1641{
1642 enum skb_drop_reason reason;
1643 struct sock *rsk;
1644
1645 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1646 struct dst_entry *dst;
1647
1648 dst = rcu_dereference_protected(sk->sk_rx_dst,
1649 lockdep_sock_is_held(sk));
1650
1651 sock_rps_save_rxhash(sk, skb);
1652 sk_mark_napi_id(sk, skb);
1653 if (dst) {
1654 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1655 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1656 dst, 0)) {
1657 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1658 dst_release(dst);
1659 }
1660 }
1661 tcp_rcv_established(sk, skb);
1662 return 0;
1663 }
1664
1665 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1666 if (tcp_checksum_complete(skb))
1667 goto csum_err;
1668
1669 if (sk->sk_state == TCP_LISTEN) {
1670 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1671
1672 if (!nsk)
1673 goto discard;
1674 if (nsk != sk) {
1675 if (tcp_child_process(sk, nsk, skb)) {
1676 rsk = nsk;
1677 goto reset;
1678 }
1679 return 0;
1680 }
1681 } else
1682 sock_rps_save_rxhash(sk, skb);
1683
1684 if (tcp_rcv_state_process(sk, skb)) {
1685 rsk = sk;
1686 goto reset;
1687 }
1688 return 0;
1689
1690reset:
1691 tcp_v4_send_reset(rsk, skb);
1692discard:
1693 kfree_skb_reason(skb, reason);
1694 /* Be careful here. If this function gets more complicated and
1695 * gcc suffers from register pressure on the x86, sk (in %ebx)
1696 * might be destroyed here. This current version compiles correctly,
1697 * but you have been warned.
1698 */
1699 return 0;
1700
1701csum_err:
1702 reason = SKB_DROP_REASON_TCP_CSUM;
1703 trace_tcp_bad_csum(skb);
1704 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1705 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1706 goto discard;
1707}
1708EXPORT_SYMBOL(tcp_v4_do_rcv);
1709
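/* Early demux: at IP receive time, look up an established socket for this
 * segment and attach it (and its cached rx dst, when still valid) to the
 * skb so the main receive path can skip another lookup.
 */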
1710int tcp_v4_early_demux(struct sk_buff *skb)
1711{
1712 const struct iphdr *iph;
1713 const struct tcphdr *th;
1714 struct sock *sk;
1715
1716 if (skb->pkt_type != PACKET_HOST)
1717 return 0;
1718
1719 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1720 return 0;
1721
1722 iph = ip_hdr(skb);
1723 th = tcp_hdr(skb);
1724
1725 if (th->doff < sizeof(struct tcphdr) / 4)
1726 return 0;
1727
1728 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1729 iph->saddr, th->source,
1730 iph->daddr, ntohs(th->dest),
1731 skb->skb_iif, inet_sdif(skb));
1732 if (sk) {
1733 skb->sk = sk;
1734 skb->destructor = sock_edemux;
1735 if (sk_fullsock(sk)) {
1736 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1737
1738 if (dst)
1739 dst = dst_check(dst, 0);
1740 if (dst &&
1741 sk->sk_rx_dst_ifindex == skb->skb_iif)
1742 skb_dst_set_noref(skb, dst);
1743 }
1744 }
1745 return 0;
1746}
1747
1748bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1749 enum skb_drop_reason *reason)
1750{
1751 u32 limit, tail_gso_size, tail_gso_segs;
1752 struct skb_shared_info *shinfo;
1753 const struct tcphdr *th;
1754 struct tcphdr *thtail;
1755 struct sk_buff *tail;
1756 unsigned int hdrlen;
1757 bool fragstolen;
1758 u32 gso_segs;
1759 u32 gso_size;
1760 int delta;
1761
1762 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1763 * we can fix skb->truesize to its real value to avoid future drops.
1764 * This is valid because skb is not yet charged to the socket.
1765 * It has been noticed pure SACK packets were sometimes dropped
1766 * (if cooked by drivers without copybreak feature).
1767 */
1768 skb_condense(skb);
1769
1770 skb_dst_drop(skb);
1771
1772 if (unlikely(tcp_checksum_complete(skb))) {
1773 bh_unlock_sock(sk);
1774 trace_tcp_bad_csum(skb);
1775 *reason = SKB_DROP_REASON_TCP_CSUM;
1776 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1777 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1778 return true;
1779 }
1780
1781 /* Attempt coalescing to last skb in backlog, even if we are
1782 * above the limits.
1783 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1784 */
1785 th = (const struct tcphdr *)skb->data;
1786 hdrlen = th->doff * 4;
1787
1788 tail = sk->sk_backlog.tail;
1789 if (!tail)
1790 goto no_coalesce;
1791 thtail = (struct tcphdr *)tail->data;
1792
1793 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1794 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1795 ((TCP_SKB_CB(tail)->tcp_flags |
1796 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1797 !((TCP_SKB_CB(tail)->tcp_flags &
1798 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1799 ((TCP_SKB_CB(tail)->tcp_flags ^
1800 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1801#ifdef CONFIG_TLS_DEVICE
1802 tail->decrypted != skb->decrypted ||
1803#endif
1804 thtail->doff != th->doff ||
1805 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1806 goto no_coalesce;
1807
1808 __skb_pull(skb, hdrlen);
1809
1810 shinfo = skb_shinfo(skb);
1811 gso_size = shinfo->gso_size ?: skb->len;
1812 gso_segs = shinfo->gso_segs ?: 1;
1813
1814 shinfo = skb_shinfo(tail);
1815 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1816 tail_gso_segs = shinfo->gso_segs ?: 1;
1817
1818 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1819 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1820
1821 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1822 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1823 thtail->window = th->window;
1824 }
1825
1826 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1827 * thtail->fin, so that the fast path in tcp_rcv_established()
1828 * is not entered if we append a packet with a FIN.
1829 * SYN, RST, URG are not present.
1830 * ACK is set on both packets.
1831 * PSH : we do not really care in TCP stack,
1832 * at least for 'GRO' packets.
1833 */
1834 thtail->fin |= th->fin;
1835 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1836
1837 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1838 TCP_SKB_CB(tail)->has_rxtstamp = true;
1839 tail->tstamp = skb->tstamp;
1840 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1841 }
1842
1843 /* Not as strict as GRO. We only need to carry mss max value */
1844 shinfo->gso_size = max(gso_size, tail_gso_size);
1845 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1846
1847 sk->sk_backlog.len += delta;
1848 __NET_INC_STATS(sock_net(sk),
1849 LINUX_MIB_TCPBACKLOGCOALESCE);
1850 kfree_skb_partial(skb, fragstolen);
1851 return false;
1852 }
1853 __skb_push(skb, hdrlen);
1854
1855no_coalesce:
1856 /* Only the socket owner can try to collapse/prune rx queues
1857 * to reduce memory overhead, so add a little headroom here.
1858 * Only a few socket backlogs are likely to be non-empty at the same time.
1859 */
1860 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1861
1862 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1863 bh_unlock_sock(sk);
1864 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1865 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1866 return true;
1867 }
1868 return false;
1869}
1870EXPORT_SYMBOL(tcp_add_backlog);
1871
1872int tcp_filter(struct sock *sk, struct sk_buff *skb)
1873{
1874 struct tcphdr *th = (struct tcphdr *)skb->data;
1875
1876 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1877}
1878EXPORT_SYMBOL(tcp_filter);
1879
1880static void tcp_v4_restore_cb(struct sk_buff *skb)
1881{
1882 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1883 sizeof(struct inet_skb_parm));
1884}
1885
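/* IPCB(skb) and TCP_SKB_CB(skb) share skb->cb[]; tcp_v4_fill_cb() saves the
 * IP control block into header.h4 before the TCP fields are written, so
 * tcp_v4_restore_cb() can put it back when the skb must be re-processed
 * (e.g. fed to a different socket after a lookup race).
 */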
1886static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1887 const struct tcphdr *th)
1888{
1889 /* This is tricky: we move the IPCB to its correct location inside TCP_SKB_CB().
1890 * barrier() makes sure the compiler won't play fool^Waliasing games.
1891 */
1892 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1893 sizeof(struct inet_skb_parm));
1894 barrier();
1895
1896 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1897 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1898 skb->len - th->doff * 4);
1899 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1900 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1901 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1902 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1903 TCP_SKB_CB(skb)->sacked = 0;
1904 TCP_SKB_CB(skb)->has_rxtstamp =
1905 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1906}
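
/* [Editor's note -- illustrative only, not part of the kernel source]
 * A worked example of the end_seq arithmetic above, with made-up numbers:
 * SYN and FIN each consume one sequence number, so a segment with
 * seq = 1000, a 20-byte TCP header (doff = 5), 100 bytes of payload
 * (skb->len = 120) and the FIN flag set yields
 *
 *	end_seq = 1000 + 0 (syn) + 1 (fin) + 120 - 20 = 1101
 *
 * while a pure SYN with no payload ends up with end_seq = seq + 1.
 */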
1907
1908/*
1909 * From tcp_input.c
1910 */
1911
1912int tcp_v4_rcv(struct sk_buff *skb)
1913{
1914 struct net *net = dev_net(skb->dev);
1915 enum skb_drop_reason drop_reason;
1916 int sdif = inet_sdif(skb);
1917 int dif = inet_iif(skb);
1918 const struct iphdr *iph;
1919 const struct tcphdr *th;
1920 bool refcounted;
1921 struct sock *sk;
1922 int ret;
1923
1924 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1925 if (skb->pkt_type != PACKET_HOST)
1926 goto discard_it;
1927
1928 /* Count it even if it's bad */
1929 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1930
1931 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1932 goto discard_it;
1933
1934 th = (const struct tcphdr *)skb->data;
1935
1936 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1937 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1938 goto bad_packet;
1939 }
1940 if (!pskb_may_pull(skb, th->doff * 4))
1941 goto discard_it;
1942
1943 /* An explanation is required here, I think.
1944 * Packet length and doff are validated by header prediction,
1945 * provided the th->doff == 0 case has been eliminated.
1946 * So, we defer the checks. */
1947
1948 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1949 goto csum_error;
1950
1951 th = (const struct tcphdr *)skb->data;
1952 iph = ip_hdr(skb);
1953lookup:
1954 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1955 th->dest, sdif, &refcounted);
1956 if (!sk)
1957 goto no_tcp_socket;
1958
1959process:
1960 if (sk->sk_state == TCP_TIME_WAIT)
1961 goto do_time_wait;
1962
1963 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1964 struct request_sock *req = inet_reqsk(sk);
1965 bool req_stolen = false;
1966 struct sock *nsk;
1967
1968 sk = req->rsk_listener;
1969 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1970 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1971 else
1972 drop_reason = tcp_inbound_md5_hash(sk, skb,
1973 &iph->saddr, &iph->daddr,
1974 AF_INET, dif, sdif);
1975 if (unlikely(drop_reason)) {
1976 sk_drops_add(sk, skb);
1977 reqsk_put(req);
1978 goto discard_it;
1979 }
1980 if (tcp_checksum_complete(skb)) {
1981 reqsk_put(req);
1982 goto csum_error;
1983 }
1984 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1985 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1986 if (!nsk) {
1987 inet_csk_reqsk_queue_drop_and_put(sk, req);
1988 goto lookup;
1989 }
1990 sk = nsk;
1991 /* reuseport_migrate_sock() has already taken one sk_refcnt
1992 * reference before returning.
1993 */
1994 } else {
1995 /* We own a reference on the listener; increase it again
1996 * as we might lose it too soon.
1997 */
1998 sock_hold(sk);
1999 }
2000 refcounted = true;
2001 nsk = NULL;
2002 if (!tcp_filter(sk, skb)) {
2003 th = (const struct tcphdr *)skb->data;
2004 iph = ip_hdr(skb);
2005 tcp_v4_fill_cb(skb, iph, th);
2006 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2007 } else {
2008 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2009 }
2010 if (!nsk) {
2011 reqsk_put(req);
2012 if (req_stolen) {
2013 /* Another cpu got exclusive access to req
2014 * and created a full-blown socket.
2015 * Try to feed this packet to that socket
2016 * instead of discarding it.
2017 */
2018 tcp_v4_restore_cb(skb);
2019 sock_put(sk);
2020 goto lookup;
2021 }
2022 goto discard_and_relse;
2023 }
2024 nf_reset_ct(skb);
2025 if (nsk == sk) {
2026 reqsk_put(req);
2027 tcp_v4_restore_cb(skb);
2028 } else if (tcp_child_process(sk, nsk, skb)) {
2029 tcp_v4_send_reset(nsk, skb);
2030 goto discard_and_relse;
2031 } else {
2032 sock_put(sk);
2033 return 0;
2034 }
2035 }
2036
2037 if (static_branch_unlikely(&ip4_min_ttl)) {
2038 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2039 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2040 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2041 goto discard_and_relse;
2042 }
2043 }
2044
2045 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2046 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2047 goto discard_and_relse;
2048 }
2049
2050 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2051 &iph->daddr, AF_INET, dif, sdif);
2052 if (drop_reason)
2053 goto discard_and_relse;
2054
2055 nf_reset_ct(skb);
2056
2057 if (tcp_filter(sk, skb)) {
2058 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2059 goto discard_and_relse;
2060 }
2061 th = (const struct tcphdr *)skb->data;
2062 iph = ip_hdr(skb);
2063 tcp_v4_fill_cb(skb, iph, th);
2064
2065 skb->dev = NULL;
2066
2067 if (sk->sk_state == TCP_LISTEN) {
2068 ret = tcp_v4_do_rcv(sk, skb);
2069 goto put_and_return;
2070 }
2071
2072 sk_incoming_cpu_update(sk);
2073
2074 bh_lock_sock_nested(sk);
2075 tcp_segs_in(tcp_sk(sk), skb);
2076 ret = 0;
2077 if (!sock_owned_by_user(sk)) {
2078 ret = tcp_v4_do_rcv(sk, skb);
2079 } else {
2080 if (tcp_add_backlog(sk, skb, &drop_reason))
2081 goto discard_and_relse;
2082 }
2083 bh_unlock_sock(sk);
2084
2085put_and_return:
2086 if (refcounted)
2087 sock_put(sk);
2088
2089 return ret;
2090
2091no_tcp_socket:
2092 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2093 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2094 goto discard_it;
2095
2096 tcp_v4_fill_cb(skb, iph, th);
2097
2098 if (tcp_checksum_complete(skb)) {
2099csum_error:
2100 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2101 trace_tcp_bad_csum(skb);
2102 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2103bad_packet:
2104 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2105 } else {
2106 tcp_v4_send_reset(NULL, skb);
2107 }
2108
2109discard_it:
2110 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2111 /* Discard frame. */
2112 kfree_skb_reason(skb, drop_reason);
2113 return 0;
2114
2115discard_and_relse:
2116 sk_drops_add(sk, skb);
2117 if (refcounted)
2118 sock_put(sk);
2119 goto discard_it;
2120
2121do_time_wait:
2122 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2123 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2124 inet_twsk_put(inet_twsk(sk));
2125 goto discard_it;
2126 }
2127
2128 tcp_v4_fill_cb(skb, iph, th);
2129
2130 if (tcp_checksum_complete(skb)) {
2131 inet_twsk_put(inet_twsk(sk));
2132 goto csum_error;
2133 }
2134 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2135 case TCP_TW_SYN: {
2136 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2137 &tcp_hashinfo, skb,
2138 __tcp_hdrlen(th),
2139 iph->saddr, th->source,
2140 iph->daddr, th->dest,
2141 inet_iif(skb),
2142 sdif);
2143 if (sk2) {
2144 inet_twsk_deschedule_put(inet_twsk(sk));
2145 sk = sk2;
2146 tcp_v4_restore_cb(skb);
2147 refcounted = false;
2148 goto process;
2149 }
2150 }
2151 /* to ACK */
2152 fallthrough;
2153 case TCP_TW_ACK:
2154 tcp_v4_timewait_ack(sk, skb);
2155 break;
2156 case TCP_TW_RST:
2157 tcp_v4_send_reset(sk, skb);
2158 inet_twsk_deschedule_put(inet_twsk(sk));
2159 goto discard_it;
2160 case TCP_TW_SUCCESS:;
2161 }
2162 goto discard_it;
2163}
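
/* [Editor's note -- descriptive summary, not part of the kernel source]
 * tcp_v4_rcv() above is the IPv4 receive entry point: it validates the
 * header and checksum, looks the skb up in tcp_hashinfo, completes the
 * 3WHS for TCP_NEW_SYN_RECV request sockets via tcp_check_req(), hands
 * TIME_WAIT sockets to tcp_timewait_state_process(), and for full
 * sockets applies the min-TTL, XFRM policy, MD5 and socket-filter
 * checks before either processing the segment directly with
 * tcp_v4_do_rcv() (socket not owned by user) or queueing it to the
 * backlog with tcp_add_backlog() for the owner to process on release.
 */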
2164
2165static struct timewait_sock_ops tcp_timewait_sock_ops = {
2166 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2167 .twsk_unique = tcp_twsk_unique,
2168 .twsk_destructor= tcp_twsk_destructor,
2169};
2170
2171void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2172{
2173 struct dst_entry *dst = skb_dst(skb);
2174
2175 if (dst && dst_hold_safe(dst)) {
2176 rcu_assign_pointer(sk->sk_rx_dst, dst);
2177 sk->sk_rx_dst_ifindex = skb->skb_iif;
2178 }
2179}
2180EXPORT_SYMBOL(inet_sk_rx_dst_set);
2181
2182const struct inet_connection_sock_af_ops ipv4_specific = {
2183 .queue_xmit = ip_queue_xmit,
2184 .send_check = tcp_v4_send_check,
2185 .rebuild_header = inet_sk_rebuild_header,
2186 .sk_rx_dst_set = inet_sk_rx_dst_set,
2187 .conn_request = tcp_v4_conn_request,
2188 .syn_recv_sock = tcp_v4_syn_recv_sock,
2189 .net_header_len = sizeof(struct iphdr),
2190 .setsockopt = ip_setsockopt,
2191 .getsockopt = ip_getsockopt,
2192 .addr2sockaddr = inet_csk_addr2sockaddr,
2193 .sockaddr_len = sizeof(struct sockaddr_in),
2194 .mtu_reduced = tcp_v4_mtu_reduced,
2195};
2196EXPORT_SYMBOL(ipv4_specific);
2197
2198#ifdef CONFIG_TCP_MD5SIG
2199static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2200 .md5_lookup = tcp_v4_md5_lookup,
2201 .calc_md5_hash = tcp_v4_md5_hash_skb,
2202 .md5_parse = tcp_v4_parse_md5_keys,
2203};
2204#endif
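
/* [Editor's note -- illustrative only, not part of the kernel source]
 * The ops above back the RFC 2385 TCP MD5 signature option.  A hedged
 * userspace sketch of installing a key, assuming the UAPI layout of
 * struct tcp_md5sig from <linux/tcp.h> ('fd' and the peer address are
 * placeholders):
 *
 *	#include <linux/tcp.h>
 *	#include <sys/socket.h>
 *	#include <arpa/inet.h>
 *	#include <string.h>
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	peer->sin_addr.s_addr = inet_addr("192.0.2.1");
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */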
2205
2206/* NOTE: A lot of fields are already set to zero by the call to
2207 * sk_alloc(), so they need not be initialized here.
2208 */
2209static int tcp_v4_init_sock(struct sock *sk)
2210{
2211 struct inet_connection_sock *icsk = inet_csk(sk);
2212
2213 tcp_init_sock(sk);
2214
2215 icsk->icsk_af_ops = &ipv4_specific;
2216
2217#ifdef CONFIG_TCP_MD5SIG
2218 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2219#endif
2220
2221 return 0;
2222}
2223
2224void tcp_v4_destroy_sock(struct sock *sk)
2225{
2226 struct tcp_sock *tp = tcp_sk(sk);
2227
2228 trace_tcp_destroy_sock(sk);
2229
2230 tcp_clear_xmit_timers(sk);
2231
2232 tcp_cleanup_congestion_control(sk);
2233
2234 tcp_cleanup_ulp(sk);
2235
2236 /* Clean up the write buffer. */
2237 tcp_write_queue_purge(sk);
2238
2239 /* Check if we want to disable active TFO */
2240 tcp_fastopen_active_disable_ofo_check(sk);
2241
2242 /* Cleans up our, hopefully empty, out_of_order_queue. */
2243 skb_rbtree_purge(&tp->out_of_order_queue);
2244
2245#ifdef CONFIG_TCP_MD5SIG
2246 /* Clean up the MD5 key list, if any */
2247 if (tp->md5sig_info) {
2248 tcp_clear_md5_list(sk);
2249 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2250 tp->md5sig_info = NULL;
2251 }
2252#endif
2253
2254 /* Clean up a referenced TCP bind bucket. */
2255 if (inet_csk(sk)->icsk_bind_hash)
2256 inet_put_port(sk);
2257
2258 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2259
2260 /* If socket is aborted during connect operation */
2261 tcp_free_fastopen_req(tp);
2262 tcp_fastopen_destroy_cipher(sk);
2263 tcp_saved_syn_free(tp);
2264
2265 sk_sockets_allocated_dec(sk);
2266}
2267EXPORT_SYMBOL(tcp_v4_destroy_sock);
2268
2269#ifdef CONFIG_PROC_FS
2270/* Proc filesystem TCP sock list dumping. */
2271
2272static unsigned short seq_file_family(const struct seq_file *seq);
2273
2274static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2275{
2276 unsigned short family = seq_file_family(seq);
2277
2278 /* AF_UNSPEC is used as a match-all */
2279 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2280 net_eq(sock_net(sk), seq_file_net(seq)));
2281}
2282
2283/* Find a non-empty bucket (starting from st->bucket)
2284 * and return the first sk from it.
2285 */
2286static void *listening_get_first(struct seq_file *seq)
2287{
2288 struct tcp_iter_state *st = seq->private;
2289
2290 st->offset = 0;
2291 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2292 struct inet_listen_hashbucket *ilb2;
2293 struct hlist_nulls_node *node;
2294 struct sock *sk;
2295
2296 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2297 if (hlist_nulls_empty(&ilb2->nulls_head))
2298 continue;
2299
2300 spin_lock(&ilb2->lock);
2301 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2302 if (seq_sk_match(seq, sk))
2303 return sk;
2304 }
2305 spin_unlock(&ilb2->lock);
2306 }
2307
2308 return NULL;
2309}
2310
2311/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2312 * If "cur" is the last one in the st->bucket,
2313 * call listening_get_first() to return the first sk of the next
2314 * non empty bucket.
2315 */
2316static void *listening_get_next(struct seq_file *seq, void *cur)
2317{
2318 struct tcp_iter_state *st = seq->private;
2319 struct inet_listen_hashbucket *ilb2;
2320 struct hlist_nulls_node *node;
2321 struct sock *sk = cur;
2322
2323 ++st->num;
2324 ++st->offset;
2325
2326 sk = sk_nulls_next(sk);
2327 sk_nulls_for_each_from(sk, node) {
2328 if (seq_sk_match(seq, sk))
2329 return sk;
2330 }
2331
2332 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2333 spin_unlock(&ilb2->lock);
2334 ++st->bucket;
2335 return listening_get_first(seq);
2336}
2337
2338static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2339{
2340 struct tcp_iter_state *st = seq->private;
2341 void *rc;
2342
2343 st->bucket = 0;
2344 st->offset = 0;
2345 rc = listening_get_first(seq);
2346
2347 while (rc && *pos) {
2348 rc = listening_get_next(seq, rc);
2349 --*pos;
2350 }
2351 return rc;
2352}
2353
2354static inline bool empty_bucket(const struct tcp_iter_state *st)
2355{
2356 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2357}
2358
2359/*
2360 * Get first established socket starting from bucket given in st->bucket.
2361 * If st->bucket is zero, the very first socket in the hash is returned.
2362 */
2363static void *established_get_first(struct seq_file *seq)
2364{
2365 struct tcp_iter_state *st = seq->private;
2366
2367 st->offset = 0;
2368 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2369 struct sock *sk;
2370 struct hlist_nulls_node *node;
2371 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2372
2373 /* Lockless fast path for the common case of empty buckets */
2374 if (empty_bucket(st))
2375 continue;
2376
2377 spin_lock_bh(lock);
2378 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2379 if (seq_sk_match(seq, sk))
2380 return sk;
2381 }
2382 spin_unlock_bh(lock);
2383 }
2384
2385 return NULL;
2386}
2387
2388static void *established_get_next(struct seq_file *seq, void *cur)
2389{
2390 struct sock *sk = cur;
2391 struct hlist_nulls_node *node;
2392 struct tcp_iter_state *st = seq->private;
2393
2394 ++st->num;
2395 ++st->offset;
2396
2397 sk = sk_nulls_next(sk);
2398
2399 sk_nulls_for_each_from(sk, node) {
2400 if (seq_sk_match(seq, sk))
2401 return sk;
2402 }
2403
2404 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2405 ++st->bucket;
2406 return established_get_first(seq);
2407}
2408
2409static void *established_get_idx(struct seq_file *seq, loff_t pos)
2410{
2411 struct tcp_iter_state *st = seq->private;
2412 void *rc;
2413
2414 st->bucket = 0;
2415 rc = established_get_first(seq);
2416
2417 while (rc && pos) {
2418 rc = established_get_next(seq, rc);
2419 --pos;
2420 }
2421 return rc;
2422}
2423
2424static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2425{
2426 void *rc;
2427 struct tcp_iter_state *st = seq->private;
2428
2429 st->state = TCP_SEQ_STATE_LISTENING;
2430 rc = listening_get_idx(seq, &pos);
2431
2432 if (!rc) {
2433 st->state = TCP_SEQ_STATE_ESTABLISHED;
2434 rc = established_get_idx(seq, pos);
2435 }
2436
2437 return rc;
2438}
2439
2440static void *tcp_seek_last_pos(struct seq_file *seq)
2441{
2442 struct tcp_iter_state *st = seq->private;
2443 int bucket = st->bucket;
2444 int offset = st->offset;
2445 int orig_num = st->num;
2446 void *rc = NULL;
2447
2448 switch (st->state) {
2449 case TCP_SEQ_STATE_LISTENING:
2450 if (st->bucket > tcp_hashinfo.lhash2_mask)
2451 break;
2452 st->state = TCP_SEQ_STATE_LISTENING;
2453 rc = listening_get_first(seq);
2454 while (offset-- && rc && bucket == st->bucket)
2455 rc = listening_get_next(seq, rc);
2456 if (rc)
2457 break;
2458 st->bucket = 0;
2459 st->state = TCP_SEQ_STATE_ESTABLISHED;
2460 fallthrough;
2461 case TCP_SEQ_STATE_ESTABLISHED:
2462 if (st->bucket > tcp_hashinfo.ehash_mask)
2463 break;
2464 rc = established_get_first(seq);
2465 while (offset-- && rc && bucket == st->bucket)
2466 rc = established_get_next(seq, rc);
2467 }
2468
2469 st->num = orig_num;
2470
2471 return rc;
2472}
2473
2474void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2475{
2476 struct tcp_iter_state *st = seq->private;
2477 void *rc;
2478
2479 if (*pos && *pos == st->last_pos) {
2480 rc = tcp_seek_last_pos(seq);
2481 if (rc)
2482 goto out;
2483 }
2484
2485 st->state = TCP_SEQ_STATE_LISTENING;
2486 st->num = 0;
2487 st->bucket = 0;
2488 st->offset = 0;
2489 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2490
2491out:
2492 st->last_pos = *pos;
2493 return rc;
2494}
2495EXPORT_SYMBOL(tcp_seq_start);
2496
2497void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2498{
2499 struct tcp_iter_state *st = seq->private;
2500 void *rc = NULL;
2501
2502 if (v == SEQ_START_TOKEN) {
2503 rc = tcp_get_idx(seq, 0);
2504 goto out;
2505 }
2506
2507 switch (st->state) {
2508 case TCP_SEQ_STATE_LISTENING:
2509 rc = listening_get_next(seq, v);
2510 if (!rc) {
2511 st->state = TCP_SEQ_STATE_ESTABLISHED;
2512 st->bucket = 0;
2513 st->offset = 0;
2514 rc = established_get_first(seq);
2515 }
2516 break;
2517 case TCP_SEQ_STATE_ESTABLISHED:
2518 rc = established_get_next(seq, v);
2519 break;
2520 }
2521out:
2522 ++*pos;
2523 st->last_pos = *pos;
2524 return rc;
2525}
2526EXPORT_SYMBOL(tcp_seq_next);
2527
2528void tcp_seq_stop(struct seq_file *seq, void *v)
2529{
2530 struct tcp_iter_state *st = seq->private;
2531
2532 switch (st->state) {
2533 case TCP_SEQ_STATE_LISTENING:
2534 if (v != SEQ_START_TOKEN)
2535 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2536 break;
2537 case TCP_SEQ_STATE_ESTABLISHED:
2538 if (v)
2539 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2540 break;
2541 }
2542}
2543EXPORT_SYMBOL(tcp_seq_stop);
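
/* [Editor's note -- descriptive summary, not part of the kernel source]
 * Locking hand-off for the iterators above: listening_get_first() and
 * established_get_first() return with the bucket lock held; the matching
 * *_get_next() helpers drop it only when they move on to the next bucket,
 * and tcp_seq_stop() releases whichever bucket lock is still held when
 * the walk stops in the middle of a bucket.
 */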
2544
2545static void get_openreq4(const struct request_sock *req,
2546 struct seq_file *f, int i)
2547{
2548 const struct inet_request_sock *ireq = inet_rsk(req);
2549 long delta = req->rsk_timer.expires - jiffies;
2550
2551 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2552 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2553 i,
2554 ireq->ir_loc_addr,
2555 ireq->ir_num,
2556 ireq->ir_rmt_addr,
2557 ntohs(ireq->ir_rmt_port),
2558 TCP_SYN_RECV,
2559 0, 0, /* could print option size, but that is af dependent. */
2560 1, /* timers active (only the expire timer) */
2561 jiffies_delta_to_clock_t(delta),
2562 req->num_timeout,
2563 from_kuid_munged(seq_user_ns(f),
2564 sock_i_uid(req->rsk_listener)),
2565 0, /* non standard timer */
2566 0, /* open_requests have no inode */
2567 0,
2568 req);
2569}
2570
2571static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2572{
2573 int timer_active;
2574 unsigned long timer_expires;
2575 const struct tcp_sock *tp = tcp_sk(sk);
2576 const struct inet_connection_sock *icsk = inet_csk(sk);
2577 const struct inet_sock *inet = inet_sk(sk);
2578 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2579 __be32 dest = inet->inet_daddr;
2580 __be32 src = inet->inet_rcv_saddr;
2581 __u16 destp = ntohs(inet->inet_dport);
2582 __u16 srcp = ntohs(inet->inet_sport);
2583 int rx_queue;
2584 int state;
2585
2586 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2587 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2588 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2589 timer_active = 1;
2590 timer_expires = icsk->icsk_timeout;
2591 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2592 timer_active = 4;
2593 timer_expires = icsk->icsk_timeout;
2594 } else if (timer_pending(&sk->sk_timer)) {
2595 timer_active = 2;
2596 timer_expires = sk->sk_timer.expires;
2597 } else {
2598 timer_active = 0;
2599 timer_expires = jiffies;
2600 }
2601
2602 state = inet_sk_state_load(sk);
2603 if (state == TCP_LISTEN)
2604 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2605 else
2606 /* Because we don't lock the socket,
2607 * we might find a transient negative value.
2608 */
2609 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2610 READ_ONCE(tp->copied_seq), 0);
2611
2612 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2613 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2614 i, src, srcp, dest, destp, state,
2615 READ_ONCE(tp->write_seq) - tp->snd_una,
2616 rx_queue,
2617 timer_active,
2618 jiffies_delta_to_clock_t(timer_expires - jiffies),
2619 icsk->icsk_retransmits,
2620 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2621 icsk->icsk_probes_out,
2622 sock_i_ino(sk),
2623 refcount_read(&sk->sk_refcnt), sk,
2624 jiffies_to_clock_t(icsk->icsk_rto),
2625 jiffies_to_clock_t(icsk->icsk_ack.ato),
2626 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2627 tcp_snd_cwnd(tp),
2628 state == TCP_LISTEN ?
2629 fastopenq->max_qlen :
2630 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2631}
2632
2633static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2634 struct seq_file *f, int i)
2635{
2636 long delta = tw->tw_timer.expires - jiffies;
2637 __be32 dest, src;
2638 __u16 destp, srcp;
2639
2640 dest = tw->tw_daddr;
2641 src = tw->tw_rcv_saddr;
2642 destp = ntohs(tw->tw_dport);
2643 srcp = ntohs(tw->tw_sport);
2644
2645 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2646 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2647 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2648 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2649 refcount_read(&tw->tw_refcnt), tw);
2650}
2651
2652#define TMPSZ 150
2653
2654static int tcp4_seq_show(struct seq_file *seq, void *v)
2655{
2656 struct tcp_iter_state *st;
2657 struct sock *sk = v;
2658
2659 seq_setwidth(seq, TMPSZ - 1);
2660 if (v == SEQ_START_TOKEN) {
2661 seq_puts(seq, " sl local_address rem_address st tx_queue "
2662 "rx_queue tr tm->when retrnsmt uid timeout "
2663 "inode");
2664 goto out;
2665 }
2666 st = seq->private;
2667
2668 if (sk->sk_state == TCP_TIME_WAIT)
2669 get_timewait4_sock(v, seq, st->num);
2670 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2671 get_openreq4(v, seq, st->num);
2672 else
2673 get_tcp4_sock(v, seq, st->num);
2674out:
2675 seq_pad(seq, '\n');
2676 return 0;
2677}
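
/* [Editor's note -- illustrative only, not part of the kernel source]
 * tcp4_seq_show() above produces the familiar /proc/net/tcp format.  A
 * hedged userspace sketch of parsing a few fields of each line (the
 * sscanf format is an assumption matching the seq_printf() calls above;
 * note that addresses are hex dumps of the __be32 values, so they look
 * byte-swapped on little-endian hosts):
 *
 *	#include <stdio.h>
 *
 *	FILE *f = fopen("/proc/net/tcp", "r");
 *	char line[256];
 *	unsigned int laddr, lport, raddr, rport, state;
 *
 *	fgets(line, sizeof(line), f);	// skip the header line
 *	while (fgets(line, sizeof(line), f))
 *		if (sscanf(line, " %*d: %8X:%4X %8X:%4X %2X",
 *			   &laddr, &lport, &raddr, &rport, &state) == 5)
 *			printf("st %02X local port %u\n", state, lport);
 *	fclose(f);
 */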
2678
2679#ifdef CONFIG_BPF_SYSCALL
2680struct bpf_tcp_iter_state {
2681 struct tcp_iter_state state;
2682 unsigned int cur_sk;
2683 unsigned int end_sk;
2684 unsigned int max_sk;
2685 struct sock **batch;
2686 bool st_bucket_done;
2687};
2688
2689struct bpf_iter__tcp {
2690 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2691 __bpf_md_ptr(struct sock_common *, sk_common);
2692 uid_t uid __aligned(8);
2693};
2694
2695static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2696 struct sock_common *sk_common, uid_t uid)
2697{
2698 struct bpf_iter__tcp ctx;
2699
2700 meta->seq_num--; /* skip SEQ_START_TOKEN */
2701 ctx.meta = meta;
2702 ctx.sk_common = sk_common;
2703 ctx.uid = uid;
2704 return bpf_iter_run_prog(prog, &ctx);
2705}
2706
2707static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2708{
2709 while (iter->cur_sk < iter->end_sk)
2710 sock_put(iter->batch[iter->cur_sk++]);
2711}
2712
2713static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2714 unsigned int new_batch_sz)
2715{
2716 struct sock **new_batch;
2717
2718 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2719 GFP_USER | __GFP_NOWARN);
2720 if (!new_batch)
2721 return -ENOMEM;
2722
2723 bpf_iter_tcp_put_batch(iter);
2724 kvfree(iter->batch);
2725 iter->batch = new_batch;
2726 iter->max_sk = new_batch_sz;
2727
2728 return 0;
2729}
2730
2731static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2732 struct sock *start_sk)
2733{
2734 struct bpf_tcp_iter_state *iter = seq->private;
2735 struct tcp_iter_state *st = &iter->state;
2736 struct hlist_nulls_node *node;
2737 unsigned int expected = 1;
2738 struct sock *sk;
2739
2740 sock_hold(start_sk);
2741 iter->batch[iter->end_sk++] = start_sk;
2742
2743 sk = sk_nulls_next(start_sk);
2744 sk_nulls_for_each_from(sk, node) {
2745 if (seq_sk_match(seq, sk)) {
2746 if (iter->end_sk < iter->max_sk) {
2747 sock_hold(sk);
2748 iter->batch[iter->end_sk++] = sk;
2749 }
2750 expected++;
2751 }
2752 }
2753 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2754
2755 return expected;
2756}
2757
2758static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2759 struct sock *start_sk)
2760{
2761 struct bpf_tcp_iter_state *iter = seq->private;
2762 struct tcp_iter_state *st = &iter->state;
2763 struct hlist_nulls_node *node;
2764 unsigned int expected = 1;
2765 struct sock *sk;
2766
2767 sock_hold(start_sk);
2768 iter->batch[iter->end_sk++] = start_sk;
2769
2770 sk = sk_nulls_next(start_sk);
2771 sk_nulls_for_each_from(sk, node) {
2772 if (seq_sk_match(seq, sk)) {
2773 if (iter->end_sk < iter->max_sk) {
2774 sock_hold(sk);
2775 iter->batch[iter->end_sk++] = sk;
2776 }
2777 expected++;
2778 }
2779 }
2780 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2781
2782 return expected;
2783}
2784
2785static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2786{
2787 struct bpf_tcp_iter_state *iter = seq->private;
2788 struct tcp_iter_state *st = &iter->state;
2789 unsigned int expected;
2790 bool resized = false;
2791 struct sock *sk;
2792
2793 /* The st->bucket is done. Directly advance to the next
2794 * bucket instead of having tcp_seek_last_pos() skip entries
2795 * one by one in the current bucket, only to find out
2796 * it has to advance to the next bucket.
2797 */
2798 if (iter->st_bucket_done) {
2799 st->offset = 0;
2800 st->bucket++;
2801 if (st->state == TCP_SEQ_STATE_LISTENING &&
2802 st->bucket > tcp_hashinfo.lhash2_mask) {
2803 st->state = TCP_SEQ_STATE_ESTABLISHED;
2804 st->bucket = 0;
2805 }
2806 }
2807
2808again:
2809 /* Get a new batch */
2810 iter->cur_sk = 0;
2811 iter->end_sk = 0;
2812 iter->st_bucket_done = false;
2813
2814 sk = tcp_seek_last_pos(seq);
2815 if (!sk)
2816 return NULL; /* Done */
2817
2818 if (st->state == TCP_SEQ_STATE_LISTENING)
2819 expected = bpf_iter_tcp_listening_batch(seq, sk);
2820 else
2821 expected = bpf_iter_tcp_established_batch(seq, sk);
2822
2823 if (iter->end_sk == expected) {
2824 iter->st_bucket_done = true;
2825 return sk;
2826 }
2827
2828 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2829 resized = true;
2830 goto again;
2831 }
2832
2833 return sk;
2834}
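
/* [Editor's note -- descriptive summary, not part of the kernel source]
 * Worked example of the batching above, with made-up numbers: starting
 * from INIT_BATCH_SZ (16), a bucket holding 40 matching sockets fills
 * the batch with 16 entries while the walk counts expected = 40.  Since
 * end_sk != expected, the batch is reallocated to 40 * 3 / 2 = 60
 * entries and the bucket is walked again; the second pass captures the
 * whole bucket and st_bucket_done is set.
 */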
2835
2836static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2837{
2838 /* bpf iter does not support lseek, so it always
2839 * continues from where it was stop()-ped.
2840 */
2841 if (*pos)
2842 return bpf_iter_tcp_batch(seq);
2843
2844 return SEQ_START_TOKEN;
2845}
2846
2847static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2848{
2849 struct bpf_tcp_iter_state *iter = seq->private;
2850 struct tcp_iter_state *st = &iter->state;
2851 struct sock *sk;
2852
2853 /* Whenever seq_next() is called, iter->cur_sk is
2854 * done with seq_show(), so advance to the next sk in
2855 * the batch.
2856 */
2857 if (iter->cur_sk < iter->end_sk) {
2858 /* Keeping st->num consistent in tcp_iter_state.
2859 * bpf_iter_tcp does not use st->num.
2860 * meta.seq_num is used instead.
2861 */
2862 st->num++;
2863 /* Move st->offset to the next sk in the bucket such that
2864 * the future start() will resume at st->offset in
2865 * st->bucket. See tcp_seek_last_pos().
2866 */
2867 st->offset++;
2868 sock_put(iter->batch[iter->cur_sk++]);
2869 }
2870
2871 if (iter->cur_sk < iter->end_sk)
2872 sk = iter->batch[iter->cur_sk];
2873 else
2874 sk = bpf_iter_tcp_batch(seq);
2875
2876 ++*pos;
2877 /* Keeping st->last_pos consistent in tcp_iter_state.
2878 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2879 */
2880 st->last_pos = *pos;
2881 return sk;
2882}
2883
2884static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2885{
2886 struct bpf_iter_meta meta;
2887 struct bpf_prog *prog;
2888 struct sock *sk = v;
2889 bool slow;
2890 uid_t uid;
2891 int ret;
2892
2893 if (v == SEQ_START_TOKEN)
2894 return 0;
2895
2896 if (sk_fullsock(sk))
2897 slow = lock_sock_fast(sk);
2898
2899 if (unlikely(sk_unhashed(sk))) {
2900 ret = SEQ_SKIP;
2901 goto unlock;
2902 }
2903
2904 if (sk->sk_state == TCP_TIME_WAIT) {
2905 uid = 0;
2906 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2907 const struct request_sock *req = v;
2908
2909 uid = from_kuid_munged(seq_user_ns(seq),
2910 sock_i_uid(req->rsk_listener));
2911 } else {
2912 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2913 }
2914
2915 meta.seq = seq;
2916 prog = bpf_iter_get_info(&meta, false);
2917 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2918
2919unlock:
2920 if (sk_fullsock(sk))
2921 unlock_sock_fast(sk, slow);
2922 return ret;
2923
2924}
2925
2926static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2927{
2928 struct bpf_tcp_iter_state *iter = seq->private;
2929 struct bpf_iter_meta meta;
2930 struct bpf_prog *prog;
2931
2932 if (!v) {
2933 meta.seq = seq;
2934 prog = bpf_iter_get_info(&meta, true);
2935 if (prog)
2936 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2937 }
2938
2939 if (iter->cur_sk < iter->end_sk) {
2940 bpf_iter_tcp_put_batch(iter);
2941 iter->st_bucket_done = false;
2942 }
2943}
2944
2945static const struct seq_operations bpf_iter_tcp_seq_ops = {
2946 .show = bpf_iter_tcp_seq_show,
2947 .start = bpf_iter_tcp_seq_start,
2948 .next = bpf_iter_tcp_seq_next,
2949 .stop = bpf_iter_tcp_seq_stop,
2950};
2951#endif
2952static unsigned short seq_file_family(const struct seq_file *seq)
2953{
2954 const struct tcp_seq_afinfo *afinfo;
2955
2956#ifdef CONFIG_BPF_SYSCALL
2957 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2958 if (seq->op == &bpf_iter_tcp_seq_ops)
2959 return AF_UNSPEC;
2960#endif
2961
2962 /* Iterated from proc fs */
2963 afinfo = pde_data(file_inode(seq->file));
2964 return afinfo->family;
2965}
2966
2967static const struct seq_operations tcp4_seq_ops = {
2968 .show = tcp4_seq_show,
2969 .start = tcp_seq_start,
2970 .next = tcp_seq_next,
2971 .stop = tcp_seq_stop,
2972};
2973
2974static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2975 .family = AF_INET,
2976};
2977
2978static int __net_init tcp4_proc_init_net(struct net *net)
2979{
2980 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2981 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2982 return -ENOMEM;
2983 return 0;
2984}
2985
2986static void __net_exit tcp4_proc_exit_net(struct net *net)
2987{
2988 remove_proc_entry("tcp", net->proc_net);
2989}
2990
2991static struct pernet_operations tcp4_net_ops = {
2992 .init = tcp4_proc_init_net,
2993 .exit = tcp4_proc_exit_net,
2994};
2995
2996int __init tcp4_proc_init(void)
2997{
2998 return register_pernet_subsys(&tcp4_net_ops);
2999}
3000
3001void tcp4_proc_exit(void)
3002{
3003 unregister_pernet_subsys(&tcp4_net_ops);
3004}
3005#endif /* CONFIG_PROC_FS */
3006
3007/* @wake is 1 when sk_stream_write_space() calls us.
3008 * In that case, EPOLLOUT is sent only if notsent_bytes is below half the limit.
3009 * This mimics the strategy used in sock_def_write_space().
3010 */
3011bool tcp_stream_memory_free(const struct sock *sk, int wake)
3012{
3013 const struct tcp_sock *tp = tcp_sk(sk);
3014 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3015 READ_ONCE(tp->snd_nxt);
3016
3017 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3018}
3019EXPORT_SYMBOL(tcp_stream_memory_free);
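
/* [Editor's note -- illustrative only, not part of the kernel source]
 * With wake == 1 the test above becomes notsent_bytes * 2 < lowat, i.e.
 * the write-space wakeup fires only once unsent data drops below half of
 * the limit, while plain polling (wake == 0) reports the socket writable
 * whenever notsent_bytes < lowat.  A hedged userspace sketch, assuming a
 * libc that exposes TCP_NOTSENT_LOWAT ('fd' is a placeholder socket):
 *
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>
 *
 *	unsigned int lowat = 128 * 1024;	// example: 128 KB
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *	// EPOLLOUT wakeups now fire once less than 64 KB remain unsent
 */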
3020
3021struct proto tcp_prot = {
3022 .name = "TCP",
3023 .owner = THIS_MODULE,
3024 .close = tcp_close,
3025 .pre_connect = tcp_v4_pre_connect,
3026 .connect = tcp_v4_connect,
3027 .disconnect = tcp_disconnect,
3028 .accept = inet_csk_accept,
3029 .ioctl = tcp_ioctl,
3030 .init = tcp_v4_init_sock,
3031 .destroy = tcp_v4_destroy_sock,
3032 .shutdown = tcp_shutdown,
3033 .setsockopt = tcp_setsockopt,
3034 .getsockopt = tcp_getsockopt,
3035 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3036 .keepalive = tcp_set_keepalive,
3037 .recvmsg = tcp_recvmsg,
3038 .sendmsg = tcp_sendmsg,
3039 .sendpage = tcp_sendpage,
3040 .backlog_rcv = tcp_v4_do_rcv,
3041 .release_cb = tcp_release_cb,
3042 .hash = inet_hash,
3043 .unhash = inet_unhash,
3044 .get_port = inet_csk_get_port,
3045 .put_port = inet_put_port,
3046#ifdef CONFIG_BPF_SYSCALL
3047 .psock_update_sk_prot = tcp_bpf_update_proto,
3048#endif
3049 .enter_memory_pressure = tcp_enter_memory_pressure,
3050 .leave_memory_pressure = tcp_leave_memory_pressure,
3051 .stream_memory_free = tcp_stream_memory_free,
3052 .sockets_allocated = &tcp_sockets_allocated,
3053 .orphan_count = &tcp_orphan_count,
3054
3055 .memory_allocated = &tcp_memory_allocated,
3056 .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
3057
3058 .memory_pressure = &tcp_memory_pressure,
3059 .sysctl_mem = sysctl_tcp_mem,
3060 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3061 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3062 .max_header = MAX_TCP_HEADER,
3063 .obj_size = sizeof(struct tcp_sock),
3064 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3065 .twsk_prot = &tcp_timewait_sock_ops,
3066 .rsk_prot = &tcp_request_sock_ops,
3067 .h.hashinfo = &tcp_hashinfo,
3068 .no_autobind = true,
3069 .diag_destroy = tcp_abort,
3070};
3071EXPORT_SYMBOL(tcp_prot);
3072
3073static void __net_exit tcp_sk_exit(struct net *net)
3074{
3075 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3076
3077 if (net->ipv4.tcp_congestion_control)
3078 bpf_module_put(net->ipv4.tcp_congestion_control,
3079 net->ipv4.tcp_congestion_control->owner);
3080 if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3081 kfree(tcp_death_row);
3082}
3083
3084static int __net_init tcp_sk_init(struct net *net)
3085{
3086 int cnt;
3087
3088 net->ipv4.sysctl_tcp_ecn = 2;
3089 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3090
3091 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3092 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3093 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3094 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3095 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3096
3097 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3098 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3099 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3100
3101 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3102 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3103 net->ipv4.sysctl_tcp_syncookies = 1;
3104 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3105 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3106 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3107 net->ipv4.sysctl_tcp_orphan_retries = 0;
3108 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3109 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3110 net->ipv4.sysctl_tcp_tw_reuse = 2;
3111 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3112
3113 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3114 if (!net->ipv4.tcp_death_row)
3115 return -ENOMEM;
3116 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3117 cnt = tcp_hashinfo.ehash_mask + 1;
3118 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3119 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3120
3121 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3122 net->ipv4.sysctl_tcp_sack = 1;
3123 net->ipv4.sysctl_tcp_window_scaling = 1;
3124 net->ipv4.sysctl_tcp_timestamps = 1;
3125 net->ipv4.sysctl_tcp_early_retrans = 3;
3126 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3127 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3128 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3129 net->ipv4.sysctl_tcp_max_reordering = 300;
3130 net->ipv4.sysctl_tcp_dsack = 1;
3131 net->ipv4.sysctl_tcp_app_win = 31;
3132 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3133 net->ipv4.sysctl_tcp_frto = 2;
3134 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3135 /* This limits the percentage of the congestion window which we
3136 * will allow a single TSO frame to consume. Building TSO frames
3137 * which are too large can cause TCP streams to be bursty.
3138 */
3139 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3140 /* Default TSQ limit of 16 TSO segments */
3141 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3142 /* rfc5961 challenge ack rate limiting */
3143 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3144 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3145 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3146 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3147 net->ipv4.sysctl_tcp_autocorking = 1;
3148 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3149 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3150 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3151 if (net != &init_net) {
3152 memcpy(net->ipv4.sysctl_tcp_rmem,
3153 init_net.ipv4.sysctl_tcp_rmem,
3154 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3155 memcpy(net->ipv4.sysctl_tcp_wmem,
3156 init_net.ipv4.sysctl_tcp_wmem,
3157 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3158 }
3159 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3160 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3161 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3162 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3163 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3164 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3165
3166 /* Reno is always built-in */
3167 if (!net_eq(net, &init_net) &&
3168 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3169 init_net.ipv4.tcp_congestion_control->owner))
3170 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3171 else
3172 net->ipv4.tcp_congestion_control = &tcp_reno;
3173
3174 return 0;
3175}
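
/* [Editor's note -- illustrative only, not part of the kernel source]
 * The defaults initialized above are per network namespace and are
 * exposed under /proc/sys/net/ipv4/.  A hedged userspace sketch of
 * reading one of them back:
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	char buf[32];
 *	int fd = open("/proc/sys/net/ipv4/tcp_syncookies", O_RDONLY);
 *	ssize_t n = read(fd, buf, sizeof(buf) - 1);
 *
 *	if (n > 0) {
 *		buf[n] = '\0';
 *		printf("tcp_syncookies = %s", buf);	// "1\n" by default
 *	}
 *	close(fd);
 */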
3176
3177static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3178{
3179 struct net *net;
3180
3181 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3182
3183 list_for_each_entry(net, net_exit_list, exit_list)
3184 tcp_fastopen_ctx_destroy(net);
3185}
3186
3187static struct pernet_operations __net_initdata tcp_sk_ops = {
3188 .init = tcp_sk_init,
3189 .exit = tcp_sk_exit,
3190 .exit_batch = tcp_sk_exit_batch,
3191};
3192
3193#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3194DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3195 struct sock_common *sk_common, uid_t uid)
3196
3197#define INIT_BATCH_SZ 16
3198
3199static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3200{
3201 struct bpf_tcp_iter_state *iter = priv_data;
3202 int err;
3203
3204 err = bpf_iter_init_seq_net(priv_data, aux);
3205 if (err)
3206 return err;
3207
3208 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3209 if (err) {
3210 bpf_iter_fini_seq_net(priv_data);
3211 return err;
3212 }
3213
3214 return 0;
3215}
3216
3217static void bpf_iter_fini_tcp(void *priv_data)
3218{
3219 struct bpf_tcp_iter_state *iter = priv_data;
3220
3221 bpf_iter_fini_seq_net(priv_data);
3222 kvfree(iter->batch);
3223}
3224
3225static const struct bpf_iter_seq_info tcp_seq_info = {
3226 .seq_ops = &bpf_iter_tcp_seq_ops,
3227 .init_seq_private = bpf_iter_init_tcp,
3228 .fini_seq_private = bpf_iter_fini_tcp,
3229 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3230};
3231
3232static const struct bpf_func_proto *
3233bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3234 const struct bpf_prog *prog)
3235{
3236 switch (func_id) {
3237 case BPF_FUNC_setsockopt:
3238 return &bpf_sk_setsockopt_proto;
3239 case BPF_FUNC_getsockopt:
3240 return &bpf_sk_getsockopt_proto;
3241 default:
3242 return NULL;
3243 }
3244}
3245
3246static struct bpf_iter_reg tcp_reg_info = {
3247 .target = "tcp",
3248 .ctx_arg_info_size = 1,
3249 .ctx_arg_info = {
3250 { offsetof(struct bpf_iter__tcp, sk_common),
3251 PTR_TO_BTF_ID_OR_NULL },
3252 },
3253 .get_func_proto = bpf_iter_tcp_get_func_proto,
3254 .seq_info = &tcp_seq_info,
3255};
3256
3257static void __init bpf_iter_register(void)
3258{
3259 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3260 if (bpf_iter_reg_target(&tcp_reg_info))
3261 pr_warn("Warning: could not register bpf iterator tcp\n");
3262}
3263
3264#endif
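
/* [Editor's note -- illustrative only, not part of the kernel source]
 * The registration above exposes "tcp" as a bpf_iter target.  A hedged
 * sketch of a minimal iterator program, assuming the libbpf conventions
 * used by the kernel selftests (vmlinux.h types, SEC("iter/tcp") and the
 * BPF_SEQ_PRINTF helper from <bpf/bpf_tracing.h> are assumptions):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "family=%d state=%d\n",
 *			       skc->skc_family, skc->skc_state);
 *		return 0;
 *	}
 *
 * Once compiled, such a program can be pinned with
 * "bpftool iter pin prog.o /sys/fs/bpf/tcp_dump" and read with cat.
 */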
3265
3266void __init tcp_v4_init(void)
3267{
3268 int cpu, res;
3269
3270 for_each_possible_cpu(cpu) {
3271 struct sock *sk;
3272
3273 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3274 IPPROTO_TCP, &init_net);
3275 if (res)
3276 panic("Failed to create the TCP control socket.\n");
3277 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3278
3279 /* Please enforce IP_DF and IPID==0 for RST and
3280 * ACK sent in SYN-RECV and TIME-WAIT state.
3281 */
3282 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3283
3284 per_cpu(ipv4_tcp_sk, cpu) = sk;
3285 }
3286 if (register_pernet_subsys(&tcp_sk_ops))
3287 panic("Failed to create the TCP control socket.\n");
3288
3289#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3290 bpf_iter_register();
3291#endif
3292}
3293

Source: linux/net/ipv4/tcp_ipv4.c