1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Syncookies implementation for the Linux kernel |
4 | * |
5 | * Copyright (C) 1997 Andi Kleen |
6 | * Based on ideas by D.J.Bernstein and Eric Schenk. |
7 | */ |
8 | |
9 | #include <linux/tcp.h> |
10 | #include <linux/siphash.h> |
11 | #include <linux/kernel.h> |
12 | #include <linux/export.h> |
13 | #include <net/secure_seq.h> |
14 | #include <net/tcp.h> |
15 | #include <net/route.h> |
16 | |
/* Two siphash keys; cookie_hash() picks one of them through its 'c' argument. */
static siphash_aligned_key_t syncookie_secret[2];

/* Number of low cookie bits that carry the masked hash + data value. */
#define COOKIEBITS 24	/* Upper bits store count */
/* Mask selecting the low COOKIEBITS bits of a cookie. */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
21 | |
/* TCP Timestamp: the 6 lowest bits of the timestamp sent in the cookie
 * SYN-ACK store TCP options:
 *
 *    MSB                           LSB
 *    | 31 ...   6 |  5  |  4   | 3 2 1 0 |
 *    |  Timestamp | ECN | SACK | WScale  |
 *
 * When we receive a valid cookie-ACK, we look at the echoed tsval (if
 * any) to figure out which TCP options we should use for the rebuilt
 * connection.
 *
 * A WScale setting of '0xf' (which is an invalid scaling value)
 * means that the original SYN did not include the TCP window scaling
 * option.
 */
#define TS_OPT_WSCALE_MASK	0xf
#define TS_OPT_SACK		BIT(4)
#define TS_OPT_ECN		BIT(5)
/* There is no TS_OPT_TIMESTAMP:
 * if the ACK contains a timestamp option, we already know it was
 * requested/supported by the syn/synack exchange.
 */
#define TSBITS	6
44 | |
45 | static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, |
46 | u32 count, int c) |
47 | { |
48 | net_get_random_once(syncookie_secret, sizeof(syncookie_secret)); |
49 | return siphash_4u32(a: (__force u32)saddr, b: (__force u32)daddr, |
50 | c: (__force u32)sport << 16 | (__force u32)dport, |
51 | d: count, key: &syncookie_secret[c]); |
52 | } |
53 | |
54 | /* |
55 | * when syncookies are in effect and tcp timestamps are enabled we encode |
56 | * tcp options in the lower bits of the timestamp value that will be |
57 | * sent in the syn-ack. |
58 | * Since subsequent timestamps use the normal tcp_time_stamp value, we |
59 | * must make sure that the resulting initial timestamp is <= tcp_time_stamp. |
60 | */ |
61 | u64 cookie_init_timestamp(struct request_sock *req, u64 now) |
62 | { |
63 | const struct inet_request_sock *ireq = inet_rsk(sk: req); |
64 | u64 ts, ts_now = tcp_ns_to_ts(usec_ts: false, val: now); |
65 | u32 options = 0; |
66 | |
67 | options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK; |
68 | if (ireq->sack_ok) |
69 | options |= TS_OPT_SACK; |
70 | if (ireq->ecn_ok) |
71 | options |= TS_OPT_ECN; |
72 | |
73 | ts = (ts_now >> TSBITS) << TSBITS; |
74 | ts |= options; |
75 | if (ts > ts_now) |
76 | ts -= (1UL << TSBITS); |
77 | |
78 | if (tcp_rsk(req)->req_usec_ts) |
79 | return ts * NSEC_PER_USEC; |
80 | return ts * NSEC_PER_MSEC; |
81 | } |
82 | |
83 | |
84 | static __u32 secure_tcp_syn_cookie(__be32 saddr, __be32 daddr, __be16 sport, |
85 | __be16 dport, __u32 sseq, __u32 data) |
86 | { |
87 | /* |
88 | * Compute the secure sequence number. |
89 | * The output should be: |
90 | * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) |
91 | * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). |
92 | * Where sseq is their sequence number and count increases every |
93 | * minute by 1. |
94 | * As an extra hack, we add a small "data" value that encodes the |
95 | * MSS into the second hash value. |
96 | */ |
97 | u32 count = tcp_cookie_time(); |
98 | return (cookie_hash(saddr, daddr, sport, dport, count: 0, c: 0) + |
99 | sseq + (count << COOKIEBITS) + |
100 | ((cookie_hash(saddr, daddr, sport, dport, count, c: 1) + data) |
101 | & COOKIEMASK)); |
102 | } |
103 | |
104 | /* |
105 | * This retrieves the small "data" value from the syncookie. |
106 | * If the syncookie is bad, the data returned will be out of |
107 | * range. This must be checked by the caller. |
108 | * |
109 | * The count value used to generate the cookie must be less than |
110 | * MAX_SYNCOOKIE_AGE minutes in the past. |
111 | * The return value (__u32)-1 if this test fails. |
112 | */ |
113 | static __u32 check_tcp_syn_cookie(__u32 cookie, __be32 saddr, __be32 daddr, |
114 | __be16 sport, __be16 dport, __u32 sseq) |
115 | { |
116 | u32 diff, count = tcp_cookie_time(); |
117 | |
118 | /* Strip away the layers from the cookie */ |
119 | cookie -= cookie_hash(saddr, daddr, sport, dport, count: 0, c: 0) + sseq; |
120 | |
121 | /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ |
122 | diff = (count - (cookie >> COOKIEBITS)) & ((__u32) -1 >> COOKIEBITS); |
123 | if (diff >= MAX_SYNCOOKIE_AGE) |
124 | return (__u32)-1; |
125 | |
126 | return (cookie - |
127 | cookie_hash(saddr, daddr, sport, dport, count: count - diff, c: 1)) |
128 | & COOKIEMASK; /* Leaving the data behind */ |
129 | } |
130 | |
/*
 * MSS values are chosen based on the 2011 paper
 * 'An Analysis of TCP Maximum Segment Sizes' by S. Alcock and R. Nelson.
 * Values ..
 *  .. lower than 536 are rare (< 0.2%)
 *  .. between 537 and 1299 account for less than 1.5% of observed values
 *  .. in the 1300-1349 range account for about 15 to 20% of observed mss values
 *  .. exceeding 1460 are very rare (< 0.04%)
 *
 *  1460 is the single most frequently announced mss value (30 to 46% depending
 *  on monitor location).  Table must be sorted (ascending): the encoder
 *  scans it from the last entry downwards and the index is what gets
 *  stored in the cookie.
 */
static __u16 const msstab[] = {
	536,
	1300,
	1440,	/* 1440, 1452: PPPoE */
	1460,
};
149 | |
150 | /* |
151 | * Generate a syncookie. mssp points to the mss, which is returned |
152 | * rounded down to the value encoded in the cookie. |
153 | */ |
154 | u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, |
155 | u16 *mssp) |
156 | { |
157 | int mssind; |
158 | const __u16 mss = *mssp; |
159 | |
160 | for (mssind = ARRAY_SIZE(msstab) - 1; mssind ; mssind--) |
161 | if (mss >= msstab[mssind]) |
162 | break; |
163 | *mssp = msstab[mssind]; |
164 | |
165 | return secure_tcp_syn_cookie(saddr: iph->saddr, daddr: iph->daddr, |
166 | sport: th->source, dport: th->dest, ntohl(th->seq), |
167 | data: mssind); |
168 | } |
169 | EXPORT_SYMBOL_GPL(__cookie_v4_init_sequence); |
170 | |
171 | __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp) |
172 | { |
173 | const struct iphdr *iph = ip_hdr(skb); |
174 | const struct tcphdr *th = tcp_hdr(skb); |
175 | |
176 | return __cookie_v4_init_sequence(iph, th, mssp); |
177 | } |
178 | |
179 | /* |
180 | * Check if a ack sequence number is a valid syncookie. |
181 | * Return the decoded mss if it is, or 0 if not. |
182 | */ |
183 | int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th) |
184 | { |
185 | __u32 cookie = ntohl(th->ack_seq) - 1; |
186 | __u32 seq = ntohl(th->seq) - 1; |
187 | __u32 mssind; |
188 | |
189 | mssind = check_tcp_syn_cookie(cookie, saddr: iph->saddr, daddr: iph->daddr, |
190 | sport: th->source, dport: th->dest, sseq: seq); |
191 | |
192 | return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0; |
193 | } |
194 | EXPORT_SYMBOL_GPL(__cookie_v4_check); |
195 | |
196 | struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, |
197 | struct request_sock *req, |
198 | struct dst_entry *dst) |
199 | { |
200 | struct inet_connection_sock *icsk = inet_csk(sk); |
201 | struct sock *child; |
202 | bool own_req; |
203 | |
204 | child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst, |
205 | NULL, &own_req); |
206 | if (child) { |
207 | refcount_set(r: &req->rsk_refcnt, n: 1); |
208 | sock_rps_save_rxhash(sk: child, skb); |
209 | |
210 | if (rsk_drop_req(req)) { |
211 | reqsk_put(req); |
212 | return child; |
213 | } |
214 | |
215 | if (inet_csk_reqsk_queue_add(sk, req, child)) |
216 | return child; |
217 | |
218 | bh_unlock_sock(child); |
219 | sock_put(sk: child); |
220 | } |
221 | __reqsk_free(req); |
222 | |
223 | return NULL; |
224 | } |
225 | EXPORT_SYMBOL(tcp_get_cookie_sock); |
226 | |
227 | /* |
228 | * when syncookies are in effect and tcp timestamps are enabled we stored |
229 | * additional tcp options in the timestamp. |
230 | * This extracts these options from the timestamp echo. |
231 | * |
232 | * return false if we decode a tcp option that is disabled |
233 | * on the host. |
234 | */ |
235 | bool cookie_timestamp_decode(const struct net *net, |
236 | struct tcp_options_received *tcp_opt) |
237 | { |
238 | /* echoed timestamp, lowest bits contain options */ |
239 | u32 options = tcp_opt->rcv_tsecr; |
240 | |
241 | if (!tcp_opt->saw_tstamp) { |
242 | tcp_clear_options(rx_opt: tcp_opt); |
243 | return true; |
244 | } |
245 | |
246 | if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) |
247 | return false; |
248 | |
249 | tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0; |
250 | |
251 | if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) |
252 | return false; |
253 | |
254 | if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK) |
255 | return true; /* no window scaling */ |
256 | |
257 | tcp_opt->wscale_ok = 1; |
258 | tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK; |
259 | |
260 | return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; |
261 | } |
262 | EXPORT_SYMBOL(cookie_timestamp_decode); |
263 | |
264 | static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, |
265 | struct request_sock *req) |
266 | { |
267 | struct inet_request_sock *ireq = inet_rsk(sk: req); |
268 | struct tcp_request_sock *treq = tcp_rsk(req); |
269 | const struct tcphdr *th = tcp_hdr(skb); |
270 | |
271 | req->num_retrans = 0; |
272 | |
273 | ireq->ir_num = ntohs(th->dest); |
274 | ireq->ir_rmt_port = th->source; |
275 | ireq->ir_iif = inet_request_bound_dev_if(sk, skb); |
276 | ireq->ir_mark = inet_request_mark(sk, skb); |
277 | |
278 | if (IS_ENABLED(CONFIG_SMC)) |
279 | ireq->smc_ok = 0; |
280 | |
281 | treq->snt_synack = 0; |
282 | treq->tfo_listener = false; |
283 | treq->txhash = net_tx_rndhash(); |
284 | treq->rcv_isn = ntohl(th->seq) - 1; |
285 | treq->snt_isn = ntohl(th->ack_seq) - 1; |
286 | treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield; |
287 | treq->req_usec_ts = false; |
288 | |
289 | #if IS_ENABLED(CONFIG_MPTCP) |
290 | treq->is_mptcp = sk_is_mptcp(sk); |
291 | if (treq->is_mptcp) |
292 | return mptcp_subflow_init_cookie_req(req, sk_listener: sk, skb); |
293 | #endif |
294 | |
295 | return 0; |
296 | } |
297 | |
298 | #if IS_ENABLED(CONFIG_BPF) |
299 | struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) |
300 | { |
301 | struct request_sock *req = inet_reqsk(sk: skb->sk); |
302 | |
303 | skb->sk = NULL; |
304 | skb->destructor = NULL; |
305 | |
306 | if (cookie_tcp_reqsk_init(sk, skb, req)) { |
307 | reqsk_free(req); |
308 | req = NULL; |
309 | } |
310 | |
311 | return req; |
312 | } |
313 | EXPORT_SYMBOL_GPL(cookie_bpf_check); |
314 | #endif |
315 | |
316 | struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, |
317 | struct sock *sk, struct sk_buff *skb, |
318 | struct tcp_options_received *tcp_opt, |
319 | int mss, u32 tsoff) |
320 | { |
321 | struct inet_request_sock *ireq; |
322 | struct tcp_request_sock *treq; |
323 | struct request_sock *req; |
324 | |
325 | if (sk_is_mptcp(sk)) |
326 | req = mptcp_subflow_reqsk_alloc(ops, sk_listener: sk, attach_listener: false); |
327 | else |
328 | req = inet_reqsk_alloc(ops, sk_listener: sk, attach_listener: false); |
329 | |
330 | if (!req) |
331 | return NULL; |
332 | |
333 | if (cookie_tcp_reqsk_init(sk, skb, req)) { |
334 | reqsk_free(req); |
335 | return NULL; |
336 | } |
337 | |
338 | ireq = inet_rsk(sk: req); |
339 | treq = tcp_rsk(req); |
340 | |
341 | req->mss = mss; |
342 | req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0; |
343 | |
344 | ireq->snd_wscale = tcp_opt->snd_wscale; |
345 | ireq->tstamp_ok = tcp_opt->saw_tstamp; |
346 | ireq->sack_ok = tcp_opt->sack_ok; |
347 | ireq->wscale_ok = tcp_opt->wscale_ok; |
348 | ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN); |
349 | |
350 | treq->ts_off = tsoff; |
351 | |
352 | return req; |
353 | } |
354 | EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); |
355 | |
356 | static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, |
357 | struct sk_buff *skb) |
358 | { |
359 | struct tcp_options_received tcp_opt; |
360 | u32 tsoff = 0; |
361 | int mss; |
362 | |
363 | if (tcp_synq_no_recent_overflow(sk)) |
364 | goto out; |
365 | |
366 | mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb)); |
367 | if (!mss) { |
368 | __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED); |
369 | goto out; |
370 | } |
371 | |
372 | __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV); |
373 | |
374 | /* check for timestamp cookie support */ |
375 | memset(&tcp_opt, 0, sizeof(tcp_opt)); |
376 | tcp_parse_options(net, skb, opt_rx: &tcp_opt, estab: 0, NULL); |
377 | |
378 | if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { |
379 | tsoff = secure_tcp_ts_off(net, |
380 | saddr: ip_hdr(skb)->daddr, |
381 | daddr: ip_hdr(skb)->saddr); |
382 | tcp_opt.rcv_tsecr -= tsoff; |
383 | } |
384 | |
385 | if (!cookie_timestamp_decode(net, &tcp_opt)) |
386 | goto out; |
387 | |
388 | return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb, |
389 | &tcp_opt, mss, tsoff); |
390 | out: |
391 | return ERR_PTR(error: -EINVAL); |
392 | } |
393 | |
394 | /* On input, sk is a listener. |
395 | * Output is listener if incoming packet would not create a child |
396 | * NULL if memory could not be allocated. |
397 | */ |
398 | struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) |
399 | { |
400 | struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; |
401 | const struct tcphdr *th = tcp_hdr(skb); |
402 | struct tcp_sock *tp = tcp_sk(sk); |
403 | struct inet_request_sock *ireq; |
404 | struct net *net = sock_net(sk); |
405 | struct request_sock *req; |
406 | struct sock *ret = sk; |
407 | struct flowi4 fl4; |
408 | struct rtable *rt; |
409 | __u8 rcv_wscale; |
410 | int full_space; |
411 | SKB_DR(reason); |
412 | |
413 | if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) || |
414 | !th->ack || th->rst) |
415 | goto out; |
416 | |
417 | if (cookie_bpf_ok(skb)) { |
418 | req = cookie_bpf_check(sk, skb); |
419 | } else { |
420 | req = cookie_tcp_check(net, sk, skb); |
421 | if (IS_ERR(ptr: req)) |
422 | goto out; |
423 | } |
424 | if (!req) { |
425 | SKB_DR_SET(reason, NO_SOCKET); |
426 | goto out_drop; |
427 | } |
428 | |
429 | ireq = inet_rsk(sk: req); |
430 | |
431 | sk_rcv_saddr_set(sk: req_to_sk(req), addr: ip_hdr(skb)->daddr); |
432 | sk_daddr_set(sk: req_to_sk(req), addr: ip_hdr(skb)->saddr); |
433 | |
434 | /* We throwed the options of the initial SYN away, so we hope |
435 | * the ACK carries the same options again (see RFC1122 4.2.3.8) |
436 | */ |
437 | RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb)); |
438 | |
439 | if (security_inet_conn_request(sk, skb, req)) { |
440 | SKB_DR_SET(reason, SECURITY_HOOK); |
441 | goto out_free; |
442 | } |
443 | |
444 | tcp_ao_syncookie(sk, skb, req, AF_INET); |
445 | |
446 | /* |
447 | * We need to lookup the route here to get at the correct |
448 | * window size. We should better make sure that the window size |
449 | * hasn't changed since we received the original syn, but I see |
450 | * no easy way to do this. |
451 | */ |
452 | flowi4_init_output(fl4: &fl4, oif: ireq->ir_iif, mark: ireq->ir_mark, |
453 | tos: ip_sock_rt_tos(sk), scope: ip_sock_rt_scope(sk), |
454 | IPPROTO_TCP, flags: inet_sk_flowi_flags(sk), |
455 | daddr: opt->srr ? opt->faddr : ireq->ir_rmt_addr, |
456 | saddr: ireq->ir_loc_addr, dport: th->source, sport: th->dest, uid: sk->sk_uid); |
457 | security_req_classify_flow(req, flic: flowi4_to_flowi_common(fl4: &fl4)); |
458 | rt = ip_route_output_key(net, flp: &fl4); |
459 | if (IS_ERR(ptr: rt)) { |
460 | SKB_DR_SET(reason, IP_OUTNOROUTES); |
461 | goto out_free; |
462 | } |
463 | |
464 | /* Try to redo what tcp_v4_send_synack did. */ |
465 | req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst: &rt->dst, RTAX_WINDOW); |
466 | /* limit the window selection if the user enforce a smaller rx buffer */ |
467 | full_space = tcp_full_space(sk); |
468 | if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && |
469 | (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) |
470 | req->rsk_window_clamp = full_space; |
471 | |
472 | tcp_select_initial_window(sk, space: full_space, mss: req->mss, |
473 | rcv_wnd: &req->rsk_rcv_wnd, window_clamp: &req->rsk_window_clamp, |
474 | wscale_ok: ireq->wscale_ok, rcv_wscale: &rcv_wscale, |
475 | init_rcv_wnd: dst_metric(dst: &rt->dst, RTAX_INITRWND)); |
476 | |
477 | /* req->syncookie is set true only if ACK is validated |
478 | * by BPF kfunc, then, rcv_wscale is already configured. |
479 | */ |
480 | if (!req->syncookie) |
481 | ireq->rcv_wscale = rcv_wscale; |
482 | ireq->ecn_ok &= cookie_ecn_ok(net, dst: &rt->dst); |
483 | |
484 | ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); |
485 | /* ip_queue_xmit() depends on our flow being setup |
486 | * Normal sockets get it right from inet_csk_route_child_sock() |
487 | */ |
488 | if (!ret) { |
489 | SKB_DR_SET(reason, NO_SOCKET); |
490 | goto out_drop; |
491 | } |
492 | inet_sk(ret)->cork.fl.u.ip4 = fl4; |
493 | out: |
494 | return ret; |
495 | out_free: |
496 | reqsk_free(req); |
497 | out_drop: |
498 | kfree_skb_reason(skb, reason); |
499 | return NULL; |
500 | } |
501 | |