// SPDX-License-Identifier: GPL-2.0-only
/*
 * Pluggable TCP congestion control support and newReno
 * congestion control.
 * Based on ideas from I/O scheduler support and Web100.
 *
 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
#include <trace/events/tcp.h>

static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);

/* Simple linear search, don't expect many entries! */
struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
	struct tcp_congestion_ops *e;

	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
		if (strcmp(e->name, name) == 0)
			return e;
	}

	return NULL;
}

void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	trace_tcp_cong_state_set(sk, ca_state);

	if (icsk->icsk_ca_ops->set_state)
		icsk->icsk_ca_ops->set_state(sk, ca_state);
	icsk->icsk_ca_state = ca_state;
}

/* Must be called with rcu lock held */
static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
							const char *name)
{
	struct tcp_congestion_ops *ca = tcp_ca_find(name);

#ifdef CONFIG_MODULES
	if (!ca && capable(CAP_NET_ADMIN)) {
		rcu_read_unlock();
		request_module("tcp_%s", name);
		rcu_read_lock();
		ca = tcp_ca_find(name);
	}
#endif
	return ca;
}

/* Simple linear search, not much in here. */
struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
{
	struct tcp_congestion_ops *e;

	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
		if (e->key == key)
			return e;
	}

	return NULL;
}

int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
{
	/* all algorithms must implement these */
	if (!ca->ssthresh || !ca->undo_cwnd ||
	    !(ca->cong_avoid || ca->cong_control)) {
		pr_err("%s does not implement required ops\n", ca->name);
		return -EINVAL;
	}

	return 0;
}

/* Attach new congestion control algorithm to the list
 * of available options.
 */
int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
{
	int ret;

	ret = tcp_validate_congestion_control(ca);
	if (ret)
		return ret;

	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

	spin_lock(&tcp_cong_list_lock);
	if (ca->key == TCP_CA_UNSPEC || tcp_ca_find_key(ca->key)) {
		pr_notice("%s already registered or non-unique key\n",
			  ca->name);
		ret = -EEXIST;
	} else {
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		pr_debug("%s registered\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(tcp_register_congestion_control);

/*
 * Remove congestion control algorithm, called from
 * the module's remove function. Module ref counts are used
 * to ensure that this can't be done till all sockets using
 * that method are closed.
 */
void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
{
	spin_lock(&tcp_cong_list_lock);
	list_del_rcu(&ca->list);
	spin_unlock(&tcp_cong_list_lock);

	/* Wait for outstanding readers to complete before the
	 * module gets removed entirely.
	 *
	 * A try_module_get() should fail by now as our module is
	 * in "going" state: no refs are held anymore and the
	 * module_exit() handler is being called.
	 */
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
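
/* Illustrative sketch (not part of this file): a hypothetical "example"
 * congestion control module would typically pair the register/unregister
 * calls above in its module init/exit handlers, here reusing the exported
 * Reno helpers further below for the mandatory ops:
 *
 *	static struct tcp_congestion_ops tcp_example __read_mostly = {
 *		.name		= "example",
 *		.owner		= THIS_MODULE,
 *		.ssthresh	= tcp_reno_ssthresh,
 *		.cong_avoid	= tcp_reno_cong_avoid,
 *		.undo_cwnd	= tcp_reno_undo_cwnd,
 *	};
 *
 *	static int __init tcp_example_register(void)
 *	{
 *		return tcp_register_congestion_control(&tcp_example);
 *	}
 *
 *	static void __exit tcp_example_unregister(void)
 *	{
 *		tcp_unregister_congestion_control(&tcp_example);
 *	}
 *
 *	module_init(tcp_example_register);
 *	module_exit(tcp_example_unregister);
 */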

/* Replace a registered old ca with a new one.
 *
 * The new ca must have the same name as the old one that has already
 * been registered.
 */
int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
{
	struct tcp_congestion_ops *existing;
	int ret = 0;

	ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));

	spin_lock(&tcp_cong_list_lock);
	existing = tcp_ca_find_key(old_ca->key);
	if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
		pr_notice("%s not registered or non-unique key\n",
			  ca->name);
		ret = -EINVAL;
	} else if (existing != old_ca) {
		pr_notice("invalid old congestion control algorithm to replace\n");
		ret = -EINVAL;
	} else {
		/* Add the new one before removing the old one to keep
		 * one implementation available all the time.
		 */
		list_add_tail_rcu(&ca->list, &tcp_cong_list);
		list_del_rcu(&existing->list);
		pr_debug("%s updated\n", ca->name);
	}
	spin_unlock(&tcp_cong_list_lock);

	/* Wait for outstanding readers to complete before the
	 * module or struct_ops gets removed entirely.
	 */
	if (!ret)
		synchronize_rcu();

	return ret;
}

u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{
	const struct tcp_congestion_ops *ca;
	u32 key = TCP_CA_UNSPEC;

	might_sleep();

	rcu_read_lock();
	ca = tcp_ca_find_autoload(net, name);
	if (ca) {
		key = ca->key;
		*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
	}
	rcu_read_unlock();

	return key;
}

char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
	const struct tcp_congestion_ops *ca;
	char *ret = NULL;

	rcu_read_lock();
	ca = tcp_ca_find_key(key);
	if (ca)
		ret = strncpy(buffer, ca->name,
			      TCP_CA_NAME_MAX);
	rcu_read_unlock();

	return ret;
}

/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
{
	struct net *net = sock_net(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_congestion_ops *ca;

	rcu_read_lock();
	ca = rcu_dereference(net->ipv4.tcp_congestion_control);
	if (unlikely(!bpf_try_module_get(ca, ca->owner)))
		ca = &tcp_reno;
	icsk->icsk_ca_ops = ca;
	rcu_read_unlock();

	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
	if (ca->flags & TCP_CONG_NEEDS_ECN)
		INET_ECN_xmit(sk);
	else
		INET_ECN_dontxmit(sk);
}

void tcp_init_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_sk(sk)->prior_ssthresh = 0;
	if (icsk->icsk_ca_ops->init)
		icsk->icsk_ca_ops->init(sk);
	if (tcp_ca_needs_ecn(sk))
		INET_ECN_xmit(sk);
	else
		INET_ECN_dontxmit(sk);
	icsk->icsk_ca_initialized = 1;
}

static void tcp_reinit_congestion_control(struct sock *sk,
					  const struct tcp_congestion_ops *ca)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_cleanup_congestion_control(sk);
	icsk->icsk_ca_ops = ca;
	icsk->icsk_ca_setsockopt = 1;
	memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));

	if (ca->flags & TCP_CONG_NEEDS_ECN)
		INET_ECN_xmit(sk);
	else
		INET_ECN_dontxmit(sk);

	if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
		tcp_init_congestion_control(sk);
}

/* Manage refcounts on socket close. */
void tcp_cleanup_congestion_control(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->release)
		icsk->icsk_ca_ops->release(sk);
	bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}

/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(struct net *net, const char *name)
{
	struct tcp_congestion_ops *ca;
	const struct tcp_congestion_ops *prev;
	int ret;

	rcu_read_lock();
	ca = tcp_ca_find_autoload(net, name);
	if (!ca) {
		ret = -ENOENT;
	} else if (!bpf_try_module_get(ca, ca->owner)) {
		ret = -EBUSY;
	} else if (!net_eq(net, &init_net) &&
		   !(ca->flags & TCP_CONG_NON_RESTRICTED)) {
		/* Only init netns can set default to a restricted algorithm */
		ret = -EPERM;
	} else {
		prev = xchg(&net->ipv4.tcp_congestion_control, ca);
		if (prev)
			bpf_module_put(prev, prev->owner);

		ca->flags |= TCP_CONG_NON_RESTRICTED;
		ret = 0;
	}
	rcu_read_unlock();

	return ret;
}
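
/* For illustration: an administrator normally reaches this path through the
 * net.ipv4.tcp_congestion_control sysctl, e.g.
 *
 *	# sysctl -w net.ipv4.tcp_congestion_control=cubic
 *
 * ("cubic" stands in for any registered algorithm; per the check above,
 * non-init network namespaces may only pick non-restricted ones.)
 */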

/* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void)
{
	return tcp_set_default_congestion_control(&init_net,
						  CONFIG_DEFAULT_TCP_CONG);
}
late_initcall(tcp_congestion_default);

/* Build string with list of available congestion control values */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);

		if (WARN_ON_ONCE(offs >= maxlen))
			break;
	}
	rcu_read_unlock();
}

/* Get current default congestion control */
void tcp_get_default_congestion_control(struct net *net, char *name)
{
	const struct tcp_congestion_ops *ca;

	rcu_read_lock();
	ca = rcu_dereference(net->ipv4.tcp_congestion_control);
	strncpy(name, ca->name, TCP_CA_NAME_MAX);
	rcu_read_unlock();
}

/* Build list of non-restricted congestion control values */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	*buf = '\0';
	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
			continue;
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);

		if (WARN_ON_ONCE(offs >= maxlen))
			break;
	}
	rcu_read_unlock();
}

/* Change list of non-restricted congestion control */
int tcp_set_allowed_congestion_control(char *val)
{
	struct tcp_congestion_ops *ca;
	char *saved_clone, *clone, *name;
	int ret = 0;

	saved_clone = clone = kstrdup(val, GFP_USER);
	if (!clone)
		return -ENOMEM;

	spin_lock(&tcp_cong_list_lock);
	/* pass 1 check for bad entries */
	while ((name = strsep(&clone, " ")) && *name) {
		ca = tcp_ca_find(name);
		if (!ca) {
			ret = -ENOENT;
			goto out;
		}
	}

	/* pass 2 clear old values */
	list_for_each_entry_rcu(ca, &tcp_cong_list, list)
		ca->flags &= ~TCP_CONG_NON_RESTRICTED;

	/* pass 3 mark as allowed */
	while ((name = strsep(&val, " ")) && *name) {
		ca = tcp_ca_find(name);
		WARN_ON(!ca);
		if (ca)
			ca->flags |= TCP_CONG_NON_RESTRICTED;
	}
out:
	spin_unlock(&tcp_cong_list_lock);
	kfree(saved_clone);

	return ret;
}
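
/* For illustration: this parser sits behind the
 * net.ipv4.tcp_allowed_congestion_control sysctl, so a space-separated list
 * such as
 *
 *	# sysctl -w net.ipv4.tcp_allowed_congestion_control="reno cubic"
 *
 * is validated in pass 1 and applied in passes 2 and 3 above (hedged
 * example; the named algorithms must already be registered).
 */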

/* Change congestion control for socket. If load is false, then it is the
 * responsibility of the caller to call tcp_init_congestion_control or
 * tcp_reinit_congestion_control (if the current congestion control was
 * already initialized).
 */
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
			       bool cap_net_admin)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_congestion_ops *ca;
	int err = 0;

	if (icsk->icsk_ca_dst_locked)
		return -EPERM;

	rcu_read_lock();
	if (!load)
		ca = tcp_ca_find(name);
	else
		ca = tcp_ca_find_autoload(sock_net(sk), name);

	/* No change asking for existing value */
	if (ca == icsk->icsk_ca_ops) {
		icsk->icsk_ca_setsockopt = 1;
		goto out;
	}

	if (!ca)
		err = -ENOENT;
	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
		err = -EPERM;
	else if (!bpf_try_module_get(ca, ca->owner))
		err = -EBUSY;
	else
		tcp_reinit_congestion_control(sk, ca);
out:
	rcu_read_unlock();
	return err;
}
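
/* Illustrative sketch (userspace side, not part of this file): the usual way
 * to reach this path is the TCP_CONGESTION socket option, e.g.
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "cubic",
 *		   strlen("cubic"));
 *
 * where "cubic" stands in for any available algorithm name (assumed to be
 * registered, or autoloadable when the caller has CAP_NET_ADMIN).
 */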

/* Slow start is used when the congestion window is no greater than the slow
 * start threshold. We follow RFC 2581 and also handle stretch ACKs properly.
 * We do not implement RFC 3465 Appropriate Byte Counting (ABC) per se but
 * something better;) a packet is only considered (s)acked in its entirety to
 * defend against the ACK attacks described in the RFC. Slow start processes a
 * stretch ACK of degree N as if N acks of degree 1 are received back to back,
 * except that ABC caps N to 2. Slow start exits when cwnd grows over ssthresh
 * and returns the leftover acks to adjust cwnd in congestion avoidance mode.
 */
__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
	u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);

	acked -= cwnd - tcp_snd_cwnd(tp);
	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));

	return acked;
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
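
/* Worked example (illustrative numbers only): with tcp_snd_cwnd(tp) == 8,
 * tp->snd_ssthresh == 10 and acked == 5, cwnd is capped at ssthresh (10),
 * the window is set to 10 (subject to snd_cwnd_clamp), and the remaining
 * 3 acks are returned so the caller can apply them in congestion avoidance.
 */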

/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
 * for every packet that was ACKed.
 */
__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
	/* If credits accumulated at a higher w, apply them gently now. */
	if (tp->snd_cwnd_cnt >= w) {
		tp->snd_cwnd_cnt = 0;
		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
	}

	tp->snd_cwnd_cnt += acked;
	if (tp->snd_cwnd_cnt >= w) {
		u32 delta = tp->snd_cwnd_cnt / w;

		tp->snd_cwnd_cnt -= delta * w;
		tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta);
	}
	tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
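
/* Worked example (illustrative numbers only): with w == 10 and acked == 2
 * on every call, snd_cwnd_cnt accumulates 2 credits per invocation; on the
 * fifth call it reaches 10, delta = 10 / 10 = 1, so cwnd grows by one
 * segment and the counter resets, i.e. roughly one cwnd increment per w
 * acked packets.
 */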

/*
 * TCP Reno congestion control
 * This is a special case used for fallback as well.
 */
/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk))
		return;

	/* In "safe" area, increase. */
	if (tcp_in_slow_start(tp)) {
		acked = tcp_slow_start(tp, acked);
		if (!acked)
			return;
	}
	/* In dangerous area, increase slowly. */
	tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);

/* Slow start threshold is half the congestion window (min 2) */
__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);

__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);

	return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);

struct tcp_congestion_ops tcp_reno = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "reno",
	.owner		= THIS_MODULE,
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= tcp_reno_undo_cwnd,
};