// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_est.c: simple rate estimator for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *              Network name space (netns) aware.
 *              Global data moved to netns i.e. struct netns_ipvs
 *              Affected data: est_list and est_lock.
 *              estimation_timer() runs with timer per netns.
 *              get_stats() does the per-CPU summing.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/rcupdate_wait.h>

#include <net/ip_vs.h>

/*
  This code is to estimate rate in a shorter interval (such as 8
  seconds) for virtual services and real servers. To measure rate over a
  long interval, it is easy to implement a user-level daemon which
  periodically reads these statistical counters and computes the rate.

  We measure rate during the last 8 seconds every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)
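
    For example, with W = 1/4, an old avgrate of 100 pkts/s and a new
    2-second sample of 200 pkts/s:

      avgrate = 100*(3/4) + 200*(1/4) = 125 pkts/s

    i.e. each sample moves the average a quarter of the way toward the
    current rate; in fixed point this becomes avg += (rate - avg) >> 2,
    as done in ip_vs_chain_estimation() below.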

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

  * A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-CPU in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from processes

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to an array
  - the kthread tasks are started when the first service is added, before
    that the total stats are not estimated
  - when configuration (cpulist/nice) is changed, the tasks are restarted
    by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists with estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them to multiple chains which are estimated at different times
  - on start, kthread 0 enters a calculation phase to determine the chain
    limits and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests, can point to an empty slot
    where we should add kt data
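
  For orientation, a sketch of the per-kthread layout as used by the code
  below (the array sizes are set in ip_vs.h):

    kd->ticks[IPVS_EST_NTICKS]            tick rows, one row per wakeup
      td->chains[IPVS_EST_TICK_CHAINS]    RCU hlists of ip_vs_estimator

  One tick row is processed every IPVS_EST_TICK jiffies, so a full pass
  over all rows covers the 2-second estimation period and every
  estimator is updated once per period.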
 */

static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

static void ip_vs_chain_estimation(struct hlist_head *chain)
{
        struct ip_vs_estimator *e;
        struct ip_vs_cpu_stats *c;
        struct ip_vs_stats *s;
        u64 rate;

        hlist_for_each_entry_rcu(e, chain, list) {
                u64 conns, inpkts, outpkts, inbytes, outbytes;
                u64 kconns = 0, kinpkts = 0, koutpkts = 0;
                u64 kinbytes = 0, koutbytes = 0;
                unsigned int start;
                int i;

                if (kthread_should_stop())
                        break;

                s = container_of(e, struct ip_vs_stats, est);
                for_each_possible_cpu(i) {
                        c = per_cpu_ptr(s->cpustats, i);
                        do {
                                start = u64_stats_fetch_begin(&c->syncp);
                                conns = u64_stats_read(&c->cnt.conns);
                                inpkts = u64_stats_read(&c->cnt.inpkts);
                                outpkts = u64_stats_read(&c->cnt.outpkts);
                                inbytes = u64_stats_read(&c->cnt.inbytes);
                                outbytes = u64_stats_read(&c->cnt.outbytes);
                        } while (u64_stats_fetch_retry(&c->syncp, start));
                        kconns += conns;
                        kinpkts += inpkts;
                        koutpkts += outpkts;
                        kinbytes += inbytes;
                        koutbytes += outbytes;
                }

                spin_lock(&s->lock);

                s->kstats.conns = kconns;
                s->kstats.inpkts = kinpkts;
                s->kstats.outpkts = koutpkts;
                s->kstats.inbytes = kinbytes;
                s->kstats.outbytes = koutbytes;

                /* cps/pps: delta scaled by 2^10 but divided by the
                 * 2-second interval, hence << 9; >> 2 applies W = 2^(-2)
                 */
                rate = (s->kstats.conns - e->last_conns) << 9;
                e->last_conns = s->kstats.conns;
                e->cps += ((s64)rate - (s64)e->cps) >> 2;

                rate = (s->kstats.inpkts - e->last_inpkts) << 9;
                e->last_inpkts = s->kstats.inpkts;
                e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

                rate = (s->kstats.outpkts - e->last_outpkts) << 9;
                e->last_outpkts = s->kstats.outpkts;
                e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

                /* bps: scaled by 2^5, divided by 2 seconds, hence << 4 */
                rate = (s->kstats.inbytes - e->last_inbytes) << 4;
                e->last_inbytes = s->kstats.inbytes;
                e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

                rate = (s->kstats.outbytes - e->last_outbytes) << 4;
                e->last_outbytes = s->kstats.outbytes;
                e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
                spin_unlock(&s->lock);
        }
}

static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
        struct ip_vs_est_tick_data *td;
        int cid;

        rcu_read_lock();
        td = rcu_dereference(kd->ticks[row]);
        if (!td)
                goto out;
        for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
                if (kthread_should_stop())
                        break;
                ip_vs_chain_estimation(&td->chains[cid]);
                cond_resched_rcu();
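                /* cond_resched_rcu() may have dropped the RCU read lock,
                 * so the tick data must be re-read before touching more
                 * chains
                 */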
                td = rcu_dereference(kd->ticks[row]);
                if (!td)
                        break;
        }

out:
        rcu_read_unlock();
}

static int ip_vs_estimation_kthread(void *data)
{
        struct ip_vs_est_kt_data *kd = data;
        struct netns_ipvs *ipvs = kd->ipvs;
        int row = kd->est_row;
        unsigned long now;
        int id = kd->id;
        long gap;

        if (id > 0) {
                if (!ipvs->est_chain_max)
                        return 0;
        } else {
                if (!ipvs->est_chain_max) {
                        ipvs->est_calc_phase = 1;
                        /* commit est_calc_phase before reading est_genid */
                        smp_mb();
                }

                /* kthread 0 will handle the calc phase */
                if (ipvs->est_calc_phase)
                        ip_vs_est_calc_phase(ipvs);
        }

        while (1) {
                if (!id && !hlist_empty(&ipvs->est_temp_list))
                        ip_vs_est_drain_temp_list(ipvs);
                set_current_state(TASK_IDLE);
                if (kthread_should_stop())
                        break;

                /* before estimation, check if we should sleep */
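                /* est_timer is the deadline of the current tick: sleep at
                 * most one tick if it is in the future; if we fell behind
                 * by more than 8 ticks, resync to now instead of replaying
                 * the missed ticks back to back
                 */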
                now = jiffies;
                gap = kd->est_timer - now;
                if (gap > 0) {
                        if (gap > IPVS_EST_TICK) {
                                kd->est_timer = now + IPVS_EST_TICK;
                                gap = IPVS_EST_TICK;
                        }
                        schedule_timeout(gap);
                } else {
                        __set_current_state(TASK_RUNNING);
                        if (gap < -8 * IPVS_EST_TICK)
                                kd->est_timer = now;
                }

                if (kd->tick_len[row])
                        ip_vs_tick_estimation(kd, row);

                row++;
                if (row >= IPVS_EST_NTICKS)
                        row = 0;
                WRITE_ONCE(kd->est_row, row);
                kd->est_timer += IPVS_EST_TICK;
        }
        __set_current_state(TASK_RUNNING);

        return 0;
}

/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
        /* Ignore reloads before first service is added */
        if (!ipvs->enable)
                return;
        ip_vs_est_stopped_recalc(ipvs);
        /* Bump the kthread configuration genid */
        atomic_inc(&ipvs->est_genid);
        queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/* Start kthread task with current configuration */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
                            struct ip_vs_est_kt_data *kd)
{
        unsigned long now;
        int ret = 0;
        long gap;

        lockdep_assert_held(&ipvs->est_mutex);

        if (kd->task)
                goto out;
        now = jiffies;
        gap = kd->est_timer - now;
        /* Sync est_timer if task is starting later */
        if (abs(gap) > 4 * IPVS_EST_TICK)
                kd->est_timer = now;
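        /* The task name encodes the netns generation (ipvs->gen) and the
         * kthread id, e.g. "ipvs-e:0:0" for the first kthread
         */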
        kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
                                  ipvs->gen, kd->id);
        if (IS_ERR(kd->task)) {
                ret = PTR_ERR(kd->task);
                kd->task = NULL;
                goto out;
        }

        set_user_nice(kd->task, sysctl_est_nice(ipvs));
        set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));

        pr_info("starting estimator thread %d...\n", kd->id);
        wake_up_process(kd->task);

out:
        return ret;
}

void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
        if (kd->task) {
                pr_info("stopping estimator thread %d...\n", kd->id);
                kthread_stop(kd->task);
                kd->task = NULL;
        }
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
                                 struct ip_vs_est_kt_data *kd)
{
        kd->chain_max = ipvs->est_chain_max;
        /* We are using single chain on RCU preemption */
        if (IPVS_EST_TICK_CHAINS == 1)
                kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
        kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
        kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
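        /* Illustration (the constants live in ip_vs.h, the numbers here
         * are only an example): a measured chain_max of 32 with
         * IPVS_EST_TICK_CHAINS == 48 and IPVS_EST_NTICKS == 50 gives
         * tick_max = 1536 and est_max_count = 76800 ests per kthread
         */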
}

/* Create and start estimation kthread in a free or new array slot */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
        struct ip_vs_est_kt_data *kd = NULL;
        int id = ipvs->est_kt_count;
        int ret = -ENOMEM;
        void *arr = NULL;
        int i;

        if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
            ipvs->enable && ipvs->est_max_threads)
                return -EINVAL;

        mutex_lock(&ipvs->est_mutex);

        for (i = 0; i < id; i++) {
                if (!ipvs->est_kt_arr[i])
                        break;
        }
        if (i >= id) {
                arr = krealloc_array(ipvs->est_kt_arr, id + 1,
                                     sizeof(struct ip_vs_est_kt_data *),
                                     GFP_KERNEL);
                if (!arr)
                        goto out;
                ipvs->est_kt_arr = arr;
        } else {
                id = i;
        }

        kd = kzalloc(sizeof(*kd), GFP_KERNEL);
        if (!kd)
                goto out;
        kd->ipvs = ipvs;
        bitmap_fill(kd->avail, IPVS_EST_NTICKS);
        kd->est_timer = jiffies;
        kd->id = id;
        ip_vs_est_set_params(ipvs, kd);

        /* Pre-allocate stats used in calc phase */
        if (!id && !kd->calc_stats) {
                kd->calc_stats = ip_vs_stats_alloc();
                if (!kd->calc_stats)
                        goto out;
        }

        /* Start kthread tasks only when services are present */
        if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
                ret = ip_vs_est_kthread_start(ipvs, kd);
                if (ret < 0)
                        goto out;
        }

        if (arr)
                ipvs->est_kt_count++;
        ipvs->est_kt_arr[id] = kd;
        kd = NULL;
        /* Use most recent kthread for new ests */
        ipvs->est_add_ktid = id;
        ret = 0;

out:
        mutex_unlock(&ipvs->est_mutex);
        if (kd) {
                ip_vs_stats_free(kd->calc_stats);
                kfree(kd);
        }

        return ret;
}

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
        int ktid, best = ipvs->est_kt_count;
        struct ip_vs_est_kt_data *kd;

        for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
                kd = ipvs->est_kt_arr[ktid];
                if (kd) {
                        if (kd->est_count < kd->est_max_count) {
                                best = ktid;
                                break;
                        }
                } else if (ktid < best) {
                        best = ktid;
                }
        }
        ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid) */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
                                   struct ip_vs_estimator *est)
{
        struct ip_vs_est_kt_data *kd = NULL;
        struct ip_vs_est_tick_data *td;
        int ktid, row, crow, cid, ret;
        int delay = est->ktrow;

        BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
                         "Too many chains for ktcid");

        if (ipvs->est_add_ktid < ipvs->est_kt_count) {
                kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
                if (kd)
                        goto add_est;
        }

        ret = ip_vs_est_add_kthread(ipvs);
        if (ret < 0)
                goto out;
        kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
        ktid = kd->id;
        /* For small number of estimators prefer to use few ticks,
         * otherwise try to add into the last estimated row.
         * est_row and add_row point after the row we should use
         */
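        /* Example (derived from the logic below, not normative): a fresh
         * estimator arrives with delay == IPVS_EST_NTICKS - 1, so it
         * lands on the row just before add_row (mod IPVS_EST_NTICKS) and
         * gets almost a full 2-second period before its first run;
         * find_last_bit() falls back to earlier rows if that tick is full
         */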
        if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
                crow = READ_ONCE(kd->est_row);
        else
                crow = kd->add_row;
        crow += delay;
        if (crow >= IPVS_EST_NTICKS)
                crow -= IPVS_EST_NTICKS;
        /* Assume initial delay ? */
        if (delay >= IPVS_EST_NTICKS - 1) {
                /* Preserve initial delay or decrease it if no space in tick */
                row = crow;
                if (crow < IPVS_EST_NTICKS - 1) {
                        crow++;
                        row = find_last_bit(kd->avail, crow);
                }
                if (row >= crow)
                        row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
        } else {
                /* Preserve delay or increase it if no space in tick */
                row = IPVS_EST_NTICKS;
                if (crow > 0)
                        row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
                if (row >= IPVS_EST_NTICKS)
                        row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
        }

        td = rcu_dereference_protected(kd->ticks[row], 1);
        if (!td) {
                td = kzalloc(sizeof(*td), GFP_KERNEL);
                if (!td) {
                        ret = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(kd->ticks[row], td);
        }

        cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

        kd->est_count++;
        kd->tick_len[row]++;
        if (!td->chain_len[cid])
                __set_bit(cid, td->present);
        td->chain_len[cid]++;
        est->ktid = ktid;
        est->ktrow = row;
        est->ktcid = cid;
        hlist_add_head_rcu(&est->list, &td->chains[cid]);

        if (td->chain_len[cid] >= kd->chain_max) {
                __set_bit(cid, td->full);
                if (kd->tick_len[row] >= kd->tick_max)
                        __clear_bit(row, kd->avail);
        }

        /* Update est_add_ktid to point to first available/empty kt slot */
        if (kd->est_count == kd->est_max_count)
                ip_vs_est_update_ktid(ipvs);

        ret = 0;

out:
        return ret;
}

/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
        struct ip_vs_estimator *est = &stats->est;
        int ret;

        if (!ipvs->est_max_threads && ipvs->enable)
                ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

        est->ktid = -1;
        est->ktrow = IPVS_EST_NTICKS - 1;       /* Initial delay */

        /* We prefer this code to be short, kthread 0 will requeue the
         * estimator to available chain. If tasks are disabled, we
         * will not allocate much memory, just for kt 0.
         */
        ret = 0;
        if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
                ret = ip_vs_est_add_kthread(ipvs);
        if (ret >= 0)
                hlist_add_head(&est->list, &ipvs->est_temp_list);
        else
                INIT_HLIST_NODE(&est->list);
        return ret;
}

static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
        if (kd) {
                if (kd->task) {
                        pr_info("stop unused estimator thread %d...\n", kd->id);
                        kthread_stop(kd->task);
                }
                ip_vs_stats_free(kd->calc_stats);
                kfree(kd);
        }
}

/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
        struct ip_vs_estimator *est = &stats->est;
        struct ip_vs_est_tick_data *td;
        struct ip_vs_est_kt_data *kd;
        int ktid = est->ktid;
        int row = est->ktrow;
        int cid = est->ktcid;

        /* Failed to add to chain ? */
        if (hlist_unhashed(&est->list))
                return;

        /* On return, estimator can be freed, dequeue it now */

        /* In est_temp_list ? */
        if (ktid < 0) {
                hlist_del(&est->list);
                goto end_kt0;
        }

        hlist_del_rcu(&est->list);
        kd = ipvs->est_kt_arr[ktid];
        td = rcu_dereference_protected(kd->ticks[row], 1);
        __clear_bit(cid, td->full);
        td->chain_len[cid]--;
        if (!td->chain_len[cid])
                __clear_bit(cid, td->present);
        kd->tick_len[row]--;
        __set_bit(row, kd->avail);
        if (!kd->tick_len[row]) {
                RCU_INIT_POINTER(kd->ticks[row], NULL);
                kfree_rcu(td, rcu_head);
        }
        kd->est_count--;
        if (kd->est_count) {
                /* This kt slot can become available just now, prefer it */
                if (ktid < ipvs->est_add_ktid)
                        ipvs->est_add_ktid = ktid;
                return;
        }

        if (ktid > 0) {
                mutex_lock(&ipvs->est_mutex);
                ip_vs_est_kthread_destroy(kd);
                ipvs->est_kt_arr[ktid] = NULL;
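                /* If this was the last used slot, also shrink est_kt_count
                 * past any trailing empty slots, always keeping slot 0
                 */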
                if (ktid == ipvs->est_kt_count - 1) {
                        ipvs->est_kt_count--;
                        while (ipvs->est_kt_count > 1 &&
                               !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
                                ipvs->est_kt_count--;
                }
                mutex_unlock(&ipvs->est_mutex);

                /* This slot is now empty, prefer another available kt slot */
                if (ktid == ipvs->est_add_ktid)
                        ip_vs_est_update_ktid(ipvs);
        }

end_kt0:
        /* kt 0 is freed after all other kthreads and chains are empty */
        if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
                kd = ipvs->est_kt_arr[0];
                if (!kd || !kd->est_count) {
                        mutex_lock(&ipvs->est_mutex);
                        if (kd) {
                                ip_vs_est_kthread_destroy(kd);
                                ipvs->est_kt_arr[0] = NULL;
                        }
                        ipvs->est_kt_count--;
                        mutex_unlock(&ipvs->est_mutex);
                        ipvs->est_add_ktid = 0;
                }
        }
}

/* Register all ests from est_temp_list to kthreads */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
        struct ip_vs_estimator *est;

        while (1) {
                int max = 16;

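                /* Move at most 16 ests per __ip_vs_mutex hold, then drop
                 * the mutex and reschedule so other users are not starved
                 */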
                mutex_lock(&__ip_vs_mutex);

                while (max-- > 0) {
                        est = hlist_entry_safe(ipvs->est_temp_list.first,
                                               struct ip_vs_estimator, list);
                        if (est) {
                                if (kthread_should_stop())
                                        goto unlock;
                                hlist_del_init(&est->list);
                                if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
                                        continue;
                                est->ktid = -1;
                                hlist_add_head(&est->list,
                                               &ipvs->est_temp_list);
                                /* Abort, some entries will not be estimated
                                 * until next attempt
                                 */
                        }
                        goto unlock;
                }
                mutex_unlock(&__ip_vs_mutex);
                cond_resched();
        }

unlock:
        mutex_unlock(&__ip_vs_mutex);
}

/* Calculate limits for all kthreads */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
        struct ip_vs_est_kt_data *kd;
        struct hlist_head chain;
        struct ip_vs_stats *s;
        int cache_factor = 4;
        int i, loops, ntest;
        s32 min_est = 0;
        ktime_t t1, t2;
        int max = 8;
        int ret = 1;
        s64 diff;
        u64 val;

        INIT_HLIST_HEAD(&chain);
        mutex_lock(&__ip_vs_mutex);
        kd = ipvs->est_kt_arr[0];
        mutex_unlock(&__ip_vs_mutex);
        s = kd ? kd->calc_stats : NULL;
        if (!s)
                goto out;
        hlist_add_head(&s->est.list, &chain);

        loops = 1;
        /* Get best result from many tests */
        for (ntest = 0; ntest < 12; ntest++) {
                if (!(ntest & 3)) {
                        /* Wait for cpufreq frequency transition */
                        wait_event_idle_timeout(wq, kthread_should_stop(),
                                                HZ / 50);
                        if (!ipvs->enable || kthread_should_stop())
                                goto stop;
                }

                local_bh_disable();
                rcu_read_lock();

                /* Put stats in cache */
                ip_vs_chain_estimation(&chain);

                t1 = ktime_get();
                for (i = loops * cache_factor; i > 0; i--)
                        ip_vs_chain_estimation(&chain);
                t2 = ktime_get();

                rcu_read_unlock();
                local_bh_enable();

                if (!ipvs->enable || kthread_should_stop())
                        goto stop;
                cond_resched();

                diff = ktime_to_ns(ktime_sub(t2, t1));
                if (diff <= 1 * NSEC_PER_USEC) {
                        /* Do more loops on low time resolution */
                        loops *= 2;
                        continue;
                }
                if (diff >= NSEC_PER_SEC)
                        continue;
                val = diff;
                do_div(val, loops);
                if (!min_est || val < min_est) {
                        min_est = val;
                        /* goal: 95usec per chain */
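                        /* e.g. a min_est of 4000ns gives a chain_max of
                         * 95000 / 4000 = 23 estimators per chain
                         */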
                        val = 95 * NSEC_PER_USEC;
                        if (val >= min_est) {
                                do_div(val, min_est);
                                max = (int)val;
                        } else {
                                max = 1;
                        }
                }
        }

out:
        if (s)
                hlist_del_init(&s->est.list);
        *chain_max = max;
        return ret;

stop:
        ret = 0;
        goto out;
}

/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 * ECP  ECM     Insert Chain    enable  Description
 * ---------------------------------------------------------------------------
 * 0    0       est_temp_list   0       create kt #0 context
 * 0    0       est_temp_list   0->1    service added, start kthread #0 task
 * 0->1 0       est_temp_list   1       kt task #0 started, enters calc phase
 * 1    0       est_temp_list   1       kt #0: determine est_chain_max,
 *                                      stop tasks, move ests to est_temp_list
 *                                      and free kd for kthreads 1..last
 * 1->0 0->N    kt chains       1       ests can go to kthreads
 * 0    N       kt chains       1       drain est_temp_list, create new kthread
 *                                      contexts, start tasks, estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
        int genid = atomic_read(&ipvs->est_genid);
        struct ip_vs_est_tick_data *td;
        struct ip_vs_est_kt_data *kd;
        struct ip_vs_estimator *est;
        struct ip_vs_stats *stats;
        int id, row, cid, delay;
        bool last, last_td;
        int chain_max;
        int step;

        if (!ip_vs_est_calc_limits(ipvs, &chain_max))
                return;

        mutex_lock(&__ip_vs_mutex);

        /* Stop all other tasks, so that we can immediately move the
         * estimators to est_temp_list without RCU grace period
         */
        mutex_lock(&ipvs->est_mutex);
        for (id = 1; id < ipvs->est_kt_count; id++) {
                /* netns clean up started, abort */
                if (!ipvs->enable)
                        goto unlock2;
                kd = ipvs->est_kt_arr[id];
                if (!kd)
                        continue;
                ip_vs_est_kthread_stop(kd);
        }
        mutex_unlock(&ipvs->est_mutex);

        /* Move all estimators to est_temp_list but carefully,
         * all estimators and kthread data can be released while
         * we reschedule. Even for kthread 0.
         */
        step = 0;

        /* Order entries in est_temp_list in ascending delay, so now
         * walk delay(desc), id(desc), cid(asc)
         */
        delay = IPVS_EST_NTICKS;

next_delay:
        delay--;
        if (delay < 0)
                goto end_dequeue;

last_kt:
        /* Destroy contexts backwards */
        id = ipvs->est_kt_count;

next_kt:
        if (!ipvs->enable || kthread_should_stop())
                goto unlock;
        id--;
        if (id < 0)
                goto next_delay;
        kd = ipvs->est_kt_arr[id];
        if (!kd)
                goto next_kt;
        /* kt 0 can exist with empty chains */
        if (!id && kd->est_count <= 1)
                goto next_delay;

        row = kd->est_row + delay;
        if (row >= IPVS_EST_NTICKS)
                row -= IPVS_EST_NTICKS;
        td = rcu_dereference_protected(kd->ticks[row], 1);
        if (!td)
                goto next_kt;

        cid = 0;

walk_chain:
        if (kthread_should_stop())
                goto unlock;
        step++;
        if (!(step & 63)) {
                /* Give estimators a chance to be added (to est_temp_list)
                 * and deleted (releasing kthread contexts)
                 */
                mutex_unlock(&__ip_vs_mutex);
                cond_resched();
                mutex_lock(&__ip_vs_mutex);

                /* Current kt released ? */
                if (id >= ipvs->est_kt_count)
                        goto last_kt;
                if (kd != ipvs->est_kt_arr[id])
                        goto next_kt;
                /* Current td released ? */
                if (td != rcu_dereference_protected(kd->ticks[row], 1))
                        goto next_kt;
                /* No fatal changes on the current kd and td */
        }
        est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
                               list);
        if (!est) {
                cid++;
                if (cid >= IPVS_EST_TICK_CHAINS)
                        goto next_kt;
                goto walk_chain;
        }
        /* We can cheat and increase est_count to protect kt 0 context
         * from release but we prefer to keep the last estimator
         */
        last = kd->est_count <= 1;
        /* Do not free kt #0 data */
        if (!id && last)
                goto next_delay;
        last_td = kd->tick_len[row] <= 1;
        stats = container_of(est, struct ip_vs_stats, est);
        ip_vs_stop_estimator(ipvs, stats);
        /* Tasks are stopped, move without RCU grace period */
        est->ktid = -1;
        est->ktrow = row - kd->est_row;
        if (est->ktrow < 0)
                est->ktrow += IPVS_EST_NTICKS;
        hlist_add_head(&est->list, &ipvs->est_temp_list);
        /* kd freed ? */
        if (last)
                goto next_kt;
        /* td freed ? */
        if (last_td)
                goto next_kt;
        goto walk_chain;

end_dequeue:
        /* All estimators removed while calculating ? */
        if (!ipvs->est_kt_count)
                goto unlock;
        kd = ipvs->est_kt_arr[0];
        if (!kd)
                goto unlock;
        kd->add_row = kd->est_row;
        ipvs->est_chain_max = chain_max;
        ip_vs_est_set_params(ipvs, kd);

        pr_info("using max %d ests per chain, %d per kthread\n",
                kd->chain_max, kd->est_max_count);

        /* Try to keep tot_stats in kt0, enqueue it early */
        if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
            ipvs->tot_stats->s.est.ktid == -1) {
                hlist_del(&ipvs->tot_stats->s.est.list);
                hlist_add_head(&ipvs->tot_stats->s.est.list,
                               &ipvs->est_temp_list);
        }

        mutex_lock(&ipvs->est_mutex);

        /* We completed the calc phase, new calc phase not requested */
        if (genid == atomic_read(&ipvs->est_genid))
                ipvs->est_calc_phase = 0;

unlock2:
        mutex_unlock(&ipvs->est_mutex);

unlock:
        mutex_unlock(&__ip_vs_mutex);
}

void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
        struct ip_vs_estimator *est = &stats->est;
        struct ip_vs_kstats *k = &stats->kstats;

        /* reset counters, caller must hold the stats->lock lock */
        est->last_inbytes = k->inbytes;
        est->last_outbytes = k->outbytes;
        est->last_conns = k->conns;
        est->last_inpkts = k->inpkts;
        est->last_outpkts = k->outpkts;
        est->cps = 0;
        est->inpps = 0;
        est->outpps = 0;
        est->inbps = 0;
        est->outbps = 0;
}

/* Get decoded rates */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
        struct ip_vs_estimator *e = &stats->est;

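        /* Strip the 2^10 (cps/pps) and 2^5 (bps) scaling, rounding up:
         * e.g. an internal cps of 2048 decodes to 2 conns/s
         */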
        dst->cps = (e->cps + 0x1FF) >> 10;
        dst->inpps = (e->inpps + 0x1FF) >> 10;
        dst->outpps = (e->outpps + 0x1FF) >> 10;
        dst->inbps = (e->inbps + 0xF) >> 5;
        dst->outbps = (e->outbps + 0xF) >> 5;
}

int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
        INIT_HLIST_HEAD(&ipvs->est_temp_list);
        ipvs->est_kt_arr = NULL;
        ipvs->est_max_threads = 0;
        ipvs->est_calc_phase = 0;
        ipvs->est_chain_max = 0;
        ipvs->est_kt_count = 0;
        ipvs->est_add_ktid = 0;
        atomic_set(&ipvs->est_genid, 0);
        atomic_set(&ipvs->est_genid_done, 0);
        __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
        return 0;
}

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
        int i;

        for (i = 0; i < ipvs->est_kt_count; i++)
                ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
        kfree(ipvs->est_kt_arr);
        mutex_destroy(&ipvs->est_mutex);
}