// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_est.c: simple rate estimator for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *              Network name space (netns) aware.
 *              Global data moved to netns i.e. struct netns_ipvs
 *              Affected data: est_list and est_lock.
 *              estimation_timer() runs with timer per netns.
 *              get_stats() does the per-CPU summing.
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/types.h>
#include <linux/interrupt.h>
#include <linux/sysctl.h>
#include <linux/list.h>
#include <linux/rcupdate_wait.h>

#include <net/ip_vs.h>

/*
  This code is to estimate rate in a shorter interval (such as 8
  seconds) for virtual services and real servers. To measure rate over a
  long interval, it is easy to implement a user-level daemon which
  periodically reads these statistical counters and computes the rate.

  We measure rate during the last 8 seconds every 2 seconds:

    avgrate = avgrate*(1-W) + rate*W

    where W = 2^(-2)
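
    For example, with W = 1/4, an old avgrate of 100 pkts/s and a new
    2-second sample of 200 pkts/s:

      avgrate = 100*(3/4) + 200*(1/4) = 125 pkts/s

    i.e. each sample moves the average a quarter of the way toward the
    current rate; in fixed point this becomes avg += (rate - avg) >> 2,
    as done in ip_vs_chain_estimation() below.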

  NOTES.

  * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.

  * Netlink users can see 64-bit values but sockopt users are restricted
    to 32-bit values for conns, packets, bps, cps and pps.

  * A lot of code is taken from net/core/gen_estimator.c

  KEY POINTS:
  - cpustats counters are updated per-CPU in SoftIRQ context with BH disabled
  - kthreads read the cpustats to update the estimators (svcs, dests, total)
  - the states of estimators can be read (get stats) or modified (zero stats)
    from processes

  KTHREADS:
  - estimators are added initially to est_temp_list and later kthread 0
    distributes them to one or many kthreads for estimation
  - kthread contexts are created and attached to an array
  - the kthread tasks are started when the first service is added, before
    that the total stats are not estimated
  - when configuration (cpulist/nice) is changed, the tasks are restarted
    by work (est_reload_work)
  - kthread tasks are stopped while the cpulist is empty
  - the kthread context holds lists with estimators (chains) which are
    processed every 2 seconds
  - as estimators can be added dynamically and in bursts, we try to spread
    them to multiple chains which are estimated at different times
  - on start, kthread 0 enters a calculation phase to determine the chain
    limits and the limit of estimators per kthread
  - est_add_ktid: ktid where to add new ests, can point to an empty slot
    where we should add kt data
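
  For orientation, a sketch of the per-kthread layout as used by the code
  below (the array sizes are set in ip_vs.h):

    kd->ticks[IPVS_EST_NTICKS]            tick rows, one row per wakeup
      td->chains[IPVS_EST_TICK_CHAINS]    RCU hlists of ip_vs_estimator

  One tick row is processed every IPVS_EST_TICK jiffies, so a full pass
  over all rows covers the 2-second estimation period and every
  estimator is updated once per period.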
 */

static struct lock_class_key __ipvs_est_key;

static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs);
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs);

static void ip_vs_chain_estimation(struct hlist_head *chain)
{
        struct ip_vs_estimator *e;
        struct ip_vs_cpu_stats *c;
        struct ip_vs_stats *s;
        u64 rate;

        hlist_for_each_entry_rcu(e, chain, list) {
                u64 conns, inpkts, outpkts, inbytes, outbytes;
                u64 kconns = 0, kinpkts = 0, koutpkts = 0;
                u64 kinbytes = 0, koutbytes = 0;
                unsigned int start;
                int i;

                if (kthread_should_stop())
                        break;

                s = container_of(e, struct ip_vs_stats, est);
                for_each_possible_cpu(i) {
                        c = per_cpu_ptr(s->cpustats, i);
                        do {
                                start = u64_stats_fetch_begin(&c->syncp);
                                conns = u64_stats_read(&c->cnt.conns);
                                inpkts = u64_stats_read(&c->cnt.inpkts);
                                outpkts = u64_stats_read(&c->cnt.outpkts);
                                inbytes = u64_stats_read(&c->cnt.inbytes);
                                outbytes = u64_stats_read(&c->cnt.outbytes);
                        } while (u64_stats_fetch_retry(&c->syncp, start));
                        kconns += conns;
                        kinpkts += inpkts;
                        koutpkts += outpkts;
                        kinbytes += inbytes;
                        koutbytes += outbytes;
                }

                spin_lock(&s->lock);

                s->kstats.conns = kconns;
                s->kstats.inpkts = kinpkts;
                s->kstats.outpkts = koutpkts;
                s->kstats.inbytes = kinbytes;
                s->kstats.outbytes = koutbytes;

                /* cps/pps: delta scaled by 2^10 but divided by the
                 * 2-second interval, hence << 9; >> 2 applies W = 2^(-2)
                 */
                rate = (s->kstats.conns - e->last_conns) << 9;
                e->last_conns = s->kstats.conns;
                e->cps += ((s64)rate - (s64)e->cps) >> 2;

                rate = (s->kstats.inpkts - e->last_inpkts) << 9;
                e->last_inpkts = s->kstats.inpkts;
                e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

                rate = (s->kstats.outpkts - e->last_outpkts) << 9;
                e->last_outpkts = s->kstats.outpkts;
                e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

                /* bps: scaled by 2^5, divided by 2 seconds, hence << 4 */
                rate = (s->kstats.inbytes - e->last_inbytes) << 4;
                e->last_inbytes = s->kstats.inbytes;
                e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

                rate = (s->kstats.outbytes - e->last_outbytes) << 4;
                e->last_outbytes = s->kstats.outbytes;
                e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
                spin_unlock(&s->lock);
        }
}

static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row)
{
        struct ip_vs_est_tick_data *td;
        int cid;

        rcu_read_lock();
        td = rcu_dereference(kd->ticks[row]);
        if (!td)
                goto out;
        for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) {
                if (kthread_should_stop())
                        break;
                ip_vs_chain_estimation(&td->chains[cid]);
                cond_resched_rcu();
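                /* cond_resched_rcu() may have dropped the RCU read lock,
                 * so the tick data must be re-read before touching more
                 * chains
                 */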
                td = rcu_dereference(kd->ticks[row]);
                if (!td)
                        break;
        }

out:
        rcu_read_unlock();
}

static int ip_vs_estimation_kthread(void *data)
{
        struct ip_vs_est_kt_data *kd = data;
        struct netns_ipvs *ipvs = kd->ipvs;
        int row = kd->est_row;
        unsigned long now;
        int id = kd->id;
        long gap;

        if (id > 0) {
                if (!ipvs->est_chain_max)
                        return 0;
        } else {
                if (!ipvs->est_chain_max) {
                        ipvs->est_calc_phase = 1;
                        /* commit est_calc_phase before reading est_genid */
                        smp_mb();
                }

                /* kthread 0 will handle the calc phase */
                if (ipvs->est_calc_phase)
                        ip_vs_est_calc_phase(ipvs);
        }

        while (1) {
                if (!id && !hlist_empty(&ipvs->est_temp_list))
                        ip_vs_est_drain_temp_list(ipvs);
                set_current_state(TASK_IDLE);
                if (kthread_should_stop())
                        break;

                /* before estimation, check if we should sleep */
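                /* est_timer is the deadline of the current tick: sleep at
                 * most one tick if it is in the future; if we fell behind
                 * by more than 8 ticks, resync to now instead of replaying
                 * the missed ticks back to back
                 */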
                now = jiffies;
                gap = kd->est_timer - now;
                if (gap > 0) {
                        if (gap > IPVS_EST_TICK) {
                                kd->est_timer = now + IPVS_EST_TICK;
                                gap = IPVS_EST_TICK;
                        }
                        schedule_timeout(gap);
                } else {
                        __set_current_state(TASK_RUNNING);
                        if (gap < -8 * IPVS_EST_TICK)
                                kd->est_timer = now;
                }

                if (kd->tick_len[row])
                        ip_vs_tick_estimation(kd, row);

                row++;
                if (row >= IPVS_EST_NTICKS)
                        row = 0;
                WRITE_ONCE(kd->est_row, row);
                kd->est_timer += IPVS_EST_TICK;
        }
        __set_current_state(TASK_RUNNING);

        return 0;
}

/* Schedule stop/start for kthread tasks */
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
        /* Ignore reloads before first service is added */
        if (!ipvs->enable)
                return;
        ip_vs_est_stopped_recalc(ipvs);
        /* Bump the kthread configuration genid */
        atomic_inc(&ipvs->est_genid);
        queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0);
}

/* Start kthread task with current configuration */
int ip_vs_est_kthread_start(struct netns_ipvs *ipvs,
                            struct ip_vs_est_kt_data *kd)
{
        unsigned long now;
        int ret = 0;
        long gap;

        lockdep_assert_held(&ipvs->est_mutex);

        if (kd->task)
                goto out;
        now = jiffies;
        gap = kd->est_timer - now;
        /* Sync est_timer if task is starting later */
        if (abs(gap) > 4 * IPVS_EST_TICK)
                kd->est_timer = now;
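        /* The task name encodes the netns generation (ipvs->gen) and the
         * kthread id, e.g. "ipvs-e:0:0" for the first kthread
         */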
        kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d",
                                  ipvs->gen, kd->id);
        if (IS_ERR(kd->task)) {
                ret = PTR_ERR(kd->task);
                kd->task = NULL;
                goto out;
        }

        set_user_nice(kd->task, sysctl_est_nice(ipvs));
        set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs));

        pr_info("starting estimator thread %d...\n", kd->id);
        wake_up_process(kd->task);

out:
        return ret;
}

void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd)
{
        if (kd->task) {
                pr_info("stopping estimator thread %d...\n", kd->id);
                kthread_stop(kd->task);
                kd->task = NULL;
        }
}

/* Apply parameters to kthread */
static void ip_vs_est_set_params(struct netns_ipvs *ipvs,
                                 struct ip_vs_est_kt_data *kd)
{
        kd->chain_max = ipvs->est_chain_max;
        /* We are using single chain on RCU preemption */
        if (IPVS_EST_TICK_CHAINS == 1)
                kd->chain_max *= IPVS_EST_CHAIN_FACTOR;
        kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max;
        kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max;
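        /* Illustration (the constants live in ip_vs.h, the numbers here
         * are only an example): a measured chain_max of 32 with
         * IPVS_EST_TICK_CHAINS == 48 and IPVS_EST_NTICKS == 50 gives
         * tick_max = 1536 and est_max_count = 76800 ests per kthread
         */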
}

/* Create and start estimation kthread in a free or new array slot */
static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
{
        struct ip_vs_est_kt_data *kd = NULL;
        int id = ipvs->est_kt_count;
        int ret = -ENOMEM;
        void *arr = NULL;
        int i;

        if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
            ipvs->enable && ipvs->est_max_threads)
                return -EINVAL;

        mutex_lock(&ipvs->est_mutex);

        for (i = 0; i < id; i++) {
                if (!ipvs->est_kt_arr[i])
                        break;
        }
        if (i >= id) {
                arr = krealloc_array(ipvs->est_kt_arr, id + 1,
                                     sizeof(struct ip_vs_est_kt_data *),
                                     GFP_KERNEL);
                if (!arr)
                        goto out;
                ipvs->est_kt_arr = arr;
        } else {
                id = i;
        }

        kd = kzalloc(sizeof(*kd), GFP_KERNEL);
        if (!kd)
                goto out;
        kd->ipvs = ipvs;
        bitmap_fill(kd->avail, IPVS_EST_NTICKS);
        kd->est_timer = jiffies;
        kd->id = id;
        ip_vs_est_set_params(ipvs, kd);

        /* Pre-allocate stats used in calc phase */
        if (!id && !kd->calc_stats) {
                kd->calc_stats = ip_vs_stats_alloc();
                if (!kd->calc_stats)
                        goto out;
        }

        /* Start kthread tasks only when services are present */
        if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
                ret = ip_vs_est_kthread_start(ipvs, kd);
                if (ret < 0)
                        goto out;
        }

        if (arr)
                ipvs->est_kt_count++;
        ipvs->est_kt_arr[id] = kd;
        kd = NULL;
        /* Use most recent kthread for new ests */
        ipvs->est_add_ktid = id;
        ret = 0;

out:
        mutex_unlock(&ipvs->est_mutex);
        if (kd) {
                ip_vs_stats_free(kd->calc_stats);
                kfree(kd);
        }

        return ret;
}

/* Select ktid where to add new ests: available, unused or new slot */
static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs)
{
        int ktid, best = ipvs->est_kt_count;
        struct ip_vs_est_kt_data *kd;

        for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) {
                kd = ipvs->est_kt_arr[ktid];
                if (kd) {
                        if (kd->est_count < kd->est_max_count) {
                                best = ktid;
                                break;
                        }
                } else if (ktid < best) {
                        best = ktid;
                }
        }
        ipvs->est_add_ktid = best;
}

/* Add estimator to current kthread (est_add_ktid) */
static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs,
                                   struct ip_vs_estimator *est)
{
        struct ip_vs_est_kt_data *kd = NULL;
        struct ip_vs_est_tick_data *td;
        int ktid, row, crow, cid, ret;
        int delay = est->ktrow;

        BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127,
                         "Too many chains for ktcid");

        if (ipvs->est_add_ktid < ipvs->est_kt_count) {
                kd = ipvs->est_kt_arr[ipvs->est_add_ktid];
                if (kd)
                        goto add_est;
        }

        ret = ip_vs_est_add_kthread(ipvs);
        if (ret < 0)
                goto out;
        kd = ipvs->est_kt_arr[ipvs->est_add_ktid];

add_est:
        ktid = kd->id;
        /* For small number of estimators prefer to use few ticks,
         * otherwise try to add into the last estimated row.
         * est_row and add_row point after the row we should use
         */
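        /* Example (derived from the logic below, not normative): a fresh
         * estimator arrives with delay == IPVS_EST_NTICKS - 1, so it
         * lands on the row just before add_row (mod IPVS_EST_NTICKS) and
         * gets almost a full 2-second period before its first run;
         * find_last_bit() falls back to earlier rows if that tick is full
         */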
        if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1)
                crow = READ_ONCE(kd->est_row);
        else
                crow = kd->add_row;
        crow += delay;
        if (crow >= IPVS_EST_NTICKS)
                crow -= IPVS_EST_NTICKS;
        /* Assume initial delay ? */
        if (delay >= IPVS_EST_NTICKS - 1) {
                /* Preserve initial delay or decrease it if no space in tick */
                row = crow;
                if (crow < IPVS_EST_NTICKS - 1) {
                        crow++;
                        row = find_last_bit(kd->avail, crow);
                }
                if (row >= crow)
                        row = find_last_bit(kd->avail, IPVS_EST_NTICKS);
        } else {
                /* Preserve delay or increase it if no space in tick */
                row = IPVS_EST_NTICKS;
                if (crow > 0)
                        row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow);
                if (row >= IPVS_EST_NTICKS)
                        row = find_first_bit(kd->avail, IPVS_EST_NTICKS);
        }

        td = rcu_dereference_protected(kd->ticks[row], 1);
        if (!td) {
                td = kzalloc(sizeof(*td), GFP_KERNEL);
                if (!td) {
                        ret = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(kd->ticks[row], td);
        }

        cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS);

        kd->est_count++;
        kd->tick_len[row]++;
        if (!td->chain_len[cid])
                __set_bit(cid, td->present);
        td->chain_len[cid]++;
        est->ktid = ktid;
        est->ktrow = row;
        est->ktcid = cid;
        hlist_add_head_rcu(&est->list, &td->chains[cid]);

        if (td->chain_len[cid] >= kd->chain_max) {
                __set_bit(cid, td->full);
                if (kd->tick_len[row] >= kd->tick_max)
                        __clear_bit(row, kd->avail);
        }

        /* Update est_add_ktid to point to first available/empty kt slot */
        if (kd->est_count == kd->est_max_count)
                ip_vs_est_update_ktid(ipvs);

        ret = 0;

out:
        return ret;
}

/* Start estimation for stats */
int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
        struct ip_vs_estimator *est = &stats->est;
        int ret;

        if (!ipvs->est_max_threads && ipvs->enable)
                ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

        est->ktid = -1;
        est->ktrow = IPVS_EST_NTICKS - 1;       /* Initial delay */

        /* We prefer this code to be short, kthread 0 will requeue the
         * estimator to available chain. If tasks are disabled, we
         * will not allocate much memory, just for kt 0.
         */
        ret = 0;
        if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0])
                ret = ip_vs_est_add_kthread(ipvs);
        if (ret >= 0)
                hlist_add_head(&est->list, &ipvs->est_temp_list);
        else
                INIT_HLIST_NODE(&est->list);
        return ret;
}

static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd)
{
        if (kd) {
                if (kd->task) {
                        pr_info("stop unused estimator thread %d...\n", kd->id);
                        kthread_stop(kd->task);
                }
                ip_vs_stats_free(kd->calc_stats);
                kfree(kd);
        }
}

/* Unlink estimator from chain */
void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
        struct ip_vs_estimator *est = &stats->est;
        struct ip_vs_est_tick_data *td;
        struct ip_vs_est_kt_data *kd;
        int ktid = est->ktid;
        int row = est->ktrow;
        int cid = est->ktcid;

        /* Failed to add to chain ? */
        if (hlist_unhashed(&est->list))
                return;

        /* On return, estimator can be freed, dequeue it now */

        /* In est_temp_list ? */
        if (ktid < 0) {
                hlist_del(&est->list);
                goto end_kt0;
        }

        hlist_del_rcu(&est->list);
        kd = ipvs->est_kt_arr[ktid];
        td = rcu_dereference_protected(kd->ticks[row], 1);
        __clear_bit(cid, td->full);
        td->chain_len[cid]--;
        if (!td->chain_len[cid])
                __clear_bit(cid, td->present);
        kd->tick_len[row]--;
        __set_bit(row, kd->avail);
        if (!kd->tick_len[row]) {
                RCU_INIT_POINTER(kd->ticks[row], NULL);
                kfree_rcu(td, rcu_head);
        }
        kd->est_count--;
        if (kd->est_count) {
                /* This kt slot can become available just now, prefer it */
                if (ktid < ipvs->est_add_ktid)
                        ipvs->est_add_ktid = ktid;
                return;
        }

        if (ktid > 0) {
                mutex_lock(&ipvs->est_mutex);
                ip_vs_est_kthread_destroy(kd);
                ipvs->est_kt_arr[ktid] = NULL;
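                /* If this was the last used slot, also shrink est_kt_count
                 * past any trailing empty slots, always keeping slot 0
                 */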
                if (ktid == ipvs->est_kt_count - 1) {
                        ipvs->est_kt_count--;
                        while (ipvs->est_kt_count > 1 &&
                               !ipvs->est_kt_arr[ipvs->est_kt_count - 1])
                                ipvs->est_kt_count--;
                }
                mutex_unlock(&ipvs->est_mutex);

                /* This slot is now empty, prefer another available kt slot */
                if (ktid == ipvs->est_add_ktid)
                        ip_vs_est_update_ktid(ipvs);
        }

end_kt0:
        /* kt 0 is freed after all other kthreads and chains are empty */
        if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) {
                kd = ipvs->est_kt_arr[0];
                if (!kd || !kd->est_count) {
                        mutex_lock(&ipvs->est_mutex);
                        if (kd) {
                                ip_vs_est_kthread_destroy(kd);
                                ipvs->est_kt_arr[0] = NULL;
                        }
                        ipvs->est_kt_count--;
                        mutex_unlock(&ipvs->est_mutex);
                        ipvs->est_add_ktid = 0;
                }
        }
}

/* Register all ests from est_temp_list to kthreads */
static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs)
{
        struct ip_vs_estimator *est;

        while (1) {
                int max = 16;

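                /* Move at most 16 ests per __ip_vs_mutex hold, then drop
                 * the mutex and reschedule so other users are not starved
                 */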
                mutex_lock(&__ip_vs_mutex);

                while (max-- > 0) {
                        est = hlist_entry_safe(ipvs->est_temp_list.first,
                                               struct ip_vs_estimator, list);
                        if (est) {
                                if (kthread_should_stop())
                                        goto unlock;
                                hlist_del_init(&est->list);
                                if (ip_vs_enqueue_estimator(ipvs, est) >= 0)
                                        continue;
                                est->ktid = -1;
                                hlist_add_head(&est->list,
                                               &ipvs->est_temp_list);
                                /* Abort, some entries will not be estimated
                                 * until next attempt
                                 */
                        }
                        goto unlock;
                }
                mutex_unlock(&__ip_vs_mutex);
                cond_resched();
        }

unlock:
        mutex_unlock(&__ip_vs_mutex);
}

/* Calculate limits for all kthreads */
static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
{
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
        struct ip_vs_est_kt_data *kd;
        struct hlist_head chain;
        struct ip_vs_stats *s;
        int cache_factor = 4;
        int i, loops, ntest;
        s32 min_est = 0;
        ktime_t t1, t2;
        int max = 8;
        int ret = 1;
        s64 diff;
        u64 val;

        INIT_HLIST_HEAD(&chain);
        mutex_lock(&__ip_vs_mutex);
        kd = ipvs->est_kt_arr[0];
        mutex_unlock(&__ip_vs_mutex);
        s = kd ? kd->calc_stats : NULL;
        if (!s)
                goto out;
        hlist_add_head(&s->est.list, &chain);

        loops = 1;
        /* Get best result from many tests */
        for (ntest = 0; ntest < 12; ntest++) {
                if (!(ntest & 3)) {
                        /* Wait for cpufreq frequency transition */
                        wait_event_idle_timeout(wq, kthread_should_stop(),
                                                HZ / 50);
                        if (!ipvs->enable || kthread_should_stop())
                                goto stop;
                }

                local_bh_disable();
                rcu_read_lock();

                /* Put stats in cache */
                ip_vs_chain_estimation(&chain);

                t1 = ktime_get();
                for (i = loops * cache_factor; i > 0; i--)
                        ip_vs_chain_estimation(&chain);
                t2 = ktime_get();

                rcu_read_unlock();
                local_bh_enable();

                if (!ipvs->enable || kthread_should_stop())
                        goto stop;
                cond_resched();

                diff = ktime_to_ns(ktime_sub(t2, t1));
                if (diff <= 1 * NSEC_PER_USEC) {
                        /* Do more loops on low time resolution */
                        loops *= 2;
                        continue;
                }
                if (diff >= NSEC_PER_SEC)
                        continue;
                val = diff;
                do_div(val, loops);
                if (!min_est || val < min_est) {
                        min_est = val;
                        /* goal: 95usec per chain */
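                        /* e.g. a min_est of 4000ns gives a chain_max of
                         * 95000 / 4000 = 23 estimators per chain
                         */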
                        val = 95 * NSEC_PER_USEC;
                        if (val >= min_est) {
                                do_div(val, min_est);
                                max = (int)val;
                        } else {
                                max = 1;
                        }
                }
        }

out:
        if (s)
                hlist_del_init(&s->est.list);
        *chain_max = max;
        return ret;

stop:
        ret = 0;
        goto out;
}

/* Calculate the parameters and apply them in context of kt #0
 * ECP: est_calc_phase
 * ECM: est_chain_max
 * ECP  ECM     Insert Chain    enable  Description
 * ---------------------------------------------------------------------------
 * 0    0       est_temp_list   0       create kt #0 context
 * 0    0       est_temp_list   0->1    service added, start kthread #0 task
 * 0->1 0       est_temp_list   1       kt task #0 started, enters calc phase
 * 1    0       est_temp_list   1       kt #0: determine est_chain_max,
 *                                      stop tasks, move ests to est_temp_list
 *                                      and free kd for kthreads 1..last
 * 1->0 0->N    kt chains       1       ests can go to kthreads
 * 0    N       kt chains       1       drain est_temp_list, create new kthread
 *                                      contexts, start tasks, estimate
 */
static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
{
        int genid = atomic_read(&ipvs->est_genid);
        struct ip_vs_est_tick_data *td;
        struct ip_vs_est_kt_data *kd;
        struct ip_vs_estimator *est;
        struct ip_vs_stats *stats;
        int id, row, cid, delay;
        bool last, last_td;
        int chain_max;
        int step;

        if (!ip_vs_est_calc_limits(ipvs, &chain_max))
                return;

        mutex_lock(&__ip_vs_mutex);

        /* Stop all other tasks, so that we can immediately move the
         * estimators to est_temp_list without RCU grace period
         */
        mutex_lock(&ipvs->est_mutex);
        for (id = 1; id < ipvs->est_kt_count; id++) {
                /* netns clean up started, abort */
                if (!ipvs->enable)
                        goto unlock2;
                kd = ipvs->est_kt_arr[id];
                if (!kd)
                        continue;
                ip_vs_est_kthread_stop(kd);
        }
        mutex_unlock(&ipvs->est_mutex);

        /* Move all estimators to est_temp_list but carefully,
         * all estimators and kthread data can be released while
         * we reschedule. Even for kthread 0.
         */
        step = 0;

        /* Order entries in est_temp_list in ascending delay, so now
         * walk delay(desc), id(desc), cid(asc)
         */
        delay = IPVS_EST_NTICKS;

next_delay:
        delay--;
        if (delay < 0)
                goto end_dequeue;

last_kt:
        /* Destroy contexts backwards */
        id = ipvs->est_kt_count;

next_kt:
        if (!ipvs->enable || kthread_should_stop())
                goto unlock;
        id--;
        if (id < 0)
                goto next_delay;
        kd = ipvs->est_kt_arr[id];
        if (!kd)
                goto next_kt;
        /* kt 0 can exist with empty chains */
        if (!id && kd->est_count <= 1)
                goto next_delay;

        row = kd->est_row + delay;
        if (row >= IPVS_EST_NTICKS)
                row -= IPVS_EST_NTICKS;
        td = rcu_dereference_protected(kd->ticks[row], 1);
        if (!td)
                goto next_kt;

        cid = 0;

walk_chain:
        if (kthread_should_stop())
                goto unlock;
        step++;
        if (!(step & 63)) {
                /* Give estimators a chance to be added (to est_temp_list)
                 * and deleted (releasing kthread contexts)
                 */
                mutex_unlock(&__ip_vs_mutex);
                cond_resched();
                mutex_lock(&__ip_vs_mutex);

                /* Current kt released ? */
                if (id >= ipvs->est_kt_count)
                        goto last_kt;
                if (kd != ipvs->est_kt_arr[id])
                        goto next_kt;
                /* Current td released ? */
                if (td != rcu_dereference_protected(kd->ticks[row], 1))
                        goto next_kt;
                /* No fatal changes on the current kd and td */
        }
        est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator,
                               list);
        if (!est) {
                cid++;
                if (cid >= IPVS_EST_TICK_CHAINS)
                        goto next_kt;
                goto walk_chain;
        }
        /* We can cheat and increase est_count to protect kt 0 context
         * from release but we prefer to keep the last estimator
         */
        last = kd->est_count <= 1;
        /* Do not free kt #0 data */
        if (!id && last)
                goto next_delay;
        last_td = kd->tick_len[row] <= 1;
        stats = container_of(est, struct ip_vs_stats, est);
        ip_vs_stop_estimator(ipvs, stats);
        /* Tasks are stopped, move without RCU grace period */
        est->ktid = -1;
        est->ktrow = row - kd->est_row;
        if (est->ktrow < 0)
                est->ktrow += IPVS_EST_NTICKS;
        hlist_add_head(&est->list, &ipvs->est_temp_list);
        /* kd freed ? */
        if (last)
                goto next_kt;
        /* td freed ? */
        if (last_td)
                goto next_kt;
        goto walk_chain;

end_dequeue:
        /* All estimators removed while calculating ? */
        if (!ipvs->est_kt_count)
                goto unlock;
        kd = ipvs->est_kt_arr[0];
        if (!kd)
                goto unlock;
        kd->add_row = kd->est_row;
        ipvs->est_chain_max = chain_max;
        ip_vs_est_set_params(ipvs, kd);

        pr_info("using max %d ests per chain, %d per kthread\n",
                kd->chain_max, kd->est_max_count);

        /* Try to keep tot_stats in kt0, enqueue it early */
        if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) &&
            ipvs->tot_stats->s.est.ktid == -1) {
                hlist_del(&ipvs->tot_stats->s.est.list);
                hlist_add_head(&ipvs->tot_stats->s.est.list,
                               &ipvs->est_temp_list);
        }

        mutex_lock(&ipvs->est_mutex);

        /* We completed the calc phase, new calc phase not requested */
        if (genid == atomic_read(&ipvs->est_genid))
                ipvs->est_calc_phase = 0;

unlock2:
        mutex_unlock(&ipvs->est_mutex);

unlock:
        mutex_unlock(&__ip_vs_mutex);
}

void ip_vs_zero_estimator(struct ip_vs_stats *stats)
{
        struct ip_vs_estimator *est = &stats->est;
        struct ip_vs_kstats *k = &stats->kstats;

        /* reset counters, caller must hold the stats->lock lock */
        est->last_inbytes = k->inbytes;
        est->last_outbytes = k->outbytes;
        est->last_conns = k->conns;
        est->last_inpkts = k->inpkts;
        est->last_outpkts = k->outpkts;
        est->cps = 0;
        est->inpps = 0;
        est->outpps = 0;
        est->inbps = 0;
        est->outbps = 0;
}

/* Get decoded rates */
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
        struct ip_vs_estimator *e = &stats->est;

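        /* Strip the 2^10 (cps/pps) and 2^5 (bps) scaling, rounding up:
         * e.g. an internal cps of 2048 decodes to 2 conns/s
         */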
        dst->cps = (e->cps + 0x1FF) >> 10;
        dst->inpps = (e->inpps + 0x1FF) >> 10;
        dst->outpps = (e->outpps + 0x1FF) >> 10;
        dst->inbps = (e->inbps + 0xF) >> 5;
        dst->outbps = (e->outbps + 0xF) >> 5;
}

int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
        INIT_HLIST_HEAD(&ipvs->est_temp_list);
        ipvs->est_kt_arr = NULL;
        ipvs->est_max_threads = 0;
        ipvs->est_calc_phase = 0;
        ipvs->est_chain_max = 0;
        ipvs->est_kt_count = 0;
        ipvs->est_add_ktid = 0;
        atomic_set(&ipvs->est_genid, 0);
        atomic_set(&ipvs->est_genid_done, 0);
        __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key);
        return 0;
}

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
        int i;

        for (i = 0; i < ipvs->est_kt_count; i++)
                ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]);
        kfree(ipvs->est_kt_arr);
        mutex_destroy(&ipvs->est_mutex);
}