1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Block rq-qos base io controller |
4 | * |
5 | * This works similar to wbt with a few exceptions |
6 | * |
7 | * - It's bio based, so the latency covers the whole block layer in addition to |
8 | * the actual io. |
9 | * - We will throttle all IO that comes in here if we need to. |
10 | * - We use the mean latency over the 100ms window. This is because writes can |
11 | * be particularly fast, which could give us a false sense of the impact of |
12 | * other workloads on our protected workload. |
13 | * - By default there's no throttling, we set the queue_depth to UINT_MAX so |
14 | * that we can have as many outstanding bio's as we're allowed to. Only at |
15 | * throttle time do we pay attention to the actual queue depth. |
16 | * |
17 | * The hierarchy works like the cpu controller does, we track the latency at |
18 | * every configured node, and each configured node has its own independent |
19 | * queue depth. This means that we only care about our latency targets at the |
20 | * peer level. Some group at the bottom of the hierarchy isn't going to affect |
21 | * a group at the end of some other path if we're only configured at leaf level. |
22 | * |
23 | * Consider the following |
24 | * |
25 | * root blkg |
26 | * / \ |
27 | * fast (target=5ms) slow (target=10ms) |
28 | * / \ / \ |
29 | * a b normal(15ms) unloved |
30 | * |
31 | * "a" and "b" have no target, but their combined io under "fast" cannot exceed |
32 | * an average latency of 5ms. If it does then we will throttle the "slow" |
33 | * group. In the case of "normal", if it exceeds its 15ms target, we will |
34 | * throttle "unloved", but nobody else. |
35 | * |
36 | * In this example "fast", "slow", and "normal" will be the only groups actually |
37 | * accounting their io latencies. We have to walk up the hierarchy to the root |
38 | * on every submit and complete so we can do the appropriate stat recording and |
39 | * adjust the queue depth of ourselves if needed. |
40 | * |
41 | * There are 2 ways we throttle IO. |
42 | * |
43 | * 1) Queue depth throttling. As we throttle down we will adjust the maximum |
44 | * number of IO's we're allowed to have in flight. This starts at UINT_MAX down |
45 | * to 1. If the group is only ever submitting IO for itself then this is the |
46 | * only way we throttle. |
47 | * |
48 | * 2) Induced delay throttling. This is for the case that a group is generating |
49 | * IO that has to be issued by the root cg to avoid priority inversion. So think |
50 | * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot |
51 | * of work done for us on behalf of the root cg and are being asked to scale |
52 | * down more then we induce a latency at userspace return. We accumulate the |
53 | * total amount of time we need to be punished by doing |
54 | * |
55 | * total_time += min_lat_nsec - actual_io_completion |
56 | * |
57 | * and then at throttle time will do |
58 | * |
59 | * throttle_time = min(total_time, NSEC_PER_SEC) |
60 | * |
61 | * This induced delay will throttle back the activity that is generating the |
62 | * root cg issued io's, whether that's some metadata intensive operation or the |
63 | * group is using so much memory that it is pushing us into swap. |
64 | * |
65 | * Copyright (C) 2018 Josef Bacik |
66 | */ |
67 | #include <linux/kernel.h> |
68 | #include <linux/blk_types.h> |
69 | #include <linux/backing-dev.h> |
70 | #include <linux/module.h> |
71 | #include <linux/timer.h> |
72 | #include <linux/memcontrol.h> |
73 | #include <linux/sched/loadavg.h> |
74 | #include <linux/sched/signal.h> |
75 | #include <trace/events/block.h> |
76 | #include <linux/blk-mq.h> |
77 | #include "blk-rq-qos.h" |
78 | #include "blk-stat.h" |
79 | #include "blk-cgroup.h" |
80 | #include "blk.h" |
81 | |
/*
 * Neutral value of a scale cookie. check_scale_change() removes all
 * throttling from a group once its parent's cookie is back at this value.
 */
#define DEFAULT_SCALE_COOKIE 1000000U

/* Policy handle passed to blkg_to_pd() to find a blkg's iolatency data. */
static struct blkcg_policy blkcg_policy_iolatency;
/* Forward declaration; child_latency_info points at it before it is defined. */
struct iolatency_grp;
86 | |
/*
 * Per-device iolatency state. It embeds the rq_qos that is registered for the
 * disk; BLKIOLATENCY() converts from that rq_qos back to this structure.
 */
struct blk_iolatency {
	struct rq_qos rqos;
	/* Runs blkiolatency_timer_fn(); armed from blkcg_iolatency_throttle(). */
	struct timer_list timer;

	/*
	 * ->enabled is the master enable switch gating the throttling logic and
	 * inflight tracking. The number of cgroups which have iolat enabled is
	 * tracked in ->enable_cnt, and ->enabled is flipped on/off accordingly
	 * from ->enable_work with the request_queue frozen. For details, see
	 * blkiolatency_enable_work_fn().
	 */
	bool enabled;
	atomic_t enable_cnt;
	struct work_struct enable_work;
};
102 | |
103 | static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) |
104 | { |
105 | return container_of(rqos, struct blk_iolatency, rqos); |
106 | } |
107 | |
/*
 * Scaling state that a group keeps for its direct children. Children reach it
 * through their parent's iolatency_grp::child_lat.
 */
struct child_latency_info {
	/* Taken by the paths in this file that update the fields below. */
	spinlock_t lock;

	/* Last time we adjusted the scale of everybody. */
	u64 last_scale_event;

	/* The latency that we missed. */
	u64 scale_lat;

	/* Total io's from all of our children for the last summation. */
	u64 nr_samples;

	/* The guy who actually changed the latency numbers. */
	struct iolatency_grp *scale_grp;

	/* Cookie to tell if we need to scale up or down. */
	atomic_t scale_cookie;
};
126 | |
/*
 * Miss-count statistics, used when iolatency_grp::ssd is set.
 */
struct percentile_stats {
	/* Number of IOs recorded. */
	u64 total;
	/* Recorded IOs whose latency was >= the group's min_lat_nsec. */
	u64 missed;
};
131 | |
/*
 * One set of latency samples. Only one member of the union is in use for a
 * given group: ->ps when iolatency_grp::ssd is set, ->rqs otherwise.
 */
struct latency_stat {
	union {
		struct percentile_stats ps;
		struct blk_rq_stat rqs;
	};
};
138 | |
/*
 * Per-blkg iolatency state; the blkg_policy_data is embedded as ->pd so that
 * pd_to_lat() can recover this structure from it.
 */
struct iolatency_grp {
	struct blkg_policy_data pd;
	/* Per-cpu sample counters, filled by latency_stat_record_time(). */
	struct latency_stat __percpu *stats;
	/* Samples accumulated by iolatency_check_latencies() until it resets them. */
	struct latency_stat cur_stat;
	/* Per-device state this group belongs to. */
	struct blk_iolatency *blkiolat;
	/* Allowed number of in-flight IOs; UINT_MAX means unthrottled. */
	unsigned int max_depth;
	struct rq_wait rq_wait;
	/* Start of the current sampling window, in ns. */
	atomic64_t window_start;
	/* Last value of the parent's scale cookie this group acted on. */
	atomic_t scale_cookie;
	/* Latency target in ns; 0 means no target is configured. */
	u64 min_lat_nsec;
	/* Length of the sampling window in ns. */
	u64 cur_win_nsec;

	/* total running average of our io latency. */
	u64 lat_avg;

	/* Our current number of IO's for the last summation. */
	u64 nr_samples;

	/* Selects miss-count (->ps) accounting instead of mean latency (->rqs). */
	bool ssd;
	/* Scaling state consulted by this group's children. */
	struct child_latency_info child_lat;
};
160 | |
/* Bounds applied to iolatency_grp::cur_win_nsec by iolatency_set_min_lat_nsec(). */
#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
/*
 * These are the constants used to fake the fixed-point moving average
 * calculation just like load average. The call to calc_load() folds
 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling
 * window size is bucketed to try to approximately calculate average
 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
 * elapse immediately. Note, windows only elapse with IO activity. Idle
 * periods extend the most recent window.
 *
 * The bucket index is cur_win_nsec / BLKIOLATENCY_EXP_BUCKET_SIZE, capped at
 * the last entry (see iolat_update_total_lat_avg()). Each entry is
 * FIXED_1 * exp(-1/N) rounded to an integer, N being the sample count noted
 * next to it.
 */
#define BLKIOLATENCY_NR_EXP_FACTORS 5
#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
				      (BLKIOLATENCY_NR_EXP_FACTORS - 1))
static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
	2045, // 2048 * exp(-1/600) - 600 samples
	2039, // 2048 * exp(-1/240) - 240 samples
	2031, // 2048 * exp(-1/120) - 120 samples
	2023, // 2048 * exp(-1/80) - 80 samples
	2014, // 2048 * exp(-1/60) - 60 samples
};
182 | |
183 | static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd) |
184 | { |
185 | return pd ? container_of(pd, struct iolatency_grp, pd) : NULL; |
186 | } |
187 | |
188 | static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg) |
189 | { |
190 | return pd_to_lat(pd: blkg_to_pd(blkg, pol: &blkcg_policy_iolatency)); |
191 | } |
192 | |
193 | static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) |
194 | { |
195 | return pd_to_blkg(pd: &iolat->pd); |
196 | } |
197 | |
198 | static inline void latency_stat_init(struct iolatency_grp *iolat, |
199 | struct latency_stat *stat) |
200 | { |
201 | if (iolat->ssd) { |
202 | stat->ps.total = 0; |
203 | stat->ps.missed = 0; |
204 | } else |
205 | blk_rq_stat_init(&stat->rqs); |
206 | } |
207 | |
208 | static inline void latency_stat_sum(struct iolatency_grp *iolat, |
209 | struct latency_stat *sum, |
210 | struct latency_stat *stat) |
211 | { |
212 | if (iolat->ssd) { |
213 | sum->ps.total += stat->ps.total; |
214 | sum->ps.missed += stat->ps.missed; |
215 | } else |
216 | blk_rq_stat_sum(&sum->rqs, &stat->rqs); |
217 | } |
218 | |
219 | static inline void latency_stat_record_time(struct iolatency_grp *iolat, |
220 | u64 req_time) |
221 | { |
222 | struct latency_stat *stat = get_cpu_ptr(iolat->stats); |
223 | if (iolat->ssd) { |
224 | if (req_time >= iolat->min_lat_nsec) |
225 | stat->ps.missed++; |
226 | stat->ps.total++; |
227 | } else |
228 | blk_rq_stat_add(&stat->rqs, req_time); |
229 | put_cpu_ptr(stat); |
230 | } |
231 | |
232 | static inline bool latency_sum_ok(struct iolatency_grp *iolat, |
233 | struct latency_stat *stat) |
234 | { |
235 | if (iolat->ssd) { |
236 | u64 thresh = div64_u64(dividend: stat->ps.total, divisor: 10); |
237 | thresh = max(thresh, 1ULL); |
238 | return stat->ps.missed < thresh; |
239 | } |
240 | return stat->rqs.mean <= iolat->min_lat_nsec; |
241 | } |
242 | |
243 | static inline u64 latency_stat_samples(struct iolatency_grp *iolat, |
244 | struct latency_stat *stat) |
245 | { |
246 | if (iolat->ssd) |
247 | return stat->ps.total; |
248 | return stat->rqs.nr_samples; |
249 | } |
250 | |
251 | static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, |
252 | struct latency_stat *stat) |
253 | { |
254 | int exp_idx; |
255 | |
256 | if (iolat->ssd) |
257 | return; |
258 | |
259 | /* |
260 | * calc_load() takes in a number stored in fixed point representation. |
261 | * Because we are using this for IO time in ns, the values stored |
262 | * are significantly larger than the FIXED_1 denominator (2048). |
263 | * Therefore, rounding errors in the calculation are negligible and |
264 | * can be ignored. |
265 | */ |
266 | exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, |
267 | div64_u64(iolat->cur_win_nsec, |
268 | BLKIOLATENCY_EXP_BUCKET_SIZE)); |
269 | iolat->lat_avg = calc_load(load: iolat->lat_avg, |
270 | exp: iolatency_exp_factors[exp_idx], |
271 | active: stat->rqs.mean); |
272 | } |
273 | |
274 | static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) |
275 | { |
276 | atomic_dec(v: &rqw->inflight); |
277 | wake_up(&rqw->wait); |
278 | } |
279 | |
280 | static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) |
281 | { |
282 | struct iolatency_grp *iolat = private_data; |
283 | return rq_wait_inc_below(rq_wait: rqw, limit: iolat->max_depth); |
284 | } |
285 | |
286 | static void __blkcg_iolatency_throttle(struct rq_qos *rqos, |
287 | struct iolatency_grp *iolat, |
288 | bool issue_as_root, |
289 | bool use_memdelay) |
290 | { |
291 | struct rq_wait *rqw = &iolat->rq_wait; |
292 | unsigned use_delay = atomic_read(v: &lat_to_blkg(iolat)->use_delay); |
293 | |
294 | if (use_delay) |
295 | blkcg_schedule_throttle(disk: rqos->disk, use_memdelay); |
296 | |
297 | /* |
298 | * To avoid priority inversions we want to just take a slot if we are |
299 | * issuing as root. If we're being killed off there's no point in |
300 | * delaying things, we may have been killed by OOM so throttling may |
301 | * make recovery take even longer, so just let the IO's through so the |
302 | * task can go away. |
303 | */ |
304 | if (issue_as_root || fatal_signal_pending(current)) { |
305 | atomic_inc(v: &rqw->inflight); |
306 | return; |
307 | } |
308 | |
309 | rq_qos_wait(rqw, private_data: iolat, acquire_inflight_cb: iolat_acquire_inflight, cleanup_cb: iolat_cleanup_cb); |
310 | } |
311 | |
/* Shift counts: a step is qd/4 when scaling down and qd/16 when scaling up. */
#define SCALE_DOWN_FACTOR 2
#define SCALE_UP_FACTOR 4

/*
 * Size of one scaling step for a queue depth of @qd in the direction given by
 * @up. The step is never smaller than 1.
 */
static inline unsigned long scale_amount(unsigned long qd, bool up)
{
	unsigned long step;

	if (up)
		step = qd >> SCALE_UP_FACTOR;
	else
		step = qd >> SCALE_DOWN_FACTOR;

	return step ? step : 1UL;
}
319 | |
320 | /* |
321 | * We scale the qd down faster than we scale up, so we need to use this helper |
322 | * to adjust the scale_cookie accordingly so we don't prematurely get |
323 | * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much. |
324 | * |
325 | * Each group has their own local copy of the last scale cookie they saw, so if |
326 | * the global scale cookie goes up or down they know which way they need to go |
327 | * based on their last knowledge of it. |
328 | */ |
329 | static void scale_cookie_change(struct blk_iolatency *blkiolat, |
330 | struct child_latency_info *lat_info, |
331 | bool up) |
332 | { |
333 | unsigned long qd = blkiolat->rqos.disk->queue->nr_requests; |
334 | unsigned long scale = scale_amount(qd, up); |
335 | unsigned long old = atomic_read(v: &lat_info->scale_cookie); |
336 | unsigned long max_scale = qd << 1; |
337 | unsigned long diff = 0; |
338 | |
339 | if (old < DEFAULT_SCALE_COOKIE) |
340 | diff = DEFAULT_SCALE_COOKIE - old; |
341 | |
342 | if (up) { |
343 | if (scale + old > DEFAULT_SCALE_COOKIE) |
344 | atomic_set(v: &lat_info->scale_cookie, |
345 | DEFAULT_SCALE_COOKIE); |
346 | else if (diff > qd) |
347 | atomic_inc(v: &lat_info->scale_cookie); |
348 | else |
349 | atomic_add(i: scale, v: &lat_info->scale_cookie); |
350 | } else { |
351 | /* |
352 | * We don't want to dig a hole so deep that it takes us hours to |
353 | * dig out of it. Just enough that we don't throttle/unthrottle |
354 | * with jagged workloads but can still unthrottle once pressure |
355 | * has sufficiently dissipated. |
356 | */ |
357 | if (diff > qd) { |
358 | if (diff < max_scale) |
359 | atomic_dec(v: &lat_info->scale_cookie); |
360 | } else { |
361 | atomic_sub(i: scale, v: &lat_info->scale_cookie); |
362 | } |
363 | } |
364 | } |
365 | |
366 | /* |
367 | * Change the queue depth of the iolatency_grp. We add 1/16th of the |
368 | * queue depth at a time so we don't get wild swings and hopefully dial in to |
369 | * fairer distribution of the overall queue depth. We halve the queue depth |
370 | * at a time so we can scale down queue depth quickly from default unlimited |
371 | * to target. |
372 | */ |
373 | static void scale_change(struct iolatency_grp *iolat, bool up) |
374 | { |
375 | unsigned long qd = iolat->blkiolat->rqos.disk->queue->nr_requests; |
376 | unsigned long scale = scale_amount(qd, up); |
377 | unsigned long old = iolat->max_depth; |
378 | |
379 | if (old > qd) |
380 | old = qd; |
381 | |
382 | if (up) { |
383 | if (old == 1 && blkcg_unuse_delay(blkg: lat_to_blkg(iolat))) |
384 | return; |
385 | |
386 | if (old < qd) { |
387 | old += scale; |
388 | old = min(old, qd); |
389 | iolat->max_depth = old; |
390 | wake_up_all(&iolat->rq_wait.wait); |
391 | } |
392 | } else { |
393 | old >>= 1; |
394 | iolat->max_depth = max(old, 1UL); |
395 | } |
396 | } |
397 | |
398 | /* Check our parent and see if the scale cookie has changed. */ |
399 | static void check_scale_change(struct iolatency_grp *iolat) |
400 | { |
401 | struct iolatency_grp *parent; |
402 | struct child_latency_info *lat_info; |
403 | unsigned int cur_cookie; |
404 | unsigned int our_cookie = atomic_read(v: &iolat->scale_cookie); |
405 | u64 scale_lat; |
406 | int direction = 0; |
407 | |
408 | parent = blkg_to_lat(blkg: lat_to_blkg(iolat)->parent); |
409 | if (!parent) |
410 | return; |
411 | |
412 | lat_info = &parent->child_lat; |
413 | cur_cookie = atomic_read(v: &lat_info->scale_cookie); |
414 | scale_lat = READ_ONCE(lat_info->scale_lat); |
415 | |
416 | if (cur_cookie < our_cookie) |
417 | direction = -1; |
418 | else if (cur_cookie > our_cookie) |
419 | direction = 1; |
420 | else |
421 | return; |
422 | |
423 | if (!atomic_try_cmpxchg(v: &iolat->scale_cookie, old: &our_cookie, new: cur_cookie)) { |
424 | /* Somebody beat us to the punch, just bail. */ |
425 | return; |
426 | } |
427 | |
428 | if (direction < 0 && iolat->min_lat_nsec) { |
429 | u64 samples_thresh; |
430 | |
431 | if (!scale_lat || iolat->min_lat_nsec <= scale_lat) |
432 | return; |
433 | |
434 | /* |
435 | * Sometimes high priority groups are their own worst enemy, so |
436 | * instead of taking it out on some poor other group that did 5% |
437 | * or less of the IO's for the last summation just skip this |
438 | * scale down event. |
439 | */ |
440 | samples_thresh = lat_info->nr_samples * 5; |
441 | samples_thresh = max(1ULL, div64_u64(samples_thresh, 100)); |
442 | if (iolat->nr_samples <= samples_thresh) |
443 | return; |
444 | } |
445 | |
446 | /* We're as low as we can go. */ |
447 | if (iolat->max_depth == 1 && direction < 0) { |
448 | blkcg_use_delay(blkg: lat_to_blkg(iolat)); |
449 | return; |
450 | } |
451 | |
452 | /* We're back to the default cookie, unthrottle all the things. */ |
453 | if (cur_cookie == DEFAULT_SCALE_COOKIE) { |
454 | blkcg_clear_delay(blkg: lat_to_blkg(iolat)); |
455 | iolat->max_depth = UINT_MAX; |
456 | wake_up_all(&iolat->rq_wait.wait); |
457 | return; |
458 | } |
459 | |
460 | scale_change(iolat, up: direction > 0); |
461 | } |
462 | |
463 | static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) |
464 | { |
465 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); |
466 | struct blkcg_gq *blkg = bio->bi_blkg; |
467 | bool issue_as_root = bio_issue_as_root_blkg(bio); |
468 | |
469 | if (!blkiolat->enabled) |
470 | return; |
471 | |
472 | while (blkg && blkg->parent) { |
473 | struct iolatency_grp *iolat = blkg_to_lat(blkg); |
474 | if (!iolat) { |
475 | blkg = blkg->parent; |
476 | continue; |
477 | } |
478 | |
479 | check_scale_change(iolat); |
480 | __blkcg_iolatency_throttle(rqos, iolat, issue_as_root, |
481 | use_memdelay: (bio->bi_opf & REQ_SWAP) == REQ_SWAP); |
482 | blkg = blkg->parent; |
483 | } |
484 | if (!timer_pending(timer: &blkiolat->timer)) |
485 | mod_timer(timer: &blkiolat->timer, expires: jiffies + HZ); |
486 | } |
487 | |
488 | static void iolatency_record_time(struct iolatency_grp *iolat, |
489 | struct bio_issue *issue, u64 now, |
490 | bool issue_as_root) |
491 | { |
492 | u64 start = bio_issue_time(issue); |
493 | u64 req_time; |
494 | |
495 | /* |
496 | * Have to do this so we are truncated to the correct time that our |
497 | * issue is truncated to. |
498 | */ |
499 | now = __bio_issue_time(time: now); |
500 | |
501 | if (now <= start) |
502 | return; |
503 | |
504 | req_time = now - start; |
505 | |
506 | /* |
507 | * We don't want to count issue_as_root bio's in the cgroups latency |
508 | * statistics as it could skew the numbers downwards. |
509 | */ |
510 | if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) { |
511 | u64 sub = iolat->min_lat_nsec; |
512 | if (req_time < sub) |
513 | blkcg_add_delay(blkg: lat_to_blkg(iolat), now, delta: sub - req_time); |
514 | return; |
515 | } |
516 | |
517 | latency_stat_record_time(iolat, req_time); |
518 | } |
519 | |
520 | #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) |
521 | #define BLKIOLATENCY_MIN_GOOD_SAMPLES 5 |
522 | |
523 | static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) |
524 | { |
525 | struct blkcg_gq *blkg = lat_to_blkg(iolat); |
526 | struct iolatency_grp *parent; |
527 | struct child_latency_info *lat_info; |
528 | struct latency_stat stat; |
529 | unsigned long flags; |
530 | int cpu; |
531 | |
532 | latency_stat_init(iolat, stat: &stat); |
533 | preempt_disable(); |
534 | for_each_online_cpu(cpu) { |
535 | struct latency_stat *s; |
536 | s = per_cpu_ptr(iolat->stats, cpu); |
537 | latency_stat_sum(iolat, sum: &stat, stat: s); |
538 | latency_stat_init(iolat, stat: s); |
539 | } |
540 | preempt_enable(); |
541 | |
542 | parent = blkg_to_lat(blkg: blkg->parent); |
543 | if (!parent) |
544 | return; |
545 | |
546 | lat_info = &parent->child_lat; |
547 | |
548 | iolat_update_total_lat_avg(iolat, stat: &stat); |
549 | |
550 | /* Everything is ok and we don't need to adjust the scale. */ |
551 | if (latency_sum_ok(iolat, stat: &stat) && |
552 | atomic_read(v: &lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) |
553 | return; |
554 | |
555 | /* Somebody beat us to the punch, just bail. */ |
556 | spin_lock_irqsave(&lat_info->lock, flags); |
557 | |
558 | latency_stat_sum(iolat, sum: &iolat->cur_stat, stat: &stat); |
559 | lat_info->nr_samples -= iolat->nr_samples; |
560 | lat_info->nr_samples += latency_stat_samples(iolat, stat: &iolat->cur_stat); |
561 | iolat->nr_samples = latency_stat_samples(iolat, stat: &iolat->cur_stat); |
562 | |
563 | if ((lat_info->last_scale_event >= now || |
564 | now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME)) |
565 | goto out; |
566 | |
567 | if (latency_sum_ok(iolat, stat: &iolat->cur_stat) && |
568 | latency_sum_ok(iolat, stat: &stat)) { |
569 | if (latency_stat_samples(iolat, stat: &iolat->cur_stat) < |
570 | BLKIOLATENCY_MIN_GOOD_SAMPLES) |
571 | goto out; |
572 | if (lat_info->scale_grp == iolat) { |
573 | lat_info->last_scale_event = now; |
574 | scale_cookie_change(blkiolat: iolat->blkiolat, lat_info, up: true); |
575 | } |
576 | } else if (lat_info->scale_lat == 0 || |
577 | lat_info->scale_lat >= iolat->min_lat_nsec) { |
578 | lat_info->last_scale_event = now; |
579 | if (!lat_info->scale_grp || |
580 | lat_info->scale_lat > iolat->min_lat_nsec) { |
581 | WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec); |
582 | lat_info->scale_grp = iolat; |
583 | } |
584 | scale_cookie_change(blkiolat: iolat->blkiolat, lat_info, up: false); |
585 | } |
586 | latency_stat_init(iolat, stat: &iolat->cur_stat); |
587 | out: |
588 | spin_unlock_irqrestore(lock: &lat_info->lock, flags); |
589 | } |
590 | |
591 | static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) |
592 | { |
593 | struct blkcg_gq *blkg; |
594 | struct rq_wait *rqw; |
595 | struct iolatency_grp *iolat; |
596 | u64 window_start; |
597 | u64 now; |
598 | bool issue_as_root = bio_issue_as_root_blkg(bio); |
599 | int inflight = 0; |
600 | |
601 | blkg = bio->bi_blkg; |
602 | if (!blkg || !bio_flagged(bio, bit: BIO_QOS_THROTTLED)) |
603 | return; |
604 | |
605 | iolat = blkg_to_lat(blkg: bio->bi_blkg); |
606 | if (!iolat) |
607 | return; |
608 | |
609 | if (!iolat->blkiolat->enabled) |
610 | return; |
611 | |
612 | now = ktime_to_ns(kt: ktime_get()); |
613 | while (blkg && blkg->parent) { |
614 | iolat = blkg_to_lat(blkg); |
615 | if (!iolat) { |
616 | blkg = blkg->parent; |
617 | continue; |
618 | } |
619 | rqw = &iolat->rq_wait; |
620 | |
621 | inflight = atomic_dec_return(v: &rqw->inflight); |
622 | WARN_ON_ONCE(inflight < 0); |
623 | /* |
624 | * If bi_status is BLK_STS_AGAIN, the bio wasn't actually |
625 | * submitted, so do not account for it. |
626 | */ |
627 | if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { |
628 | iolatency_record_time(iolat, issue: &bio->bi_issue, now, |
629 | issue_as_root); |
630 | window_start = atomic64_read(v: &iolat->window_start); |
631 | if (now > window_start && |
632 | (now - window_start) >= iolat->cur_win_nsec) { |
633 | if (atomic64_try_cmpxchg(v: &iolat->window_start, |
634 | old: &window_start, new: now)) |
635 | iolatency_check_latencies(iolat, now); |
636 | } |
637 | } |
638 | wake_up(&rqw->wait); |
639 | blkg = blkg->parent; |
640 | } |
641 | } |
642 | |
643 | static void blkcg_iolatency_exit(struct rq_qos *rqos) |
644 | { |
645 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); |
646 | |
647 | timer_shutdown_sync(timer: &blkiolat->timer); |
648 | flush_work(work: &blkiolat->enable_work); |
649 | blkcg_deactivate_policy(disk: rqos->disk, pol: &blkcg_policy_iolatency); |
650 | kfree(objp: blkiolat); |
651 | } |
652 | |
/* rq_qos callbacks, registered for a disk by blk_iolatency_init(). */
static const struct rq_qos_ops blkcg_iolatency_ops = {
	.throttle = blkcg_iolatency_throttle,
	.done_bio = blkcg_iolatency_done_bio,
	.exit = blkcg_iolatency_exit,
};
658 | |
659 | static void blkiolatency_timer_fn(struct timer_list *t) |
660 | { |
661 | struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); |
662 | struct blkcg_gq *blkg; |
663 | struct cgroup_subsys_state *pos_css; |
664 | u64 now = ktime_to_ns(kt: ktime_get()); |
665 | |
666 | rcu_read_lock(); |
667 | blkg_for_each_descendant_pre(blkg, pos_css, |
668 | blkiolat->rqos.disk->queue->root_blkg) { |
669 | struct iolatency_grp *iolat; |
670 | struct child_latency_info *lat_info; |
671 | unsigned long flags; |
672 | u64 cookie; |
673 | |
674 | /* |
675 | * We could be exiting, don't access the pd unless we have a |
676 | * ref on the blkg. |
677 | */ |
678 | if (!blkg_tryget(blkg)) |
679 | continue; |
680 | |
681 | iolat = blkg_to_lat(blkg); |
682 | if (!iolat) |
683 | goto next; |
684 | |
685 | lat_info = &iolat->child_lat; |
686 | cookie = atomic_read(v: &lat_info->scale_cookie); |
687 | |
688 | if (cookie >= DEFAULT_SCALE_COOKIE) |
689 | goto next; |
690 | |
691 | spin_lock_irqsave(&lat_info->lock, flags); |
692 | if (lat_info->last_scale_event >= now) |
693 | goto next_lock; |
694 | |
695 | /* |
696 | * We scaled down but don't have a scale_grp, scale up and carry |
697 | * on. |
698 | */ |
699 | if (lat_info->scale_grp == NULL) { |
700 | scale_cookie_change(blkiolat: iolat->blkiolat, lat_info, up: true); |
701 | goto next_lock; |
702 | } |
703 | |
704 | /* |
705 | * It's been 5 seconds since our last scale event, clear the |
706 | * scale grp in case the group that needed the scale down isn't |
707 | * doing any IO currently. |
708 | */ |
709 | if (now - lat_info->last_scale_event >= |
710 | ((u64)NSEC_PER_SEC * 5)) |
711 | lat_info->scale_grp = NULL; |
712 | next_lock: |
713 | spin_unlock_irqrestore(lock: &lat_info->lock, flags); |
714 | next: |
715 | blkg_put(blkg); |
716 | } |
717 | rcu_read_unlock(); |
718 | } |
719 | |
720 | /** |
721 | * blkiolatency_enable_work_fn - Enable or disable iolatency on the device |
722 | * @work: enable_work of the blk_iolatency of interest |
723 | * |
724 | * iolatency needs to keep track of the number of in-flight IOs per cgroup. This |
725 | * is relatively expensive as it involves walking up the hierarchy twice for |
726 | * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we |
727 | * want to disable the in-flight tracking. |
728 | * |
729 | * We have to make sure that the counting is balanced - we don't want to leak |
730 | * the in-flight counts by disabling accounting in the completion path while IOs |
731 | * are in flight. This is achieved by ensuring that no IO is in flight by |
732 | * freezing the queue while flipping ->enabled. As this requires a sleepable |
733 | * context, ->enabled flipping is punted to this work function. |
734 | */ |
735 | static void blkiolatency_enable_work_fn(struct work_struct *work) |
736 | { |
737 | struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency, |
738 | enable_work); |
739 | bool enabled; |
740 | |
741 | /* |
742 | * There can only be one instance of this function running for @blkiolat |
743 | * and it's guaranteed to be executed at least once after the latest |
744 | * ->enabled_cnt modification. Acting on the latest ->enable_cnt is |
745 | * sufficient. |
746 | * |
747 | * Also, we know @blkiolat is safe to access as ->enable_work is flushed |
748 | * in blkcg_iolatency_exit(). |
749 | */ |
750 | enabled = atomic_read(v: &blkiolat->enable_cnt); |
751 | if (enabled != blkiolat->enabled) { |
752 | blk_mq_freeze_queue(q: blkiolat->rqos.disk->queue); |
753 | blkiolat->enabled = enabled; |
754 | blk_mq_unfreeze_queue(q: blkiolat->rqos.disk->queue); |
755 | } |
756 | } |
757 | |
758 | static int blk_iolatency_init(struct gendisk *disk) |
759 | { |
760 | struct blk_iolatency *blkiolat; |
761 | int ret; |
762 | |
763 | blkiolat = kzalloc(size: sizeof(*blkiolat), GFP_KERNEL); |
764 | if (!blkiolat) |
765 | return -ENOMEM; |
766 | |
767 | ret = rq_qos_add(rqos: &blkiolat->rqos, disk, id: RQ_QOS_LATENCY, |
768 | ops: &blkcg_iolatency_ops); |
769 | if (ret) |
770 | goto err_free; |
771 | ret = blkcg_activate_policy(disk, pol: &blkcg_policy_iolatency); |
772 | if (ret) |
773 | goto err_qos_del; |
774 | |
775 | timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); |
776 | INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); |
777 | |
778 | return 0; |
779 | |
780 | err_qos_del: |
781 | rq_qos_del(rqos: &blkiolat->rqos); |
782 | err_free: |
783 | kfree(objp: blkiolat); |
784 | return ret; |
785 | } |
786 | |
787 | static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) |
788 | { |
789 | struct iolatency_grp *iolat = blkg_to_lat(blkg); |
790 | struct blk_iolatency *blkiolat = iolat->blkiolat; |
791 | u64 oldval = iolat->min_lat_nsec; |
792 | |
793 | iolat->min_lat_nsec = val; |
794 | iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE); |
795 | iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, |
796 | BLKIOLATENCY_MAX_WIN_SIZE); |
797 | |
798 | if (!oldval && val) { |
799 | if (atomic_inc_return(v: &blkiolat->enable_cnt) == 1) |
800 | schedule_work(work: &blkiolat->enable_work); |
801 | } |
802 | if (oldval && !val) { |
803 | blkcg_clear_delay(blkg); |
804 | if (atomic_dec_return(v: &blkiolat->enable_cnt) == 0) |
805 | schedule_work(work: &blkiolat->enable_work); |
806 | } |
807 | } |
808 | |
809 | static void iolatency_clear_scaling(struct blkcg_gq *blkg) |
810 | { |
811 | if (blkg->parent) { |
812 | struct iolatency_grp *iolat = blkg_to_lat(blkg: blkg->parent); |
813 | struct child_latency_info *lat_info; |
814 | if (!iolat) |
815 | return; |
816 | |
817 | lat_info = &iolat->child_lat; |
818 | spin_lock(lock: &lat_info->lock); |
819 | atomic_set(v: &lat_info->scale_cookie, DEFAULT_SCALE_COOKIE); |
820 | lat_info->last_scale_event = 0; |
821 | lat_info->scale_grp = NULL; |
822 | lat_info->scale_lat = 0; |
823 | spin_unlock(lock: &lat_info->lock); |
824 | } |
825 | } |
826 | |
827 | static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, |
828 | size_t nbytes, loff_t off) |
829 | { |
830 | struct blkcg *blkcg = css_to_blkcg(css: of_css(of)); |
831 | struct blkcg_gq *blkg; |
832 | struct blkg_conf_ctx ctx; |
833 | struct iolatency_grp *iolat; |
834 | char *p, *tok; |
835 | u64 lat_val = 0; |
836 | u64 oldval; |
837 | int ret; |
838 | |
839 | blkg_conf_init(ctx: &ctx, input: buf); |
840 | |
841 | ret = blkg_conf_open_bdev(ctx: &ctx); |
842 | if (ret) |
843 | goto out; |
844 | |
845 | /* |
846 | * blk_iolatency_init() may fail after rq_qos_add() succeeds which can |
847 | * confuse iolat_rq_qos() test. Make the test and init atomic. |
848 | */ |
849 | lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex); |
850 | if (!iolat_rq_qos(q: ctx.bdev->bd_queue)) |
851 | ret = blk_iolatency_init(disk: ctx.bdev->bd_disk); |
852 | if (ret) |
853 | goto out; |
854 | |
855 | ret = blkg_conf_prep(blkcg, pol: &blkcg_policy_iolatency, ctx: &ctx); |
856 | if (ret) |
857 | goto out; |
858 | |
859 | iolat = blkg_to_lat(blkg: ctx.blkg); |
860 | p = ctx.body; |
861 | |
862 | ret = -EINVAL; |
863 | while ((tok = strsep(&p, " " ))) { |
864 | char key[16]; |
865 | char val[21]; /* 18446744073709551616 */ |
866 | |
867 | if (sscanf(tok, "%15[^=]=%20s" , key, val) != 2) |
868 | goto out; |
869 | |
870 | if (!strcmp(key, "target" )) { |
871 | u64 v; |
872 | |
873 | if (!strcmp(val, "max" )) |
874 | lat_val = 0; |
875 | else if (sscanf(val, "%llu" , &v) == 1) |
876 | lat_val = v * NSEC_PER_USEC; |
877 | else |
878 | goto out; |
879 | } else { |
880 | goto out; |
881 | } |
882 | } |
883 | |
884 | /* Walk up the tree to see if our new val is lower than it should be. */ |
885 | blkg = ctx.blkg; |
886 | oldval = iolat->min_lat_nsec; |
887 | |
888 | iolatency_set_min_lat_nsec(blkg, val: lat_val); |
889 | if (oldval != iolat->min_lat_nsec) |
890 | iolatency_clear_scaling(blkg); |
891 | ret = 0; |
892 | out: |
893 | blkg_conf_exit(ctx: &ctx); |
894 | return ret ?: nbytes; |
895 | } |
896 | |
897 | static u64 iolatency_prfill_limit(struct seq_file *sf, |
898 | struct blkg_policy_data *pd, int off) |
899 | { |
900 | struct iolatency_grp *iolat = pd_to_lat(pd); |
901 | const char *dname = blkg_dev_name(blkg: pd->blkg); |
902 | |
903 | if (!dname || !iolat->min_lat_nsec) |
904 | return 0; |
905 | seq_printf(m: sf, fmt: "%s target=%llu\n" , |
906 | dname, div_u64(dividend: iolat->min_lat_nsec, NSEC_PER_USEC)); |
907 | return 0; |
908 | } |
909 | |
910 | static int iolatency_print_limit(struct seq_file *sf, void *v) |
911 | { |
912 | blkcg_print_blkgs(sf, blkcg: css_to_blkcg(css: seq_css(seq: sf)), |
913 | prfill: iolatency_prfill_limit, |
914 | pol: &blkcg_policy_iolatency, data: seq_cft(seq: sf)->private, show_total: false); |
915 | return 0; |
916 | } |
917 | |
918 | static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) |
919 | { |
920 | struct latency_stat stat; |
921 | int cpu; |
922 | |
923 | latency_stat_init(iolat, stat: &stat); |
924 | preempt_disable(); |
925 | for_each_online_cpu(cpu) { |
926 | struct latency_stat *s; |
927 | s = per_cpu_ptr(iolat->stats, cpu); |
928 | latency_stat_sum(iolat, sum: &stat, stat: s); |
929 | } |
930 | preempt_enable(); |
931 | |
932 | if (iolat->max_depth == UINT_MAX) |
933 | seq_printf(m: s, fmt: " missed=%llu total=%llu depth=max" , |
934 | (unsigned long long)stat.ps.missed, |
935 | (unsigned long long)stat.ps.total); |
936 | else |
937 | seq_printf(m: s, fmt: " missed=%llu total=%llu depth=%u" , |
938 | (unsigned long long)stat.ps.missed, |
939 | (unsigned long long)stat.ps.total, |
940 | iolat->max_depth); |
941 | } |
942 | |
943 | static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) |
944 | { |
945 | struct iolatency_grp *iolat = pd_to_lat(pd); |
946 | unsigned long long avg_lat; |
947 | unsigned long long cur_win; |
948 | |
949 | if (!blkcg_debug_stats) |
950 | return; |
951 | |
952 | if (iolat->ssd) |
953 | return iolatency_ssd_stat(iolat, s); |
954 | |
955 | avg_lat = div64_u64(dividend: iolat->lat_avg, NSEC_PER_USEC); |
956 | cur_win = div64_u64(dividend: iolat->cur_win_nsec, NSEC_PER_MSEC); |
957 | if (iolat->max_depth == UINT_MAX) |
958 | seq_printf(m: s, fmt: " depth=max avg_lat=%llu win=%llu" , |
959 | avg_lat, cur_win); |
960 | else |
961 | seq_printf(m: s, fmt: " depth=%u avg_lat=%llu win=%llu" , |
962 | iolat->max_depth, avg_lat, cur_win); |
963 | } |
964 | |
965 | static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk, |
966 | struct blkcg *blkcg, gfp_t gfp) |
967 | { |
968 | struct iolatency_grp *iolat; |
969 | |
970 | iolat = kzalloc_node(size: sizeof(*iolat), flags: gfp, node: disk->node_id); |
971 | if (!iolat) |
972 | return NULL; |
973 | iolat->stats = __alloc_percpu_gfp(size: sizeof(struct latency_stat), |
974 | align: __alignof__(struct latency_stat), gfp); |
975 | if (!iolat->stats) { |
976 | kfree(objp: iolat); |
977 | return NULL; |
978 | } |
979 | return &iolat->pd; |
980 | } |
981 | |
982 | static void iolatency_pd_init(struct blkg_policy_data *pd) |
983 | { |
984 | struct iolatency_grp *iolat = pd_to_lat(pd); |
985 | struct blkcg_gq *blkg = lat_to_blkg(iolat); |
986 | struct rq_qos *rqos = iolat_rq_qos(q: blkg->q); |
987 | struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); |
988 | u64 now = ktime_to_ns(kt: ktime_get()); |
989 | int cpu; |
990 | |
991 | if (blk_queue_nonrot(blkg->q)) |
992 | iolat->ssd = true; |
993 | else |
994 | iolat->ssd = false; |
995 | |
996 | for_each_possible_cpu(cpu) { |
997 | struct latency_stat *stat; |
998 | stat = per_cpu_ptr(iolat->stats, cpu); |
999 | latency_stat_init(iolat, stat); |
1000 | } |
1001 | |
1002 | latency_stat_init(iolat, stat: &iolat->cur_stat); |
1003 | rq_wait_init(rq_wait: &iolat->rq_wait); |
1004 | spin_lock_init(&iolat->child_lat.lock); |
1005 | iolat->max_depth = UINT_MAX; |
1006 | iolat->blkiolat = blkiolat; |
1007 | iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; |
1008 | atomic64_set(v: &iolat->window_start, i: now); |
1009 | |
1010 | /* |
1011 | * We init things in list order, so the pd for the parent may not be |
1012 | * init'ed yet for whatever reason. |
1013 | */ |
1014 | if (blkg->parent && blkg_to_pd(blkg: blkg->parent, pol: &blkcg_policy_iolatency)) { |
1015 | struct iolatency_grp *parent = blkg_to_lat(blkg: blkg->parent); |
1016 | atomic_set(v: &iolat->scale_cookie, |
1017 | i: atomic_read(v: &parent->child_lat.scale_cookie)); |
1018 | } else { |
1019 | atomic_set(v: &iolat->scale_cookie, DEFAULT_SCALE_COOKIE); |
1020 | } |
1021 | |
1022 | atomic_set(v: &iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE); |
1023 | } |
1024 | |
/*
 * pd_offline_fn callback: reset the group's minimum latency to zero and
 * then clear its scaling state.
 */
static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
	struct blkcg_gq *blkg = lat_to_blkg(pd_to_lat(pd));

	iolatency_set_min_lat_nsec(blkg, 0);
	iolatency_clear_scaling(blkg);
}
1033 | |
1034 | static void iolatency_pd_free(struct blkg_policy_data *pd) |
1035 | { |
1036 | struct iolatency_grp *iolat = pd_to_lat(pd); |
1037 | free_percpu(pdata: iolat->stats); |
1038 | kfree(objp: iolat); |
1039 | } |
1040 | |
1041 | static struct cftype iolatency_files[] = { |
1042 | { |
1043 | .name = "latency" , |
1044 | .flags = CFTYPE_NOT_ON_ROOT, |
1045 | .seq_show = iolatency_print_limit, |
1046 | .write = iolatency_set_limit, |
1047 | }, |
1048 | {} |
1049 | }; |
1050 | |
1051 | static struct blkcg_policy blkcg_policy_iolatency = { |
1052 | .dfl_cftypes = iolatency_files, |
1053 | .pd_alloc_fn = iolatency_pd_alloc, |
1054 | .pd_init_fn = iolatency_pd_init, |
1055 | .pd_offline_fn = iolatency_pd_offline, |
1056 | .pd_free_fn = iolatency_pd_free, |
1057 | .pd_stat_fn = iolatency_pd_stat, |
1058 | }; |
1059 | |
1060 | static int __init iolatency_init(void) |
1061 | { |
1062 | return blkcg_policy_register(pol: &blkcg_policy_iolatency); |
1063 | } |
1064 | |
1065 | static void __exit iolatency_exit(void) |
1066 | { |
1067 | blkcg_policy_unregister(pol: &blkcg_policy_iolatency); |
1068 | } |
1069 | |
/* Hook policy registration and unregistration into module load/unload. */
module_init(iolatency_init);
module_exit(iolatency_exit);
1072 | |