// SPDX-License-Identifier: GPL-2.0
/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kyber.h>

/*
 * Scheduling domains: the device is divided into multiple domains based on the
 * request type.
 */
enum {
	KYBER_READ,
	KYBER_WRITE,
	KYBER_DISCARD,
	KYBER_OTHER,
	KYBER_NUM_DOMAINS,
};

static const char *kyber_domain_names[] = {
	[KYBER_READ] = "READ",
	[KYBER_WRITE] = "WRITE",
	[KYBER_DISCARD] = "DISCARD",
	[KYBER_OTHER] = "OTHER",
};

enum {
	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};

/*
 * Maximum device-wide depth for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate the
 * device with only a fraction of the maximum possible queue depth. So, we cap
 * these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_WRITE] = 128,
	[KYBER_DISCARD] = 64,
	[KYBER_OTHER] = 16,
};

/*
 * Default latency targets for each scheduling domain.
 */
static const u64 kyber_latency_targets[] = {
	[KYBER_READ] = 2ULL * NSEC_PER_MSEC,
	[KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
	[KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
};

/*
 * Batch size (number of requests we'll dispatch in a row) for each scheduling
 * domain.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_WRITE] = 8,
	[KYBER_DISCARD] = 1,
	[KYBER_OTHER] = 1,
};

/*
 * Request latencies are recorded in a histogram with buckets defined relative
 * to the target latency:
 *
 * <= 1/4 * target latency
 * <= 1/2 * target latency
 * <= 3/4 * target latency
 * <= target latency
 * <= 1 1/4 * target latency
 * <= 1 1/2 * target latency
 * <= 1 3/4 * target latency
 * > 1 3/4 * target latency
 */
enum {
	/*
	 * The width of the latency histogram buckets is
	 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
	 */
	KYBER_LATENCY_SHIFT = 2,
	/*
	 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
	 * thus, "good".
	 */
	KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
	/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
	KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
};

/*
 * We measure both the total latency and the I/O latency (i.e., latency after
 * submitting to the device).
 */
enum {
	KYBER_TOTAL_LATENCY,
	KYBER_IO_LATENCY,
};

static const char *kyber_latency_type_names[] = {
	[KYBER_TOTAL_LATENCY] = "total",
	[KYBER_IO_LATENCY] = "I/O",
};

/*
 * Per-cpu latency histograms: total latency and I/O latency for each scheduling
 * domain except for KYBER_OTHER.
 */
struct kyber_cpu_latency {
	atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
};

/*
 * There is a one-to-one mapping between ctx & hctx and between kcq & khd; we
 * use request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
	/*
	 * Used to ensure that operations on rq_list and kcq_map are atomic.
	 * Also protects the requests on rq_list during merges.
	 */
	spinlock_t lock;
	struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;

struct kyber_queue_data {
	struct request_queue *q;
	dev_t dev;

	/*
	 * Each scheduling domain has a limited number of in-flight requests
	 * device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	struct kyber_cpu_latency __percpu *cpu_latency;

	/* Timer for stats aggregation and adjusting domain tokens. */
	struct timer_list timer;

	unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];

	unsigned long latency_timeout[KYBER_OTHER];

	int domain_p99[KYBER_OTHER];

	/* Target latencies in nanoseconds. */
	u64 latency_targets[KYBER_OTHER];
};

struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	struct kyber_ctx_queue *kcqs;
	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
	struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key);

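/*
 * Map an operation to its scheduling domain. Reads, writes, and discards get
 * dedicated domains; everything else (e.g. flushes) falls into KYBER_OTHER.
 */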
static unsigned int kyber_sched_domain(blk_opf_t opf)
{
	switch (opf & REQ_OP_MASK) {
	case REQ_OP_READ:
		return KYBER_READ;
	case REQ_OP_WRITE:
		return KYBER_WRITE;
	case REQ_OP_DISCARD:
		return KYBER_DISCARD;
	default:
		return KYBER_OTHER;
	}
}

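/*
 * Fold one CPU's histogram for a (domain, latency type) pair into the
 * device-wide histogram, zeroing the per-cpu counters as we go.
 */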
static void flush_latency_buckets(struct kyber_queue_data *kqd,
				  struct kyber_cpu_latency *cpu_latency,
				  unsigned int sched_domain, unsigned int type)
{
	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
	atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
	unsigned int bucket;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
}

/*
 * Calculate the histogram bucket with the given percentile rank, or -1 if there
 * aren't enough samples yet.
 */
static int calculate_percentile(struct kyber_queue_data *kqd,
				unsigned int sched_domain, unsigned int type,
				unsigned int percentile)
{
	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
	unsigned int bucket, samples = 0, percentile_samples;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		samples += buckets[bucket];

	if (!samples)
		return -1;

	/*
	 * We do the calculation once we have 500 samples or one second passes
	 * since the first sample was recorded, whichever comes first.
	 */
	if (!kqd->latency_timeout[sched_domain])
		kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
	if (samples < 500 &&
	    time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
		return -1;
	}
	kqd->latency_timeout[sched_domain] = 0;

	percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
		if (buckets[bucket] >= percentile_samples)
			break;
		percentile_samples -= buckets[bucket];
	}
	memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));

	trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain],
			    kyber_latency_type_names[type], percentile,
			    bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);

	return bucket;
}

static void kyber_resize_domain(struct kyber_queue_data *kqd,
				unsigned int sched_domain, unsigned int depth)
{
	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
		trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain],
				   depth);
	}
}

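/*
 * Timer callback: aggregate the per-cpu latency histograms, detect congestion
 * from the p90 I/O latency, and resize the domain token depths accordingly.
 */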
static void kyber_timer_fn(struct timer_list *t)
{
	struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
	unsigned int sched_domain;
	int cpu;
	bool bad = false;

	/* Sum all of the per-cpu latency histograms. */
	for_each_online_cpu(cpu) {
		struct kyber_cpu_latency *cpu_latency;

		cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
		for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
			flush_latency_buckets(kqd, cpu_latency, sched_domain,
					      KYBER_TOTAL_LATENCY);
			flush_latency_buckets(kqd, cpu_latency, sched_domain,
					      KYBER_IO_LATENCY);
		}
	}

	/*
	 * Check if any domains have a high I/O latency, which might indicate
	 * congestion in the device. Note that we use the p90; we don't want to
	 * be too sensitive to outliers here.
	 */
	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
		int p90;

		p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
					   90);
		if (p90 >= KYBER_GOOD_BUCKETS)
			bad = true;
	}

	/*
	 * Adjust the scheduling domain depths. If we determined that there was
	 * congestion, we throttle all domains with good latencies. Either way,
	 * we ease up on throttling domains with bad latencies.
	 */
	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
		unsigned int orig_depth, depth;
		int p99;

		p99 = calculate_percentile(kqd, sched_domain,
					   KYBER_TOTAL_LATENCY, 99);
		/*
		 * This is kind of subtle: different domains will not
		 * necessarily have enough samples to calculate the latency
		 * percentiles during the same window, so we have to remember
		 * the p99 for the next time we observe congestion; once we do,
		 * we don't want to throttle again until we get more data, so we
		 * reset it to -1.
		 */
		if (bad) {
			if (p99 < 0)
				p99 = kqd->domain_p99[sched_domain];
			kqd->domain_p99[sched_domain] = -1;
		} else if (p99 >= 0) {
			kqd->domain_p99[sched_domain] = p99;
		}
		if (p99 < 0)
			continue;

		/*
		 * If this domain has bad latency, throttle less. Otherwise,
		 * throttle more iff we determined that there is congestion.
		 *
		 * The new depth is scaled linearly with the p99 latency vs the
		 * latency target. E.g., if the p99 is 3/4 of the target, then
		 * we throttle down to 3/4 of the current depth, and if the p99
		 * is 2x the target, then we double the depth.
		 */
		if (bad || p99 >= KYBER_GOOD_BUCKETS) {
			orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
			depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
			kyber_resize_domain(kqd, sched_domain, depth);
		}
	}
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	int ret = -ENOMEM;
	int i;

	kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;

	kqd->q = q;
	kqd->dev = disk_devt(q->disk);

	kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
					    GFP_KERNEL | __GFP_ZERO);
	if (!kqd->cpu_latency)
		goto err_kqd;

	timer_setup(&kqd->timer, kyber_timer_fn, 0);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      kyber_depth[i], -1, false,
					      GFP_KERNEL, q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_buckets;
		}
	}

	for (i = 0; i < KYBER_OTHER; i++) {
		kqd->domain_p99[i] = -1;
		kqd->latency_targets[i] = kyber_latency_targets[i];
	}

	return kqd;

err_buckets:
	free_percpu(kqd->cpu_latency);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	blk_stat_enable_accounting(q);

	blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);

	eq->elevator_data = kqd;
	q->elevator = eq;

	return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	int i;

	timer_shutdown_sync(&kqd->timer);
	blk_stat_disable_accounting(kqd->q);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	free_percpu(kqd->cpu_latency);
	kfree(kqd);
}

static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
	unsigned int i;

	spin_lock_init(&kcq->lock);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		INIT_LIST_HEAD(&kcq->rq_list[i]);
}

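/*
 * Recompute the async depth limit whenever the scheduler tag depth changes.
 * The percentage is applied to the per-word depth of the tag sbitmap, as
 * expected by sbitmap_get_shallow().
 */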
static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct blk_mq_tags *tags = hctx->sched_tags;
	unsigned int shift = tags->bitmap_tags.sb.shift;

	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
}

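/*
 * Set up the per-hardware-queue data: one kyber_ctx_queue per software queue,
 * a bitmap of non-empty kcqs for each domain, and the domain wait queue
 * entries used when a domain runs out of tokens.
 */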
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
				       sizeof(struct kyber_ctx_queue),
				       GFP_KERNEL, hctx->numa_node);
	if (!khd->kcqs)
		goto err_khd;

	for (i = 0; i < hctx->nr_ctx; i++)
		kyber_ctx_queue_init(&khd->kcqs[i]);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
				      ilog2(8), GFP_KERNEL, hctx->numa_node,
				      false, false)) {
			while (--i >= 0)
				sbitmap_free(&khd->kcq_map[i]);
			goto err_kcqs;
		}
	}

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		khd->domain_wait[i].sbq = NULL;
		init_waitqueue_func_entry(&khd->domain_wait[i].wait,
					  kyber_domain_wake);
		khd->domain_wait[i].wait.private = hctx;
		INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;
	kyber_depth_updated(hctx);

	return 0;

err_kcqs:
	kfree(khd->kcqs);
err_khd:
	kfree(khd);
	return -ENOMEM;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_free(&khd->kcq_map[i]);
	kfree(khd->kcqs);
	kfree(hctx->sched_data);
}

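/*
 * The domain token held by an in-flight request is stashed in
 * rq->elv.priv[0]; -1 means no token is held.
 */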
static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = kyber_sched_domain(rq->cmd_flags);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}

static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(opf)) {
		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

		data->shallow_depth = kqd->async_depth;
	}
}

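/*
 * Try to merge a bio into a request already pending on this software queue's
 * list for the same scheduling domain.
 */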
static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
			    unsigned int nr_segs)
{
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
	struct list_head *rq_list = &kcq->rq_list[sched_domain];
	bool merged;

	spin_lock(&kcq->lock);
	merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
	spin_unlock(&kcq->lock);

	return merged;
}

static void kyber_prepare_request(struct request *rq)
{
	rq_set_domain_token(rq, -1);
}

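/*
 * Move each request onto the kyber_ctx_queue list for its scheduling domain
 * and mark the corresponding bit in kcq_map so that dispatch knows this
 * software queue has pending requests.
 */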
static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct list_head *rq_list,
				  blk_insert_t flags)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq, *next;

	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
		struct list_head *head = &kcq->rq_list[sched_domain];

		spin_lock(&kcq->lock);
		trace_block_rq_insert(rq);
		if (flags & BLK_MQ_INSERT_AT_HEAD)
			list_move(&rq->queuelist, head);
		else
			list_move_tail(&rq->queuelist, head);
		sbitmap_set_bit(&khd->kcq_map[sched_domain],
				rq->mq_ctx->index_hw[hctx->type]);
		spin_unlock(&kcq->lock);
	}
}

static void kyber_finish_request(struct request *rq)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
}

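/*
 * Record one latency sample in the per-cpu histogram. The bucket index is the
 * sample's latency divided by a quarter of the target latency, clamped to the
 * last bucket.
 */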
static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
			       unsigned int sched_domain, unsigned int type,
			       u64 target, u64 latency)
{
	unsigned int bucket;
	u64 divisor;

	if (latency > 0) {
		divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
		bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
			       KYBER_LATENCY_BUCKETS - 1);
	} else {
		bucket = 0;
	}

	atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
}

static void kyber_completed_request(struct request *rq, u64 now)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
	struct kyber_cpu_latency *cpu_latency;
	unsigned int sched_domain;
	u64 target;

	sched_domain = kyber_sched_domain(rq->cmd_flags);
	if (sched_domain == KYBER_OTHER)
		return;

	cpu_latency = get_cpu_ptr(kqd->cpu_latency);
	target = kqd->latency_targets[sched_domain];
	add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
			   target, now - rq->start_time_ns);
	add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
			   now - rq->io_start_time_ns);
	put_cpu_ptr(kqd->cpu_latency);

	timer_reduce(&kqd->timer, jiffies + HZ / 10);
}

struct flush_kcq_data {
	struct kyber_hctx_data *khd;
	unsigned int sched_domain;
	struct list_head *list;
};

static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_kcq_data *flush_data = data;
	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

	spin_lock(&kcq->lock);
	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
			      flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&kcq->lock);

	return true;
}

static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
				  unsigned int sched_domain,
				  struct list_head *list)
{
	struct flush_kcq_data data = {
		.khd = khd,
		.sched_domain = sched_domain,
		.list = list,
	};

	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
			     flush_busy_kcq, &data);
}

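/*
 * Wake callback run when a domain token is freed: detach from the sbitmap
 * wait queue and rerun the hardware queue so that it can retry the token
 * allocation.
 */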
static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
	struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);

	sbitmap_del_wait_queue(wait);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	struct sbq_wait *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		khd->domain_ws[sched_domain] = ws;
		sbitmap_add_wait_queue(domain_tokens, ws, wait);

		/*
		 * Try again in case a token was freed before we got on the wait
		 * queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}

	/*
	 * If we got a token while we were on the wait queue, remove ourselves
	 * from the wait queue to ensure that all wake ups make forward
	 * progress. It's possible that the waker already deleted the entry
	 * between the !list_empty_careful() check and us grabbing the lock, but
	 * list_del_init() is okay with that.
	 */
	if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
		ws = khd->domain_ws[sched_domain];
		spin_lock_irq(&ws->wait.lock);
		sbitmap_del_wait_queue(wait);
		spin_unlock_irq(&ws->wait.lock);
	}

	return nr;
}

static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];

	/*
	 * If we already have a flushed request, then we just need to get a
	 * token for it. Otherwise, if there are pending requests in the kcqs,
	 * flush the kcqs, but only if we can get a token. If not, we should
	 * leave the requests in the kcqs so that they can be merged. Note that
	 * khd->lock serializes the flushes, so if we observed any bit set in
	 * the kcq_map, we will always get a request.
	 */
	rq = list_first_entry_or_null(rqs, struct request, queuelist);
	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		} else {
			trace_kyber_throttled(kqd->dev,
					      kyber_domain_names[khd->cur_domain]);
		}
	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
			rq = list_first_entry(rqs, struct request, queuelist);
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		} else {
			trace_kyber_throttled(kqd->dev,
					      kyber_domain_names[khd->cur_domain]);
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]) ||
		    sbitmap_any_bit_set(&khd->kcq_map[i]))
			return true;
	}

	return false;
}

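/*
 * sysfs show/store handlers for the per-domain latency targets, in
 * nanoseconds.
 */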
#define KYBER_LAT_SHOW_STORE(domain, name)				\
static ssize_t kyber_##name##_lat_show(struct elevator_queue *e,	\
				       char *page)			\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
									\
	return sprintf(page, "%llu\n", kqd->latency_targets[domain]);	\
}									\
									\
static ssize_t kyber_##name##_lat_store(struct elevator_queue *e,	\
					const char *page, size_t count)	\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
	unsigned long long nsec;					\
	int ret;							\
									\
	ret = kstrtoull(page, 10, &nsec);				\
	if (ret)							\
		return ret;						\
									\
	kqd->latency_targets[domain] = nsec;				\
									\
	return count;							\
}
KYBER_LAT_SHOW_STORE(KYBER_READ, read);
KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE

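/*
 * The tunables are exposed under the queue's iosched directory, e.g. (the
 * exact device name will vary):
 *
 *   echo 10000000 > /sys/block/sda/queue/iosched/write_lat_nsec
 */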
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR

#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)			\
static int kyber_##name##_tokens_show(void *data, struct seq_file *m)	\
{									\
	struct request_queue *q = data;					\
	struct kyber_queue_data *kqd = q->elevator->elevator_data;	\
									\
	sbitmap_queue_show(&kqd->domain_tokens[domain], m);		\
	return 0;							\
}									\
									\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)	\
	__acquires(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_lock(&khd->lock);						\
	return seq_list_start(&khd->rqs[domain], *pos);			\
}									\
									\
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,	\
				     loff_t *pos)			\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	return seq_list_next(v, &khd->rqs[domain], pos);		\
}									\
									\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)	\
	__releases(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_unlock(&khd->lock);					\
}									\
									\
static const struct seq_operations kyber_##name##_rqs_seq_ops = {	\
	.start = kyber_##name##_rqs_start,				\
	.next = kyber_##name##_rqs_next,				\
	.stop = kyber_##name##_rqs_stop,				\
	.show = blk_mq_debugfs_rq_show,					\
};									\
									\
static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
{									\
	struct blk_mq_hw_ctx *hctx = data;				\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
	wait_queue_entry_t *wait = &khd->domain_wait[domain].wait;	\
									\
	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));	\
	return 0;							\
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", kqd->async_depth);
	return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
	return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%u\n", khd->batching);
	return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name)	\
	{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
	KYBER_QUEUE_DOMAIN_ATTRS(read),
	KYBER_QUEUE_DOMAIN_ATTRS(write),
	KYBER_QUEUE_DOMAIN_ATTRS(discard),
	KYBER_QUEUE_DOMAIN_ATTRS(other),
	{"async_depth", 0400, kyber_async_depth_show},
	{},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name)					\
	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},	\
	{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
	KYBER_HCTX_DOMAIN_ATTRS(read),
	KYBER_HCTX_DOMAIN_ATTRS(write),
	KYBER_HCTX_DOMAIN_ATTRS(discard),
	KYBER_HCTX_DOMAIN_ATTRS(other),
	{"cur_domain", 0400, kyber_cur_domain_show},
	{"batching", 0400, kyber_batching_show},
	{},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif

static struct elevator_type kyber_sched = {
	.ops = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.limit_depth = kyber_limit_depth,
		.bio_merge = kyber_bio_merge,
		.prepare_request = kyber_prepare_request,
		.insert_requests = kyber_insert_requests,
		.finish_request = kyber_finish_request,
		.requeue_request = kyber_finish_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
		.depth_updated = kyber_depth_updated,
	},
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");