1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Historical Service Time |
4 | * |
5 | * Keeps a time-weighted exponential moving average of the historical |
6 | * service time. Estimates future service time based on the historical |
7 | * service time and the number of outstanding requests. |
8 | * |
9 | * Marks paths stale if they have not finished within hst * |
10 | * num_paths. If a path is stale and unused, we will send a single |
11 | * request to probe in case the path has improved. This situation |
12 | * generally arises if the path is so much worse than others that it |
13 | * will never have the best estimated service time, or if the entire |
14 | * multipath device is unused. If a path is stale and in use, limit the |
15 | * number of requests it can receive with the assumption that the path |
16 | * has become degraded. |
17 | * |
18 | * To avoid repeatedly calculating exponents for time weighting, times |
19 | * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) |
20 | * ns, and the weighting is pre-calculated. |
21 | * |
22 | */ |
23 | |
24 | #include "dm.h" |
25 | #include "dm-path-selector.h" |
26 | |
27 | #include <linux/blkdev.h> |
28 | #include <linux/slab.h> |
29 | #include <linux/module.h> |
30 | |
31 | |
32 | #define DM_MSG_PREFIX "multipath historical-service-time" |
33 | #define HST_MIN_IO 1 |
34 | #define HST_VERSION "0.1.1" |
35 | |
36 | #define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ |
37 | #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) |
38 | #define HST_FIXED_1 (1 << HST_FIXED_SHIFT) |
39 | #define HST_FIXED_95 972 |
40 | #define HST_MAX_INFLIGHT HST_FIXED_1 |
41 | #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ |
42 | #define HST_WEIGHT_COUNT 64ULL |
43 | |
44 | struct selector { |
45 | struct list_head valid_paths; |
46 | struct list_head failed_paths; |
47 | int valid_count; |
48 | spinlock_t lock; |
49 | |
50 | unsigned int weights[HST_WEIGHT_COUNT]; |
51 | unsigned int threshold_multiplier; |
52 | }; |
53 | |
54 | struct path_info { |
55 | struct list_head list; |
56 | struct dm_path *path; |
57 | unsigned int repeat_count; |
58 | |
59 | spinlock_t lock; |
60 | |
61 | u64 historical_service_time; /* Fixed point */ |
62 | |
63 | u64 stale_after; |
64 | u64 last_finish; |
65 | |
66 | u64 outstanding; |
67 | }; |
68 | |
69 | /** |
70 | * fixed_power - compute: x^n, in O(log n) time |
71 | * |
72 | * @x: base of the power |
73 | * @frac_bits: fractional bits of @x |
74 | * @n: power to raise @x to. |
75 | * |
76 | * By exploiting the relation between the definition of the natural power |
77 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and |
78 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, |
79 | * (where: n_i \elem {0, 1}, the binary vector representing n), |
80 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is |
81 | * of course trivially computable in O(log_2 n), the length of our binary |
82 | * vector. |
83 | * |
84 | * (see: kernel/sched/loadavg.c) |
85 | */ |
86 | static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) |
87 | { |
88 | unsigned long result = 1UL << frac_bits; |
89 | |
90 | if (n) { |
91 | for (;;) { |
92 | if (n & 1) { |
93 | result *= x; |
94 | result += 1UL << (frac_bits - 1); |
95 | result >>= frac_bits; |
96 | } |
97 | n >>= 1; |
98 | if (!n) |
99 | break; |
100 | x *= x; |
101 | x += 1UL << (frac_bits - 1); |
102 | x >>= frac_bits; |
103 | } |
104 | } |
105 | |
106 | return result; |
107 | } |
108 | |
109 | /* |
110 | * Calculate the next value of an exponential moving average |
111 | * a_1 = a_0 * e + a * (1 - e) |
112 | * |
113 | * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] |
114 | * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] |
115 | * @weight: [0, HST_FIXED_1] |
116 | * |
117 | * Note: |
118 | * To account for multiple periods in the same calculation, |
119 | * a_n = a_0 * e^n + a * (1 - e^n), |
120 | * so call fixed_ema(last, next, pow(weight, N)) |
121 | */ |
122 | static u64 fixed_ema(u64 last, u64 next, u64 weight) |
123 | { |
124 | last *= weight; |
125 | last += next * (HST_FIXED_1 - weight); |
126 | last += 1ULL << (HST_FIXED_SHIFT - 1); |
127 | return last >> HST_FIXED_SHIFT; |
128 | } |
129 | |
130 | static struct selector *alloc_selector(void) |
131 | { |
132 | struct selector *s = kmalloc(size: sizeof(*s), GFP_KERNEL); |
133 | |
134 | if (s) { |
135 | INIT_LIST_HEAD(list: &s->valid_paths); |
136 | INIT_LIST_HEAD(list: &s->failed_paths); |
137 | spin_lock_init(&s->lock); |
138 | s->valid_count = 0; |
139 | } |
140 | |
141 | return s; |
142 | } |
143 | |
144 | /* |
145 | * Get the weight for a given time span. |
146 | */ |
147 | static u64 hst_weight(struct path_selector *ps, u64 delta) |
148 | { |
149 | struct selector *s = ps->context; |
150 | int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, |
151 | HST_WEIGHT_COUNT - 1); |
152 | |
153 | return s->weights[bucket]; |
154 | } |
155 | |
156 | /* |
157 | * Set up the weights array. |
158 | * |
159 | * weights[len-1] = 0 |
160 | * weights[n] = base ^ (n + 1) |
161 | */ |
162 | static void hst_set_weights(struct path_selector *ps, unsigned int base) |
163 | { |
164 | struct selector *s = ps->context; |
165 | int i; |
166 | |
167 | if (base >= HST_FIXED_1) |
168 | return; |
169 | |
170 | for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) |
171 | s->weights[i] = fixed_power(x: base, HST_FIXED_SHIFT, n: i + 1); |
172 | s->weights[HST_WEIGHT_COUNT - 1] = 0; |
173 | } |
174 | |
175 | static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) |
176 | { |
177 | struct selector *s; |
178 | unsigned int base_weight = HST_FIXED_95; |
179 | unsigned int threshold_multiplier = 0; |
180 | char dummy; |
181 | |
182 | /* |
183 | * Arguments: [<base_weight> [<threshold_multiplier>]] |
184 | * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A |
185 | * value of 0 will completely ignore any history. |
186 | * If not given, default (HST_FIXED_95) is used. |
187 | * <threshold_multiplier>: Minimum threshold multiplier for paths to |
188 | * be considered different. That is, a path is |
189 | * considered different iff (p1 > N * p2) where p1 |
190 | * is the path with higher service time. A threshold |
191 | * of 1 or 0 has no effect. Defaults to 0. |
192 | */ |
193 | if (argc > 2) |
194 | return -EINVAL; |
195 | |
196 | if (argc && (sscanf(argv[0], "%u%c" , &base_weight, &dummy) != 1 || |
197 | base_weight >= HST_FIXED_1)) { |
198 | return -EINVAL; |
199 | } |
200 | |
201 | if (argc > 1 && (sscanf(argv[1], "%u%c" , |
202 | &threshold_multiplier, &dummy) != 1)) { |
203 | return -EINVAL; |
204 | } |
205 | |
206 | s = alloc_selector(); |
207 | if (!s) |
208 | return -ENOMEM; |
209 | |
210 | ps->context = s; |
211 | |
212 | hst_set_weights(ps, base: base_weight); |
213 | s->threshold_multiplier = threshold_multiplier; |
214 | return 0; |
215 | } |
216 | |
217 | static void free_paths(struct list_head *paths) |
218 | { |
219 | struct path_info *pi, *next; |
220 | |
221 | list_for_each_entry_safe(pi, next, paths, list) { |
222 | list_del(entry: &pi->list); |
223 | kfree(objp: pi); |
224 | } |
225 | } |
226 | |
227 | static void hst_destroy(struct path_selector *ps) |
228 | { |
229 | struct selector *s = ps->context; |
230 | |
231 | free_paths(paths: &s->valid_paths); |
232 | free_paths(paths: &s->failed_paths); |
233 | kfree(objp: s); |
234 | ps->context = NULL; |
235 | } |
236 | |
237 | static int hst_status(struct path_selector *ps, struct dm_path *path, |
238 | status_type_t type, char *result, unsigned int maxlen) |
239 | { |
240 | unsigned int sz = 0; |
241 | struct path_info *pi; |
242 | |
243 | if (!path) { |
244 | struct selector *s = ps->context; |
245 | |
246 | DMEMIT("2 %u %u " , s->weights[0], s->threshold_multiplier); |
247 | } else { |
248 | pi = path->pscontext; |
249 | |
250 | switch (type) { |
251 | case STATUSTYPE_INFO: |
252 | DMEMIT("%llu %llu %llu " , pi->historical_service_time, |
253 | pi->outstanding, pi->stale_after); |
254 | break; |
255 | case STATUSTYPE_TABLE: |
256 | DMEMIT("0 " ); |
257 | break; |
258 | case STATUSTYPE_IMA: |
259 | *result = '\0'; |
260 | break; |
261 | } |
262 | } |
263 | |
264 | return sz; |
265 | } |
266 | |
267 | static int hst_add_path(struct path_selector *ps, struct dm_path *path, |
268 | int argc, char **argv, char **error) |
269 | { |
270 | struct selector *s = ps->context; |
271 | struct path_info *pi; |
272 | unsigned int repeat_count = HST_MIN_IO; |
273 | char dummy; |
274 | unsigned long flags; |
275 | |
276 | /* |
277 | * Arguments: [<repeat_count>] |
278 | * <repeat_count>: The number of I/Os before switching path. |
279 | * If not given, default (HST_MIN_IO) is used. |
280 | */ |
281 | if (argc > 1) { |
282 | *error = "historical-service-time ps: incorrect number of arguments" ; |
283 | return -EINVAL; |
284 | } |
285 | |
286 | if (argc && (sscanf(argv[0], "%u%c" , &repeat_count, &dummy) != 1)) { |
287 | *error = "historical-service-time ps: invalid repeat count" ; |
288 | return -EINVAL; |
289 | } |
290 | |
291 | /* allocate the path */ |
292 | pi = kmalloc(size: sizeof(*pi), GFP_KERNEL); |
293 | if (!pi) { |
294 | *error = "historical-service-time ps: Error allocating path context" ; |
295 | return -ENOMEM; |
296 | } |
297 | |
298 | pi->path = path; |
299 | pi->repeat_count = repeat_count; |
300 | |
301 | pi->historical_service_time = HST_FIXED_1; |
302 | |
303 | spin_lock_init(&pi->lock); |
304 | pi->outstanding = 0; |
305 | |
306 | pi->stale_after = 0; |
307 | pi->last_finish = 0; |
308 | |
309 | path->pscontext = pi; |
310 | |
311 | spin_lock_irqsave(&s->lock, flags); |
312 | list_add_tail(new: &pi->list, head: &s->valid_paths); |
313 | s->valid_count++; |
314 | spin_unlock_irqrestore(lock: &s->lock, flags); |
315 | |
316 | return 0; |
317 | } |
318 | |
319 | static void hst_fail_path(struct path_selector *ps, struct dm_path *path) |
320 | { |
321 | struct selector *s = ps->context; |
322 | struct path_info *pi = path->pscontext; |
323 | unsigned long flags; |
324 | |
325 | spin_lock_irqsave(&s->lock, flags); |
326 | list_move(list: &pi->list, head: &s->failed_paths); |
327 | s->valid_count--; |
328 | spin_unlock_irqrestore(lock: &s->lock, flags); |
329 | } |
330 | |
331 | static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) |
332 | { |
333 | struct selector *s = ps->context; |
334 | struct path_info *pi = path->pscontext; |
335 | unsigned long flags; |
336 | |
337 | spin_lock_irqsave(&s->lock, flags); |
338 | list_move_tail(list: &pi->list, head: &s->valid_paths); |
339 | s->valid_count++; |
340 | spin_unlock_irqrestore(lock: &s->lock, flags); |
341 | |
342 | return 0; |
343 | } |
344 | |
345 | static void hst_fill_compare(struct path_info *pi, u64 *hst, |
346 | u64 *out, u64 *stale) |
347 | { |
348 | unsigned long flags; |
349 | |
350 | spin_lock_irqsave(&pi->lock, flags); |
351 | *hst = pi->historical_service_time; |
352 | *out = pi->outstanding; |
353 | *stale = pi->stale_after; |
354 | spin_unlock_irqrestore(lock: &pi->lock, flags); |
355 | } |
356 | |
357 | /* |
358 | * Compare the estimated service time of 2 paths, pi1 and pi2, |
359 | * for the incoming I/O. |
360 | * |
361 | * Returns: |
362 | * < 0 : pi1 is better |
363 | * 0 : no difference between pi1 and pi2 |
364 | * > 0 : pi2 is better |
365 | * |
366 | */ |
367 | static long long hst_compare(struct path_info *pi1, struct path_info *pi2, |
368 | u64 time_now, struct path_selector *ps) |
369 | { |
370 | struct selector *s = ps->context; |
371 | u64 hst1, hst2; |
372 | long long out1, out2, stale1, stale2; |
373 | int pi2_better, over_threshold; |
374 | |
375 | hst_fill_compare(pi: pi1, hst: &hst1, out: &out1, stale: &stale1); |
376 | hst_fill_compare(pi: pi2, hst: &hst2, out: &out2, stale: &stale2); |
377 | |
378 | /* Check here if estimated latency for two paths are too similar. |
379 | * If this is the case, we skip extra calculation and just compare |
380 | * outstanding requests. In this case, any unloaded paths will |
381 | * be preferred. |
382 | */ |
383 | if (hst1 > hst2) |
384 | over_threshold = hst1 > (s->threshold_multiplier * hst2); |
385 | else |
386 | over_threshold = hst2 > (s->threshold_multiplier * hst1); |
387 | |
388 | if (!over_threshold) |
389 | return out1 - out2; |
390 | |
391 | /* |
392 | * If an unloaded path is stale, choose it. If both paths are unloaded, |
393 | * choose path that is the most stale. |
394 | * (If one path is loaded, choose the other) |
395 | */ |
396 | if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || |
397 | (!out1 && !out2)) |
398 | return (!out2 * stale1) - (!out1 * stale2); |
399 | |
400 | /* Compare estimated service time. If outstanding is the same, we |
401 | * don't need to multiply |
402 | */ |
403 | if (out1 == out2) { |
404 | pi2_better = hst1 > hst2; |
405 | } else { |
406 | /* Potential overflow with out >= 1024 */ |
407 | if (unlikely(out1 >= HST_MAX_INFLIGHT || |
408 | out2 >= HST_MAX_INFLIGHT)) { |
409 | /* If over 1023 in-flights, we may overflow if hst |
410 | * is at max. (With this shift we still overflow at |
411 | * 1048576 in-flights, which is high enough). |
412 | */ |
413 | hst1 >>= HST_FIXED_SHIFT; |
414 | hst2 >>= HST_FIXED_SHIFT; |
415 | } |
416 | pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; |
417 | } |
418 | |
419 | /* In the case that the 'winner' is stale, limit to equal usage. */ |
420 | if (pi2_better) { |
421 | if (stale2 < time_now) |
422 | return out1 - out2; |
423 | return 1; |
424 | } |
425 | if (stale1 < time_now) |
426 | return out1 - out2; |
427 | return -1; |
428 | } |
429 | |
430 | static struct dm_path *hst_select_path(struct path_selector *ps, |
431 | size_t nr_bytes) |
432 | { |
433 | struct selector *s = ps->context; |
434 | struct path_info *pi = NULL, *best = NULL; |
435 | u64 time_now = ktime_get_ns(); |
436 | struct dm_path *ret = NULL; |
437 | unsigned long flags; |
438 | |
439 | spin_lock_irqsave(&s->lock, flags); |
440 | if (list_empty(head: &s->valid_paths)) |
441 | goto out; |
442 | |
443 | list_for_each_entry(pi, &s->valid_paths, list) { |
444 | if (!best || (hst_compare(pi1: pi, pi2: best, time_now, ps) < 0)) |
445 | best = pi; |
446 | } |
447 | |
448 | if (!best) |
449 | goto out; |
450 | |
451 | /* Move last used path to end (least preferred in case of ties) */ |
452 | list_move_tail(list: &best->list, head: &s->valid_paths); |
453 | |
454 | ret = best->path; |
455 | |
456 | out: |
457 | spin_unlock_irqrestore(lock: &s->lock, flags); |
458 | return ret; |
459 | } |
460 | |
461 | static int hst_start_io(struct path_selector *ps, struct dm_path *path, |
462 | size_t nr_bytes) |
463 | { |
464 | struct path_info *pi = path->pscontext; |
465 | unsigned long flags; |
466 | |
467 | spin_lock_irqsave(&pi->lock, flags); |
468 | pi->outstanding++; |
469 | spin_unlock_irqrestore(lock: &pi->lock, flags); |
470 | |
471 | return 0; |
472 | } |
473 | |
474 | static u64 path_service_time(struct path_info *pi, u64 start_time) |
475 | { |
476 | u64 now = ktime_get_ns(); |
477 | |
478 | /* if a previous disk request has finished after this IO was |
479 | * sent to the hardware, pretend the submission happened |
480 | * serially. |
481 | */ |
482 | if (time_after64(pi->last_finish, start_time)) |
483 | start_time = pi->last_finish; |
484 | |
485 | pi->last_finish = now; |
486 | if (time_before64(now, start_time)) |
487 | return 0; |
488 | |
489 | return now - start_time; |
490 | } |
491 | |
492 | static int hst_end_io(struct path_selector *ps, struct dm_path *path, |
493 | size_t nr_bytes, u64 start_time) |
494 | { |
495 | struct path_info *pi = path->pscontext; |
496 | struct selector *s = ps->context; |
497 | unsigned long flags; |
498 | u64 st; |
499 | |
500 | spin_lock_irqsave(&pi->lock, flags); |
501 | |
502 | st = path_service_time(pi, start_time); |
503 | pi->outstanding--; |
504 | pi->historical_service_time = |
505 | fixed_ema(last: pi->historical_service_time, |
506 | min(st * HST_FIXED_1, HST_FIXED_MAX), |
507 | weight: hst_weight(ps, delta: st)); |
508 | |
509 | /* |
510 | * On request end, mark path as fresh. If a path hasn't |
511 | * finished any requests within the fresh period, the estimated |
512 | * service time is considered too optimistic and we limit the |
513 | * maximum requests on that path. |
514 | */ |
515 | pi->stale_after = pi->last_finish + |
516 | (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); |
517 | |
518 | spin_unlock_irqrestore(lock: &pi->lock, flags); |
519 | |
520 | return 0; |
521 | } |
522 | |
523 | static struct path_selector_type hst_ps = { |
524 | .name = "historical-service-time" , |
525 | .module = THIS_MODULE, |
526 | .features = DM_PS_USE_HR_TIMER, |
527 | .table_args = 1, |
528 | .info_args = 3, |
529 | .create = hst_create, |
530 | .destroy = hst_destroy, |
531 | .status = hst_status, |
532 | .add_path = hst_add_path, |
533 | .fail_path = hst_fail_path, |
534 | .reinstate_path = hst_reinstate_path, |
535 | .select_path = hst_select_path, |
536 | .start_io = hst_start_io, |
537 | .end_io = hst_end_io, |
538 | }; |
539 | |
540 | static int __init dm_hst_init(void) |
541 | { |
542 | int r = dm_register_path_selector(type: &hst_ps); |
543 | |
544 | if (r < 0) |
545 | DMERR("register failed %d" , r); |
546 | |
547 | DMINFO("version " HST_VERSION " loaded" ); |
548 | |
549 | return r; |
550 | } |
551 | |
552 | static void __exit dm_hst_exit(void) |
553 | { |
554 | int r = dm_unregister_path_selector(type: &hst_ps); |
555 | |
556 | if (r < 0) |
557 | DMERR("unregister failed %d" , r); |
558 | } |
559 | |
560 | module_init(dm_hst_init); |
561 | module_exit(dm_hst_exit); |
562 | |
563 | MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector" ); |
564 | MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>" ); |
565 | MODULE_LICENSE("GPL" ); |
566 | |