1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved. |
4 | * |
5 | * Module Author: Kiyoshi Ueda |
6 | * |
7 | * This file is released under the GPL. |
8 | * |
9 | * Throughput oriented path selector. |
10 | */ |
11 | |
12 | #include "dm.h" |
13 | #include "dm-path-selector.h" |
14 | |
15 | #include <linux/slab.h> |
16 | #include <linux/module.h> |
17 | |
/* Prefix string for this selector's log messages. */
#define DM_MSG_PREFIX				"multipath service-time"

/* Default repeat_count used by st_add_path() when none is supplied. */
#define ST_MIN_IO				1

/* Largest relative_throughput value st_add_path() accepts. */
#define ST_MAX_RELATIVE_THROUGHPUT		100

/*
 * Number of bits st_compare_load() shifts sizes down by when they are too
 * large to be multiplied by a relative_throughput safely
 * (1 << 7 = 128 >= ST_MAX_RELATIVE_THROUGHPUT).
 */
#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7

/* Sizes at or above this threshold are shifted down before multiplying. */
#define ST_MAX_INFLIGHT_SIZE \
	(~(size_t)0 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)

/* Version string reported when the module is loaded. */
#define ST_VERSION				"0.3.0"
24 | |
25 | struct selector { |
26 | struct list_head valid_paths; |
27 | struct list_head failed_paths; |
28 | spinlock_t lock; |
29 | }; |
30 | |
31 | struct path_info { |
32 | struct list_head list; |
33 | struct dm_path *path; |
34 | unsigned int repeat_count; |
35 | unsigned int relative_throughput; |
36 | atomic_t in_flight_size; /* Total size of in-flight I/Os */ |
37 | }; |
38 | |
39 | static struct selector *alloc_selector(void) |
40 | { |
41 | struct selector *s = kmalloc(size: sizeof(*s), GFP_KERNEL); |
42 | |
43 | if (s) { |
44 | INIT_LIST_HEAD(list: &s->valid_paths); |
45 | INIT_LIST_HEAD(list: &s->failed_paths); |
46 | spin_lock_init(&s->lock); |
47 | } |
48 | |
49 | return s; |
50 | } |
51 | |
52 | static int st_create(struct path_selector *ps, unsigned int argc, char **argv) |
53 | { |
54 | struct selector *s = alloc_selector(); |
55 | |
56 | if (!s) |
57 | return -ENOMEM; |
58 | |
59 | ps->context = s; |
60 | return 0; |
61 | } |
62 | |
63 | static void free_paths(struct list_head *paths) |
64 | { |
65 | struct path_info *pi, *next; |
66 | |
67 | list_for_each_entry_safe(pi, next, paths, list) { |
68 | list_del(entry: &pi->list); |
69 | kfree(objp: pi); |
70 | } |
71 | } |
72 | |
73 | static void st_destroy(struct path_selector *ps) |
74 | { |
75 | struct selector *s = ps->context; |
76 | |
77 | free_paths(paths: &s->valid_paths); |
78 | free_paths(paths: &s->failed_paths); |
79 | kfree(objp: s); |
80 | ps->context = NULL; |
81 | } |
82 | |
83 | static int st_status(struct path_selector *ps, struct dm_path *path, |
84 | status_type_t type, char *result, unsigned int maxlen) |
85 | { |
86 | unsigned int sz = 0; |
87 | struct path_info *pi; |
88 | |
89 | if (!path) |
90 | DMEMIT("0 " ); |
91 | else { |
92 | pi = path->pscontext; |
93 | |
94 | switch (type) { |
95 | case STATUSTYPE_INFO: |
96 | DMEMIT("%d %u " , atomic_read(&pi->in_flight_size), |
97 | pi->relative_throughput); |
98 | break; |
99 | case STATUSTYPE_TABLE: |
100 | DMEMIT("%u %u " , pi->repeat_count, |
101 | pi->relative_throughput); |
102 | break; |
103 | case STATUSTYPE_IMA: |
104 | result[0] = '\0'; |
105 | break; |
106 | } |
107 | } |
108 | |
109 | return sz; |
110 | } |
111 | |
112 | static int st_add_path(struct path_selector *ps, struct dm_path *path, |
113 | int argc, char **argv, char **error) |
114 | { |
115 | struct selector *s = ps->context; |
116 | struct path_info *pi; |
117 | unsigned int repeat_count = ST_MIN_IO; |
118 | unsigned int relative_throughput = 1; |
119 | char dummy; |
120 | unsigned long flags; |
121 | |
122 | /* |
123 | * Arguments: [<repeat_count> [<relative_throughput>]] |
124 | * <repeat_count>: The number of I/Os before switching path. |
125 | * If not given, default (ST_MIN_IO) is used. |
126 | * <relative_throughput>: The relative throughput value of |
127 | * the path among all paths in the path-group. |
128 | * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT> |
129 | * If not given, minimum value '1' is used. |
130 | * If '0' is given, the path isn't selected while |
131 | * other paths having a positive value are available. |
132 | */ |
133 | if (argc > 2) { |
134 | *error = "service-time ps: incorrect number of arguments" ; |
135 | return -EINVAL; |
136 | } |
137 | |
138 | if (argc && (sscanf(argv[0], "%u%c" , &repeat_count, &dummy) != 1)) { |
139 | *error = "service-time ps: invalid repeat count" ; |
140 | return -EINVAL; |
141 | } |
142 | |
143 | if (repeat_count > 1) { |
144 | DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead" ); |
145 | repeat_count = 1; |
146 | } |
147 | |
148 | if ((argc == 2) && |
149 | (sscanf(argv[1], "%u%c" , &relative_throughput, &dummy) != 1 || |
150 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { |
151 | *error = "service-time ps: invalid relative_throughput value" ; |
152 | return -EINVAL; |
153 | } |
154 | |
155 | /* allocate the path */ |
156 | pi = kmalloc(size: sizeof(*pi), GFP_KERNEL); |
157 | if (!pi) { |
158 | *error = "service-time ps: Error allocating path context" ; |
159 | return -ENOMEM; |
160 | } |
161 | |
162 | pi->path = path; |
163 | pi->repeat_count = repeat_count; |
164 | pi->relative_throughput = relative_throughput; |
165 | atomic_set(v: &pi->in_flight_size, i: 0); |
166 | |
167 | path->pscontext = pi; |
168 | |
169 | spin_lock_irqsave(&s->lock, flags); |
170 | list_add_tail(new: &pi->list, head: &s->valid_paths); |
171 | spin_unlock_irqrestore(lock: &s->lock, flags); |
172 | |
173 | return 0; |
174 | } |
175 | |
176 | static void st_fail_path(struct path_selector *ps, struct dm_path *path) |
177 | { |
178 | struct selector *s = ps->context; |
179 | struct path_info *pi = path->pscontext; |
180 | unsigned long flags; |
181 | |
182 | spin_lock_irqsave(&s->lock, flags); |
183 | list_move(list: &pi->list, head: &s->failed_paths); |
184 | spin_unlock_irqrestore(lock: &s->lock, flags); |
185 | } |
186 | |
187 | static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) |
188 | { |
189 | struct selector *s = ps->context; |
190 | struct path_info *pi = path->pscontext; |
191 | unsigned long flags; |
192 | |
193 | spin_lock_irqsave(&s->lock, flags); |
194 | list_move_tail(list: &pi->list, head: &s->valid_paths); |
195 | spin_unlock_irqrestore(lock: &s->lock, flags); |
196 | |
197 | return 0; |
198 | } |
199 | |
200 | /* |
201 | * Compare the estimated service time of 2 paths, pi1 and pi2, |
202 | * for the incoming I/O. |
203 | * |
204 | * Returns: |
205 | * < 0 : pi1 is better |
206 | * 0 : no difference between pi1 and pi2 |
207 | * > 0 : pi2 is better |
208 | * |
209 | * Description: |
210 | * Basically, the service time is estimated by: |
211 | * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput' |
212 | * To reduce the calculation, some optimizations are made. |
213 | * (See comments inline) |
214 | */ |
215 | static int st_compare_load(struct path_info *pi1, struct path_info *pi2, |
216 | size_t incoming) |
217 | { |
218 | size_t sz1, sz2, st1, st2; |
219 | |
220 | sz1 = atomic_read(v: &pi1->in_flight_size); |
221 | sz2 = atomic_read(v: &pi2->in_flight_size); |
222 | |
223 | /* |
224 | * Case 1: Both have same throughput value. Choose less loaded path. |
225 | */ |
226 | if (pi1->relative_throughput == pi2->relative_throughput) |
227 | return sz1 - sz2; |
228 | |
229 | /* |
230 | * Case 2a: Both have same load. Choose higher throughput path. |
231 | * Case 2b: One path has no throughput value. Choose the other one. |
232 | */ |
233 | if (sz1 == sz2 || |
234 | !pi1->relative_throughput || !pi2->relative_throughput) |
235 | return pi2->relative_throughput - pi1->relative_throughput; |
236 | |
237 | /* |
238 | * Case 3: Calculate service time. Choose faster path. |
239 | * Service time using pi1: |
240 | * st1 = (sz1 + incoming) / pi1->relative_throughput |
241 | * Service time using pi2: |
242 | * st2 = (sz2 + incoming) / pi2->relative_throughput |
243 | * |
244 | * To avoid the division, transform the expression to use |
245 | * multiplication. |
246 | * Because ->relative_throughput > 0 here, if st1 < st2, |
247 | * the expressions below are the same meaning: |
248 | * (sz1 + incoming) / pi1->relative_throughput < |
249 | * (sz2 + incoming) / pi2->relative_throughput |
250 | * (sz1 + incoming) * pi2->relative_throughput < |
251 | * (sz2 + incoming) * pi1->relative_throughput |
252 | * So use the later one. |
253 | */ |
254 | sz1 += incoming; |
255 | sz2 += incoming; |
256 | if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE || |
257 | sz2 >= ST_MAX_INFLIGHT_SIZE)) { |
258 | /* |
259 | * Size may be too big for multiplying pi->relative_throughput |
260 | * and overflow. |
261 | * To avoid the overflow and mis-selection, shift down both. |
262 | */ |
263 | sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; |
264 | sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT; |
265 | } |
266 | st1 = sz1 * pi2->relative_throughput; |
267 | st2 = sz2 * pi1->relative_throughput; |
268 | if (st1 != st2) |
269 | return st1 - st2; |
270 | |
271 | /* |
272 | * Case 4: Service time is equal. Choose higher throughput path. |
273 | */ |
274 | return pi2->relative_throughput - pi1->relative_throughput; |
275 | } |
276 | |
277 | static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) |
278 | { |
279 | struct selector *s = ps->context; |
280 | struct path_info *pi = NULL, *best = NULL; |
281 | struct dm_path *ret = NULL; |
282 | unsigned long flags; |
283 | |
284 | spin_lock_irqsave(&s->lock, flags); |
285 | if (list_empty(head: &s->valid_paths)) |
286 | goto out; |
287 | |
288 | list_for_each_entry(pi, &s->valid_paths, list) |
289 | if (!best || (st_compare_load(pi1: pi, pi2: best, incoming: nr_bytes) < 0)) |
290 | best = pi; |
291 | |
292 | if (!best) |
293 | goto out; |
294 | |
295 | /* Move most recently used to least preferred to evenly balance. */ |
296 | list_move_tail(list: &best->list, head: &s->valid_paths); |
297 | |
298 | ret = best->path; |
299 | out: |
300 | spin_unlock_irqrestore(lock: &s->lock, flags); |
301 | return ret; |
302 | } |
303 | |
304 | static int st_start_io(struct path_selector *ps, struct dm_path *path, |
305 | size_t nr_bytes) |
306 | { |
307 | struct path_info *pi = path->pscontext; |
308 | |
309 | atomic_add(i: nr_bytes, v: &pi->in_flight_size); |
310 | |
311 | return 0; |
312 | } |
313 | |
314 | static int st_end_io(struct path_selector *ps, struct dm_path *path, |
315 | size_t nr_bytes, u64 start_time) |
316 | { |
317 | struct path_info *pi = path->pscontext; |
318 | |
319 | atomic_sub(i: nr_bytes, v: &pi->in_flight_size); |
320 | |
321 | return 0; |
322 | } |
323 | |
324 | static struct path_selector_type st_ps = { |
325 | .name = "service-time" , |
326 | .module = THIS_MODULE, |
327 | .table_args = 2, |
328 | .info_args = 2, |
329 | .create = st_create, |
330 | .destroy = st_destroy, |
331 | .status = st_status, |
332 | .add_path = st_add_path, |
333 | .fail_path = st_fail_path, |
334 | .reinstate_path = st_reinstate_path, |
335 | .select_path = st_select_path, |
336 | .start_io = st_start_io, |
337 | .end_io = st_end_io, |
338 | }; |
339 | |
340 | static int __init dm_st_init(void) |
341 | { |
342 | int r = dm_register_path_selector(type: &st_ps); |
343 | |
344 | if (r < 0) |
345 | DMERR("register failed %d" , r); |
346 | |
347 | DMINFO("version " ST_VERSION " loaded" ); |
348 | |
349 | return r; |
350 | } |
351 | |
352 | static void __exit dm_st_exit(void) |
353 | { |
354 | int r = dm_unregister_path_selector(type: &st_ps); |
355 | |
356 | if (r < 0) |
357 | DMERR("unregister failed %d" , r); |
358 | } |
359 | |
360 | module_init(dm_st_init); |
361 | module_exit(dm_st_exit); |
362 | |
363 | MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector" ); |
364 | MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>" ); |
365 | MODULE_LICENSE("GPL" ); |
366 | |