1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * RDMA resource limiting controller for cgroups. |
4 | * |
5 | * Used to allow a cgroup hierarchy to stop processes from consuming |
6 | * additional RDMA resources after a certain limit is reached. |
7 | * |
8 | * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> |
9 | */ |
10 | |
11 | #include <linux/bitops.h> |
12 | #include <linux/slab.h> |
13 | #include <linux/seq_file.h> |
14 | #include <linux/cgroup.h> |
15 | #include <linux/parser.h> |
16 | #include <linux/cgroup_rdma.h> |
17 | |
/* Text used in place of a numeric limit of S32_MAX when parsing and printing. */
#define RDMACG_MAX_STR "max"

/*
 * Protects list of resource pools maintained on per cgroup basis
 * and rdma device list.
 */
static DEFINE_MUTEX(rdmacg_mutex);
/* Head of the list that registered rdmacg devices are linked on. */
static LIST_HEAD(rdmacg_devices);
26 | |
/*
 * Selects which per-resource value a cgroup file reports. The value is
 * stored in cftype.private and read back when the file is shown.
 */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX = 0,	/* configured limit */
	RDMACG_RESOURCE_TYPE_STAT = 1,	/* current usage */
};
31 | |
32 | /* |
33 | * resource table definition as to be seen by the user. |
34 | * Need to add entries to it when more resources are |
35 | * added/defined at IB verb/core layer. |
36 | */ |
37 | static char const *rdmacg_resource_names[] = { |
38 | [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle" , |
39 | [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object" , |
40 | }; |
41 | |
/* Limit and current charge count for one resource type of an rdma cgroup. */
struct rdmacg_resource {
	int max;	/* configured limit; S32_MAX is shown to users as "max" */
	int usage;	/* number of charges currently held */
};
47 | |
48 | /* |
49 | * resource pool object which represents per cgroup, per device |
50 | * resources. There are multiple instances of this object per cgroup, |
51 | * therefore it cannot be embedded within rdma_cgroup structure. It |
52 | * is maintained as list. |
53 | */ |
54 | struct rdmacg_resource_pool { |
55 | struct rdmacg_device *device; |
56 | struct rdmacg_resource resources[RDMACG_RESOURCE_MAX]; |
57 | |
58 | struct list_head cg_node; |
59 | struct list_head dev_node; |
60 | |
61 | /* count active user tasks of this pool */ |
62 | u64 usage_sum; |
63 | /* total number counts which are set to max */ |
64 | int num_max_cnt; |
65 | }; |
66 | |
67 | static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css) |
68 | { |
69 | return container_of(css, struct rdma_cgroup, css); |
70 | } |
71 | |
72 | static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg) |
73 | { |
74 | return css_rdmacg(css: cg->css.parent); |
75 | } |
76 | |
77 | static inline struct rdma_cgroup *get_current_rdmacg(void) |
78 | { |
79 | return css_rdmacg(css: task_get_css(current, subsys_id: rdma_cgrp_id)); |
80 | } |
81 | |
82 | static void set_resource_limit(struct rdmacg_resource_pool *rpool, |
83 | int index, int new_max) |
84 | { |
85 | if (new_max == S32_MAX) { |
86 | if (rpool->resources[index].max != S32_MAX) |
87 | rpool->num_max_cnt++; |
88 | } else { |
89 | if (rpool->resources[index].max == S32_MAX) |
90 | rpool->num_max_cnt--; |
91 | } |
92 | rpool->resources[index].max = new_max; |
93 | } |
94 | |
95 | static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool) |
96 | { |
97 | int i; |
98 | |
99 | for (i = 0; i < RDMACG_RESOURCE_MAX; i++) |
100 | set_resource_limit(rpool, index: i, S32_MAX); |
101 | } |
102 | |
103 | static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool) |
104 | { |
105 | lockdep_assert_held(&rdmacg_mutex); |
106 | |
107 | list_del(entry: &rpool->cg_node); |
108 | list_del(entry: &rpool->dev_node); |
109 | kfree(objp: rpool); |
110 | } |
111 | |
112 | static struct rdmacg_resource_pool * |
113 | find_cg_rpool_locked(struct rdma_cgroup *cg, |
114 | struct rdmacg_device *device) |
115 | |
116 | { |
117 | struct rdmacg_resource_pool *pool; |
118 | |
119 | lockdep_assert_held(&rdmacg_mutex); |
120 | |
121 | list_for_each_entry(pool, &cg->rpools, cg_node) |
122 | if (pool->device == device) |
123 | return pool; |
124 | |
125 | return NULL; |
126 | } |
127 | |
128 | static struct rdmacg_resource_pool * |
129 | get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device) |
130 | { |
131 | struct rdmacg_resource_pool *rpool; |
132 | |
133 | rpool = find_cg_rpool_locked(cg, device); |
134 | if (rpool) |
135 | return rpool; |
136 | |
137 | rpool = kzalloc(size: sizeof(*rpool), GFP_KERNEL); |
138 | if (!rpool) |
139 | return ERR_PTR(error: -ENOMEM); |
140 | |
141 | rpool->device = device; |
142 | set_all_resource_max_limit(rpool); |
143 | |
144 | INIT_LIST_HEAD(list: &rpool->cg_node); |
145 | INIT_LIST_HEAD(list: &rpool->dev_node); |
146 | list_add_tail(new: &rpool->cg_node, head: &cg->rpools); |
147 | list_add_tail(new: &rpool->dev_node, head: &device->rpools); |
148 | return rpool; |
149 | } |
150 | |
151 | /** |
152 | * uncharge_cg_locked - uncharge resource for rdma cgroup |
153 | * @cg: pointer to cg to uncharge and all parents in hierarchy |
154 | * @device: pointer to rdmacg device |
155 | * @index: index of the resource to uncharge in cg (resource pool) |
156 | * |
157 | * It also frees the resource pool which was created as part of |
158 | * charging operation when there are no resources attached to |
159 | * resource pool. |
160 | */ |
161 | static void |
162 | uncharge_cg_locked(struct rdma_cgroup *cg, |
163 | struct rdmacg_device *device, |
164 | enum rdmacg_resource_type index) |
165 | { |
166 | struct rdmacg_resource_pool *rpool; |
167 | |
168 | rpool = find_cg_rpool_locked(cg, device); |
169 | |
170 | /* |
171 | * rpool cannot be null at this stage. Let kernel operate in case |
172 | * if there a bug in IB stack or rdma controller, instead of crashing |
173 | * the system. |
174 | */ |
175 | if (unlikely(!rpool)) { |
176 | pr_warn("Invalid device %p or rdma cgroup %p\n" , cg, device); |
177 | return; |
178 | } |
179 | |
180 | rpool->resources[index].usage--; |
181 | |
182 | /* |
183 | * A negative count (or overflow) is invalid, |
184 | * it indicates a bug in the rdma controller. |
185 | */ |
186 | WARN_ON_ONCE(rpool->resources[index].usage < 0); |
187 | rpool->usage_sum--; |
188 | if (rpool->usage_sum == 0 && |
189 | rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { |
190 | /* |
191 | * No user of the rpool and all entries are set to max, so |
192 | * safe to delete this rpool. |
193 | */ |
194 | free_cg_rpool_locked(rpool); |
195 | } |
196 | } |
197 | |
198 | /** |
199 | * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count |
200 | * @cg: pointer to cg to uncharge and all parents in hierarchy |
201 | * @device: pointer to rdmacg device |
202 | * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup |
203 | * stop uncharging |
204 | * @index: index of the resource to uncharge in cg in given resource pool |
205 | */ |
206 | static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg, |
207 | struct rdmacg_device *device, |
208 | struct rdma_cgroup *stop_cg, |
209 | enum rdmacg_resource_type index) |
210 | { |
211 | struct rdma_cgroup *p; |
212 | |
213 | mutex_lock(&rdmacg_mutex); |
214 | |
215 | for (p = cg; p != stop_cg; p = parent_rdmacg(cg: p)) |
216 | uncharge_cg_locked(cg: p, device, index); |
217 | |
218 | mutex_unlock(lock: &rdmacg_mutex); |
219 | |
220 | css_put(css: &cg->css); |
221 | } |
222 | |
223 | /** |
224 | * rdmacg_uncharge - hierarchically uncharge rdma resource count |
225 | * @cg: pointer to cg to uncharge and all parents in hierarchy |
226 | * @device: pointer to rdmacg device |
227 | * @index: index of the resource to uncharge in cgroup in given resource pool |
228 | */ |
229 | void rdmacg_uncharge(struct rdma_cgroup *cg, |
230 | struct rdmacg_device *device, |
231 | enum rdmacg_resource_type index) |
232 | { |
233 | if (index >= RDMACG_RESOURCE_MAX) |
234 | return; |
235 | |
236 | rdmacg_uncharge_hierarchy(cg, device, NULL, index); |
237 | } |
238 | EXPORT_SYMBOL(rdmacg_uncharge); |
239 | |
240 | /** |
241 | * rdmacg_try_charge - hierarchically try to charge the rdma resource |
242 | * @rdmacg: pointer to rdma cgroup which will own this resource |
243 | * @device: pointer to rdmacg device |
244 | * @index: index of the resource to charge in cgroup (resource pool) |
245 | * |
246 | * This function follows charging resource in hierarchical way. |
247 | * It will fail if the charge would cause the new value to exceed the |
248 | * hierarchical limit. |
249 | * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. |
250 | * Returns pointer to rdmacg for this resource when charging is successful. |
251 | * |
252 | * Charger needs to account resources on two criteria. |
253 | * (a) per cgroup & (b) per device resource usage. |
254 | * Per cgroup resource usage ensures that tasks of cgroup doesn't cross |
255 | * the configured limits. Per device provides granular configuration |
256 | * in multi device usage. It allocates resource pool in the hierarchy |
257 | * for each parent it come across for first resource. Later on resource |
258 | * pool will be available. Therefore it will be much faster thereon |
259 | * to charge/uncharge. |
260 | */ |
261 | int rdmacg_try_charge(struct rdma_cgroup **rdmacg, |
262 | struct rdmacg_device *device, |
263 | enum rdmacg_resource_type index) |
264 | { |
265 | struct rdma_cgroup *cg, *p; |
266 | struct rdmacg_resource_pool *rpool; |
267 | s64 new; |
268 | int ret = 0; |
269 | |
270 | if (index >= RDMACG_RESOURCE_MAX) |
271 | return -EINVAL; |
272 | |
273 | /* |
274 | * hold on to css, as cgroup can be removed but resource |
275 | * accounting happens on css. |
276 | */ |
277 | cg = get_current_rdmacg(); |
278 | |
279 | mutex_lock(&rdmacg_mutex); |
280 | for (p = cg; p; p = parent_rdmacg(cg: p)) { |
281 | rpool = get_cg_rpool_locked(cg: p, device); |
282 | if (IS_ERR(ptr: rpool)) { |
283 | ret = PTR_ERR(ptr: rpool); |
284 | goto err; |
285 | } else { |
286 | new = rpool->resources[index].usage + 1; |
287 | if (new > rpool->resources[index].max) { |
288 | ret = -EAGAIN; |
289 | goto err; |
290 | } else { |
291 | rpool->resources[index].usage = new; |
292 | rpool->usage_sum++; |
293 | } |
294 | } |
295 | } |
296 | mutex_unlock(lock: &rdmacg_mutex); |
297 | |
298 | *rdmacg = cg; |
299 | return 0; |
300 | |
301 | err: |
302 | mutex_unlock(lock: &rdmacg_mutex); |
303 | rdmacg_uncharge_hierarchy(cg, device, stop_cg: p, index); |
304 | return ret; |
305 | } |
306 | EXPORT_SYMBOL(rdmacg_try_charge); |
307 | |
308 | /** |
309 | * rdmacg_register_device - register rdmacg device to rdma controller. |
310 | * @device: pointer to rdmacg device whose resources need to be accounted. |
311 | * |
312 | * If IB stack wish a device to participate in rdma cgroup resource |
313 | * tracking, it must invoke this API to register with rdma cgroup before |
314 | * any user space application can start using the RDMA resources. |
315 | */ |
316 | void rdmacg_register_device(struct rdmacg_device *device) |
317 | { |
318 | INIT_LIST_HEAD(list: &device->dev_node); |
319 | INIT_LIST_HEAD(list: &device->rpools); |
320 | |
321 | mutex_lock(&rdmacg_mutex); |
322 | list_add_tail(new: &device->dev_node, head: &rdmacg_devices); |
323 | mutex_unlock(lock: &rdmacg_mutex); |
324 | } |
325 | EXPORT_SYMBOL(rdmacg_register_device); |
326 | |
327 | /** |
328 | * rdmacg_unregister_device - unregister rdmacg device from rdma controller. |
329 | * @device: pointer to rdmacg device which was previously registered with rdma |
330 | * controller using rdmacg_register_device(). |
331 | * |
332 | * IB stack must invoke this after all the resources of the IB device |
333 | * are destroyed and after ensuring that no more resources will be created |
334 | * when this API is invoked. |
335 | */ |
336 | void rdmacg_unregister_device(struct rdmacg_device *device) |
337 | { |
338 | struct rdmacg_resource_pool *rpool, *tmp; |
339 | |
340 | /* |
341 | * Synchronize with any active resource settings, |
342 | * usage query happening via configfs. |
343 | */ |
344 | mutex_lock(&rdmacg_mutex); |
345 | list_del_init(entry: &device->dev_node); |
346 | |
347 | /* |
348 | * Now that this device is off the cgroup list, its safe to free |
349 | * all the rpool resources. |
350 | */ |
351 | list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node) |
352 | free_cg_rpool_locked(rpool); |
353 | |
354 | mutex_unlock(lock: &rdmacg_mutex); |
355 | } |
356 | EXPORT_SYMBOL(rdmacg_unregister_device); |
357 | |
358 | static int parse_resource(char *c, int *intval) |
359 | { |
360 | substring_t argstr; |
361 | char *name, *value = c; |
362 | size_t len; |
363 | int ret, i; |
364 | |
365 | name = strsep(&value, "=" ); |
366 | if (!name || !value) |
367 | return -EINVAL; |
368 | |
369 | i = match_string(array: rdmacg_resource_names, n: RDMACG_RESOURCE_MAX, string: name); |
370 | if (i < 0) |
371 | return i; |
372 | |
373 | len = strlen(value); |
374 | |
375 | argstr.from = value; |
376 | argstr.to = value + len; |
377 | |
378 | ret = match_int(&argstr, result: intval); |
379 | if (ret >= 0) { |
380 | if (*intval < 0) |
381 | return -EINVAL; |
382 | return i; |
383 | } |
384 | if (strncmp(value, RDMACG_MAX_STR, len) == 0) { |
385 | *intval = S32_MAX; |
386 | return i; |
387 | } |
388 | return -EINVAL; |
389 | } |
390 | |
391 | static int rdmacg_parse_limits(char *options, |
392 | int *new_limits, unsigned long *enables) |
393 | { |
394 | char *c; |
395 | int err = -EINVAL; |
396 | |
397 | /* parse resource options */ |
398 | while ((c = strsep(&options, " " )) != NULL) { |
399 | int index, intval; |
400 | |
401 | index = parse_resource(c, intval: &intval); |
402 | if (index < 0) |
403 | goto err; |
404 | |
405 | new_limits[index] = intval; |
406 | *enables |= BIT(index); |
407 | } |
408 | return 0; |
409 | |
410 | err: |
411 | return err; |
412 | } |
413 | |
414 | static struct rdmacg_device *rdmacg_get_device_locked(const char *name) |
415 | { |
416 | struct rdmacg_device *device; |
417 | |
418 | lockdep_assert_held(&rdmacg_mutex); |
419 | |
420 | list_for_each_entry(device, &rdmacg_devices, dev_node) |
421 | if (!strcmp(name, device->name)) |
422 | return device; |
423 | |
424 | return NULL; |
425 | } |
426 | |
427 | static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of, |
428 | char *buf, size_t nbytes, loff_t off) |
429 | { |
430 | struct rdma_cgroup *cg = css_rdmacg(css: of_css(of)); |
431 | const char *dev_name; |
432 | struct rdmacg_resource_pool *rpool; |
433 | struct rdmacg_device *device; |
434 | char *options = strstrip(str: buf); |
435 | int *new_limits; |
436 | unsigned long enables = 0; |
437 | int i = 0, ret = 0; |
438 | |
439 | /* extract the device name first */ |
440 | dev_name = strsep(&options, " " ); |
441 | if (!dev_name) { |
442 | ret = -EINVAL; |
443 | goto err; |
444 | } |
445 | |
446 | new_limits = kcalloc(n: RDMACG_RESOURCE_MAX, size: sizeof(int), GFP_KERNEL); |
447 | if (!new_limits) { |
448 | ret = -ENOMEM; |
449 | goto err; |
450 | } |
451 | |
452 | ret = rdmacg_parse_limits(options, new_limits, enables: &enables); |
453 | if (ret) |
454 | goto parse_err; |
455 | |
456 | /* acquire lock to synchronize with hot plug devices */ |
457 | mutex_lock(&rdmacg_mutex); |
458 | |
459 | device = rdmacg_get_device_locked(name: dev_name); |
460 | if (!device) { |
461 | ret = -ENODEV; |
462 | goto dev_err; |
463 | } |
464 | |
465 | rpool = get_cg_rpool_locked(cg, device); |
466 | if (IS_ERR(ptr: rpool)) { |
467 | ret = PTR_ERR(ptr: rpool); |
468 | goto dev_err; |
469 | } |
470 | |
471 | /* now set the new limits of the rpool */ |
472 | for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX) |
473 | set_resource_limit(rpool, index: i, new_max: new_limits[i]); |
474 | |
475 | if (rpool->usage_sum == 0 && |
476 | rpool->num_max_cnt == RDMACG_RESOURCE_MAX) { |
477 | /* |
478 | * No user of the rpool and all entries are set to max, so |
479 | * safe to delete this rpool. |
480 | */ |
481 | free_cg_rpool_locked(rpool); |
482 | } |
483 | |
484 | dev_err: |
485 | mutex_unlock(lock: &rdmacg_mutex); |
486 | |
487 | parse_err: |
488 | kfree(objp: new_limits); |
489 | |
490 | err: |
491 | return ret ?: nbytes; |
492 | } |
493 | |
494 | static void print_rpool_values(struct seq_file *sf, |
495 | struct rdmacg_resource_pool *rpool) |
496 | { |
497 | enum rdmacg_file_type sf_type; |
498 | int i; |
499 | u32 value; |
500 | |
501 | sf_type = seq_cft(seq: sf)->private; |
502 | |
503 | for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { |
504 | seq_puts(m: sf, s: rdmacg_resource_names[i]); |
505 | seq_putc(m: sf, c: '='); |
506 | if (sf_type == RDMACG_RESOURCE_TYPE_MAX) { |
507 | if (rpool) |
508 | value = rpool->resources[i].max; |
509 | else |
510 | value = S32_MAX; |
511 | } else { |
512 | if (rpool) |
513 | value = rpool->resources[i].usage; |
514 | else |
515 | value = 0; |
516 | } |
517 | |
518 | if (value == S32_MAX) |
519 | seq_puts(m: sf, RDMACG_MAX_STR); |
520 | else |
521 | seq_printf(m: sf, fmt: "%d" , value); |
522 | seq_putc(m: sf, c: ' '); |
523 | } |
524 | } |
525 | |
526 | static int rdmacg_resource_read(struct seq_file *sf, void *v) |
527 | { |
528 | struct rdmacg_device *device; |
529 | struct rdmacg_resource_pool *rpool; |
530 | struct rdma_cgroup *cg = css_rdmacg(css: seq_css(seq: sf)); |
531 | |
532 | mutex_lock(&rdmacg_mutex); |
533 | |
534 | list_for_each_entry(device, &rdmacg_devices, dev_node) { |
535 | seq_printf(m: sf, fmt: "%s " , device->name); |
536 | |
537 | rpool = find_cg_rpool_locked(cg, device); |
538 | print_rpool_values(sf, rpool); |
539 | |
540 | seq_putc(m: sf, c: '\n'); |
541 | } |
542 | |
543 | mutex_unlock(lock: &rdmacg_mutex); |
544 | return 0; |
545 | } |
546 | |
547 | static struct cftype rdmacg_files[] = { |
548 | { |
549 | .name = "max" , |
550 | .write = rdmacg_resource_set_max, |
551 | .seq_show = rdmacg_resource_read, |
552 | .private = RDMACG_RESOURCE_TYPE_MAX, |
553 | .flags = CFTYPE_NOT_ON_ROOT, |
554 | }, |
555 | { |
556 | .name = "current" , |
557 | .seq_show = rdmacg_resource_read, |
558 | .private = RDMACG_RESOURCE_TYPE_STAT, |
559 | .flags = CFTYPE_NOT_ON_ROOT, |
560 | }, |
561 | { } /* terminate */ |
562 | }; |
563 | |
564 | static struct cgroup_subsys_state * |
565 | rdmacg_css_alloc(struct cgroup_subsys_state *parent) |
566 | { |
567 | struct rdma_cgroup *cg; |
568 | |
569 | cg = kzalloc(size: sizeof(*cg), GFP_KERNEL); |
570 | if (!cg) |
571 | return ERR_PTR(error: -ENOMEM); |
572 | |
573 | INIT_LIST_HEAD(list: &cg->rpools); |
574 | return &cg->css; |
575 | } |
576 | |
/* css_free callback: free the rdma cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
583 | |
584 | /** |
585 | * rdmacg_css_offline - cgroup css_offline callback |
586 | * @css: css of interest |
587 | * |
588 | * This function is called when @css is about to go away and responsible |
589 | * for shooting down all rdmacg associated with @css. As part of that it |
590 | * marks all the resource pool entries to max value, so that when resources are |
591 | * uncharged, associated resource pool can be freed as well. |
592 | */ |
593 | static void rdmacg_css_offline(struct cgroup_subsys_state *css) |
594 | { |
595 | struct rdma_cgroup *cg = css_rdmacg(css); |
596 | struct rdmacg_resource_pool *rpool; |
597 | |
598 | mutex_lock(&rdmacg_mutex); |
599 | |
600 | list_for_each_entry(rpool, &cg->rpools, cg_node) |
601 | set_all_resource_max_limit(rpool); |
602 | |
603 | mutex_unlock(lock: &rdmacg_mutex); |
604 | } |
605 | |
606 | struct cgroup_subsys rdma_cgrp_subsys = { |
607 | .css_alloc = rdmacg_css_alloc, |
608 | .css_free = rdmacg_css_free, |
609 | .css_offline = rdmacg_css_offline, |
610 | .legacy_cftypes = rdmacg_files, |
611 | .dfl_cftypes = rdmacg_files, |
612 | }; |
613 | |