1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * RDMA resource limiting controller for cgroups.
4 *
5 * Used to allow a cgroup hierarchy to stop processes from consuming
6 * additional RDMA resources after a certain limit is reached.
7 *
8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
9 */
10
11#include <linux/bitops.h>
12#include <linux/slab.h>
13#include <linux/seq_file.h>
14#include <linux/cgroup.h>
15#include <linux/parser.h>
16#include <linux/cgroup_rdma.h>
17
/* Token written and read through the cgroup files for an unlimited resource. */
#define RDMACG_MAX_STR "max"

/*
 * Serializes access to the per-cgroup resource pool lists and to the
 * list of registered rdma devices.
 */
static DEFINE_MUTEX(rdmacg_mutex);
static LIST_HEAD(rdmacg_devices);
26
/* Stored in cftype->private to select what a cgroup file reports. */
enum rdmacg_file_type {
	RDMACG_RESOURCE_TYPE_MAX,	/* configured limits */
	RDMACG_RESOURCE_TYPE_STAT,	/* current usage */
};
31
32/*
33 * resource table definition as to be seen by the user.
34 * Need to add entries to it when more resources are
35 * added/defined at IB verb/core layer.
36 */
37static char const *rdmacg_resource_names[] = {
38 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
39 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
40};
41
/* Limit and current charge count of one resource type within a pool. */
struct rdmacg_resource {
	int max;	/* configured limit; S32_MAX stands for "max" */
	int usage;	/* number of charges currently held */
};
47
48/*
49 * resource pool object which represents per cgroup, per device
50 * resources. There are multiple instances of this object per cgroup,
51 * therefore it cannot be embedded within rdma_cgroup structure. It
52 * is maintained as list.
53 */
54struct rdmacg_resource_pool {
55 struct rdmacg_device *device;
56 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
57
58 struct list_head cg_node;
59 struct list_head dev_node;
60
61 /* count active user tasks of this pool */
62 u64 usage_sum;
63 /* total number counts which are set to max */
64 int num_max_cnt;
65};
66
67static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
68{
69 return container_of(css, struct rdma_cgroup, css);
70}
71
72static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
73{
74 return css_rdmacg(css: cg->css.parent);
75}
76
77static inline struct rdma_cgroup *get_current_rdmacg(void)
78{
79 return css_rdmacg(css: task_get_css(current, subsys_id: rdma_cgrp_id));
80}
81
82static void set_resource_limit(struct rdmacg_resource_pool *rpool,
83 int index, int new_max)
84{
85 if (new_max == S32_MAX) {
86 if (rpool->resources[index].max != S32_MAX)
87 rpool->num_max_cnt++;
88 } else {
89 if (rpool->resources[index].max == S32_MAX)
90 rpool->num_max_cnt--;
91 }
92 rpool->resources[index].max = new_max;
93}
94
95static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
96{
97 int i;
98
99 for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
100 set_resource_limit(rpool, index: i, S32_MAX);
101}
102
103static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
104{
105 lockdep_assert_held(&rdmacg_mutex);
106
107 list_del(entry: &rpool->cg_node);
108 list_del(entry: &rpool->dev_node);
109 kfree(objp: rpool);
110}
111
112static struct rdmacg_resource_pool *
113find_cg_rpool_locked(struct rdma_cgroup *cg,
114 struct rdmacg_device *device)
115
116{
117 struct rdmacg_resource_pool *pool;
118
119 lockdep_assert_held(&rdmacg_mutex);
120
121 list_for_each_entry(pool, &cg->rpools, cg_node)
122 if (pool->device == device)
123 return pool;
124
125 return NULL;
126}
127
128static struct rdmacg_resource_pool *
129get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
130{
131 struct rdmacg_resource_pool *rpool;
132
133 rpool = find_cg_rpool_locked(cg, device);
134 if (rpool)
135 return rpool;
136
137 rpool = kzalloc(size: sizeof(*rpool), GFP_KERNEL);
138 if (!rpool)
139 return ERR_PTR(error: -ENOMEM);
140
141 rpool->device = device;
142 set_all_resource_max_limit(rpool);
143
144 INIT_LIST_HEAD(list: &rpool->cg_node);
145 INIT_LIST_HEAD(list: &rpool->dev_node);
146 list_add_tail(new: &rpool->cg_node, head: &cg->rpools);
147 list_add_tail(new: &rpool->dev_node, head: &device->rpools);
148 return rpool;
149}
150
151/**
152 * uncharge_cg_locked - uncharge resource for rdma cgroup
153 * @cg: pointer to cg to uncharge and all parents in hierarchy
154 * @device: pointer to rdmacg device
155 * @index: index of the resource to uncharge in cg (resource pool)
156 *
157 * It also frees the resource pool which was created as part of
158 * charging operation when there are no resources attached to
159 * resource pool.
160 */
161static void
162uncharge_cg_locked(struct rdma_cgroup *cg,
163 struct rdmacg_device *device,
164 enum rdmacg_resource_type index)
165{
166 struct rdmacg_resource_pool *rpool;
167
168 rpool = find_cg_rpool_locked(cg, device);
169
170 /*
171 * rpool cannot be null at this stage. Let kernel operate in case
172 * if there a bug in IB stack or rdma controller, instead of crashing
173 * the system.
174 */
175 if (unlikely(!rpool)) {
176 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
177 return;
178 }
179
180 rpool->resources[index].usage--;
181
182 /*
183 * A negative count (or overflow) is invalid,
184 * it indicates a bug in the rdma controller.
185 */
186 WARN_ON_ONCE(rpool->resources[index].usage < 0);
187 rpool->usage_sum--;
188 if (rpool->usage_sum == 0 &&
189 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
190 /*
191 * No user of the rpool and all entries are set to max, so
192 * safe to delete this rpool.
193 */
194 free_cg_rpool_locked(rpool);
195 }
196}
197
198/**
199 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
200 * @cg: pointer to cg to uncharge and all parents in hierarchy
201 * @device: pointer to rdmacg device
202 * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
203 * stop uncharging
204 * @index: index of the resource to uncharge in cg in given resource pool
205 */
206static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
207 struct rdmacg_device *device,
208 struct rdma_cgroup *stop_cg,
209 enum rdmacg_resource_type index)
210{
211 struct rdma_cgroup *p;
212
213 mutex_lock(&rdmacg_mutex);
214
215 for (p = cg; p != stop_cg; p = parent_rdmacg(cg: p))
216 uncharge_cg_locked(cg: p, device, index);
217
218 mutex_unlock(lock: &rdmacg_mutex);
219
220 css_put(css: &cg->css);
221}
222
223/**
224 * rdmacg_uncharge - hierarchically uncharge rdma resource count
225 * @cg: pointer to cg to uncharge and all parents in hierarchy
226 * @device: pointer to rdmacg device
227 * @index: index of the resource to uncharge in cgroup in given resource pool
228 */
229void rdmacg_uncharge(struct rdma_cgroup *cg,
230 struct rdmacg_device *device,
231 enum rdmacg_resource_type index)
232{
233 if (index >= RDMACG_RESOURCE_MAX)
234 return;
235
236 rdmacg_uncharge_hierarchy(cg, device, NULL, index);
237}
238EXPORT_SYMBOL(rdmacg_uncharge);
239
240/**
241 * rdmacg_try_charge - hierarchically try to charge the rdma resource
242 * @rdmacg: pointer to rdma cgroup which will own this resource
243 * @device: pointer to rdmacg device
244 * @index: index of the resource to charge in cgroup (resource pool)
245 *
246 * This function follows charging resource in hierarchical way.
247 * It will fail if the charge would cause the new value to exceed the
248 * hierarchical limit.
249 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
250 * Returns pointer to rdmacg for this resource when charging is successful.
251 *
252 * Charger needs to account resources on two criteria.
253 * (a) per cgroup & (b) per device resource usage.
254 * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
255 * the configured limits. Per device provides granular configuration
256 * in multi device usage. It allocates resource pool in the hierarchy
257 * for each parent it come across for first resource. Later on resource
258 * pool will be available. Therefore it will be much faster thereon
259 * to charge/uncharge.
260 */
261int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
262 struct rdmacg_device *device,
263 enum rdmacg_resource_type index)
264{
265 struct rdma_cgroup *cg, *p;
266 struct rdmacg_resource_pool *rpool;
267 s64 new;
268 int ret = 0;
269
270 if (index >= RDMACG_RESOURCE_MAX)
271 return -EINVAL;
272
273 /*
274 * hold on to css, as cgroup can be removed but resource
275 * accounting happens on css.
276 */
277 cg = get_current_rdmacg();
278
279 mutex_lock(&rdmacg_mutex);
280 for (p = cg; p; p = parent_rdmacg(cg: p)) {
281 rpool = get_cg_rpool_locked(cg: p, device);
282 if (IS_ERR(ptr: rpool)) {
283 ret = PTR_ERR(ptr: rpool);
284 goto err;
285 } else {
286 new = rpool->resources[index].usage + 1;
287 if (new > rpool->resources[index].max) {
288 ret = -EAGAIN;
289 goto err;
290 } else {
291 rpool->resources[index].usage = new;
292 rpool->usage_sum++;
293 }
294 }
295 }
296 mutex_unlock(lock: &rdmacg_mutex);
297
298 *rdmacg = cg;
299 return 0;
300
301err:
302 mutex_unlock(lock: &rdmacg_mutex);
303 rdmacg_uncharge_hierarchy(cg, device, stop_cg: p, index);
304 return ret;
305}
306EXPORT_SYMBOL(rdmacg_try_charge);
307
308/**
309 * rdmacg_register_device - register rdmacg device to rdma controller.
310 * @device: pointer to rdmacg device whose resources need to be accounted.
311 *
312 * If IB stack wish a device to participate in rdma cgroup resource
313 * tracking, it must invoke this API to register with rdma cgroup before
314 * any user space application can start using the RDMA resources.
315 */
316void rdmacg_register_device(struct rdmacg_device *device)
317{
318 INIT_LIST_HEAD(list: &device->dev_node);
319 INIT_LIST_HEAD(list: &device->rpools);
320
321 mutex_lock(&rdmacg_mutex);
322 list_add_tail(new: &device->dev_node, head: &rdmacg_devices);
323 mutex_unlock(lock: &rdmacg_mutex);
324}
325EXPORT_SYMBOL(rdmacg_register_device);
326
327/**
328 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
329 * @device: pointer to rdmacg device which was previously registered with rdma
330 * controller using rdmacg_register_device().
331 *
332 * IB stack must invoke this after all the resources of the IB device
333 * are destroyed and after ensuring that no more resources will be created
334 * when this API is invoked.
335 */
336void rdmacg_unregister_device(struct rdmacg_device *device)
337{
338 struct rdmacg_resource_pool *rpool, *tmp;
339
340 /*
341 * Synchronize with any active resource settings,
342 * usage query happening via configfs.
343 */
344 mutex_lock(&rdmacg_mutex);
345 list_del_init(entry: &device->dev_node);
346
347 /*
348 * Now that this device is off the cgroup list, its safe to free
349 * all the rpool resources.
350 */
351 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
352 free_cg_rpool_locked(rpool);
353
354 mutex_unlock(lock: &rdmacg_mutex);
355}
356EXPORT_SYMBOL(rdmacg_unregister_device);
357
358static int parse_resource(char *c, int *intval)
359{
360 substring_t argstr;
361 char *name, *value = c;
362 size_t len;
363 int ret, i;
364
365 name = strsep(&value, "=");
366 if (!name || !value)
367 return -EINVAL;
368
369 i = match_string(array: rdmacg_resource_names, n: RDMACG_RESOURCE_MAX, string: name);
370 if (i < 0)
371 return i;
372
373 len = strlen(value);
374
375 argstr.from = value;
376 argstr.to = value + len;
377
378 ret = match_int(&argstr, result: intval);
379 if (ret >= 0) {
380 if (*intval < 0)
381 return -EINVAL;
382 return i;
383 }
384 if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
385 *intval = S32_MAX;
386 return i;
387 }
388 return -EINVAL;
389}
390
391static int rdmacg_parse_limits(char *options,
392 int *new_limits, unsigned long *enables)
393{
394 char *c;
395 int err = -EINVAL;
396
397 /* parse resource options */
398 while ((c = strsep(&options, " ")) != NULL) {
399 int index, intval;
400
401 index = parse_resource(c, intval: &intval);
402 if (index < 0)
403 goto err;
404
405 new_limits[index] = intval;
406 *enables |= BIT(index);
407 }
408 return 0;
409
410err:
411 return err;
412}
413
414static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
415{
416 struct rdmacg_device *device;
417
418 lockdep_assert_held(&rdmacg_mutex);
419
420 list_for_each_entry(device, &rdmacg_devices, dev_node)
421 if (!strcmp(name, device->name))
422 return device;
423
424 return NULL;
425}
426
427static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
428 char *buf, size_t nbytes, loff_t off)
429{
430 struct rdma_cgroup *cg = css_rdmacg(css: of_css(of));
431 const char *dev_name;
432 struct rdmacg_resource_pool *rpool;
433 struct rdmacg_device *device;
434 char *options = strstrip(str: buf);
435 int *new_limits;
436 unsigned long enables = 0;
437 int i = 0, ret = 0;
438
439 /* extract the device name first */
440 dev_name = strsep(&options, " ");
441 if (!dev_name) {
442 ret = -EINVAL;
443 goto err;
444 }
445
446 new_limits = kcalloc(n: RDMACG_RESOURCE_MAX, size: sizeof(int), GFP_KERNEL);
447 if (!new_limits) {
448 ret = -ENOMEM;
449 goto err;
450 }
451
452 ret = rdmacg_parse_limits(options, new_limits, enables: &enables);
453 if (ret)
454 goto parse_err;
455
456 /* acquire lock to synchronize with hot plug devices */
457 mutex_lock(&rdmacg_mutex);
458
459 device = rdmacg_get_device_locked(name: dev_name);
460 if (!device) {
461 ret = -ENODEV;
462 goto dev_err;
463 }
464
465 rpool = get_cg_rpool_locked(cg, device);
466 if (IS_ERR(ptr: rpool)) {
467 ret = PTR_ERR(ptr: rpool);
468 goto dev_err;
469 }
470
471 /* now set the new limits of the rpool */
472 for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
473 set_resource_limit(rpool, index: i, new_max: new_limits[i]);
474
475 if (rpool->usage_sum == 0 &&
476 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
477 /*
478 * No user of the rpool and all entries are set to max, so
479 * safe to delete this rpool.
480 */
481 free_cg_rpool_locked(rpool);
482 }
483
484dev_err:
485 mutex_unlock(lock: &rdmacg_mutex);
486
487parse_err:
488 kfree(objp: new_limits);
489
490err:
491 return ret ?: nbytes;
492}
493
494static void print_rpool_values(struct seq_file *sf,
495 struct rdmacg_resource_pool *rpool)
496{
497 enum rdmacg_file_type sf_type;
498 int i;
499 u32 value;
500
501 sf_type = seq_cft(seq: sf)->private;
502
503 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
504 seq_puts(m: sf, s: rdmacg_resource_names[i]);
505 seq_putc(m: sf, c: '=');
506 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
507 if (rpool)
508 value = rpool->resources[i].max;
509 else
510 value = S32_MAX;
511 } else {
512 if (rpool)
513 value = rpool->resources[i].usage;
514 else
515 value = 0;
516 }
517
518 if (value == S32_MAX)
519 seq_puts(m: sf, RDMACG_MAX_STR);
520 else
521 seq_printf(m: sf, fmt: "%d", value);
522 seq_putc(m: sf, c: ' ');
523 }
524}
525
526static int rdmacg_resource_read(struct seq_file *sf, void *v)
527{
528 struct rdmacg_device *device;
529 struct rdmacg_resource_pool *rpool;
530 struct rdma_cgroup *cg = css_rdmacg(css: seq_css(seq: sf));
531
532 mutex_lock(&rdmacg_mutex);
533
534 list_for_each_entry(device, &rdmacg_devices, dev_node) {
535 seq_printf(m: sf, fmt: "%s ", device->name);
536
537 rpool = find_cg_rpool_locked(cg, device);
538 print_rpool_values(sf, rpool);
539
540 seq_putc(m: sf, c: '\n');
541 }
542
543 mutex_unlock(lock: &rdmacg_mutex);
544 return 0;
545}
546
547static struct cftype rdmacg_files[] = {
548 {
549 .name = "max",
550 .write = rdmacg_resource_set_max,
551 .seq_show = rdmacg_resource_read,
552 .private = RDMACG_RESOURCE_TYPE_MAX,
553 .flags = CFTYPE_NOT_ON_ROOT,
554 },
555 {
556 .name = "current",
557 .seq_show = rdmacg_resource_read,
558 .private = RDMACG_RESOURCE_TYPE_STAT,
559 .flags = CFTYPE_NOT_ON_ROOT,
560 },
561 { } /* terminate */
562};
563
564static struct cgroup_subsys_state *
565rdmacg_css_alloc(struct cgroup_subsys_state *parent)
566{
567 struct rdma_cgroup *cg;
568
569 cg = kzalloc(size: sizeof(*cg), GFP_KERNEL);
570 if (!cg)
571 return ERR_PTR(error: -ENOMEM);
572
573 INIT_LIST_HEAD(list: &cg->rpools);
574 return &cg->css;
575}
576
/* css_free callback: release the rdma_cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
583
584/**
585 * rdmacg_css_offline - cgroup css_offline callback
586 * @css: css of interest
587 *
588 * This function is called when @css is about to go away and responsible
589 * for shooting down all rdmacg associated with @css. As part of that it
590 * marks all the resource pool entries to max value, so that when resources are
591 * uncharged, associated resource pool can be freed as well.
592 */
593static void rdmacg_css_offline(struct cgroup_subsys_state *css)
594{
595 struct rdma_cgroup *cg = css_rdmacg(css);
596 struct rdmacg_resource_pool *rpool;
597
598 mutex_lock(&rdmacg_mutex);
599
600 list_for_each_entry(rpool, &cg->rpools, cg_node)
601 set_all_resource_max_limit(rpool);
602
603 mutex_unlock(lock: &rdmacg_mutex);
604}
605
606struct cgroup_subsys rdma_cgrp_subsys = {
607 .css_alloc = rdmacg_css_alloc,
608 .css_free = rdmacg_css_free,
609 .css_offline = rdmacg_css_offline,
610 .legacy_cftypes = rdmacg_files,
611 .dfl_cftypes = rdmacg_files,
612};
613

source code of linux/kernel/cgroup/rdma.c