1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | #include "cgroup-internal.h" |
3 | |
4 | #include <linux/ctype.h> |
5 | #include <linux/kmod.h> |
6 | #include <linux/sort.h> |
7 | #include <linux/delay.h> |
8 | #include <linux/mm.h> |
9 | #include <linux/sched/signal.h> |
10 | #include <linux/sched/task.h> |
11 | #include <linux/magic.h> |
12 | #include <linux/slab.h> |
13 | #include <linux/vmalloc.h> |
14 | #include <linux/delayacct.h> |
15 | #include <linux/pid_namespace.h> |
16 | #include <linux/cgroupstats.h> |
17 | #include <linux/fs_parser.h> |
18 | |
19 | #include <trace/events/cgroup.h> |
20 | |
21 | /* |
22 | * pidlists linger the following amount before being destroyed. The goal |
23 | * is to avoid frequent destruction in the middle of consecutive read calls. |
24 | * Expiring in the middle is a performance problem, not a correctness one. |
25 | * 1 sec should be enough. |
26 | */ |
27 | #define CGROUP_PIDLIST_DESTROY_DELAY HZ |
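/* HZ is the number of timer ticks (jiffies) per second, so this delay is about one second. */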
28 | |
29 | /* Controllers blocked by the commandline in v1 */ |
30 | static u16 cgroup_no_v1_mask; |
31 | |
32 | /* disable named v1 mounts */ |
33 | static bool cgroup_no_v1_named; |
34 | |
35 | /* |
36 | * pidlist destructions need to be flushed on cgroup destruction. Use a |
37 | * separate workqueue as flush domain. |
38 | */ |
39 | static struct workqueue_struct *cgroup_pidlist_destroy_wq; |
40 | |
41 | /* protects cgroup_subsys->release_agent_path */ |
42 | static DEFINE_SPINLOCK(release_agent_path_lock); |
43 | |
44 | bool cgroup1_ssid_disabled(int ssid) |
45 | { |
46 | return cgroup_no_v1_mask & (1 << ssid); |
47 | } |
48 | |
49 | /** |
50 | * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' |
51 | * @from: attach to all cgroups of a given task |
52 | * @tsk: the task to be attached |
53 | * |
54 | * Return: %0 on success or a negative errno code on failure |
55 | */ |
56 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) |
57 | { |
58 | struct cgroup_root *root; |
59 | int retval = 0; |
60 | |
61 | cgroup_lock(); |
62 | cgroup_attach_lock(lock_threadgroup: true); |
63 | for_each_root(root) { |
64 | struct cgroup *from_cgrp; |
65 | |
66 | spin_lock_irq(lock: &css_set_lock); |
67 | from_cgrp = task_cgroup_from_root(task: from, root); |
68 | spin_unlock_irq(lock: &css_set_lock); |
69 | |
70 | retval = cgroup_attach_task(dst_cgrp: from_cgrp, leader: tsk, threadgroup: false); |
71 | if (retval) |
72 | break; |
73 | } |
74 | cgroup_attach_unlock(lock_threadgroup: true); |
75 | cgroup_unlock(); |
76 | |
77 | return retval; |
78 | } |
79 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
80 | |
81 | /** |
82 | * cgroup_transfer_tasks - move tasks from one cgroup to another |
83 | * @to: cgroup to which the tasks will be moved |
84 | * @from: cgroup in which the tasks currently reside |
85 | * |
86 | * Locking rules between cgroup_post_fork() and the migration path |
87 | * guarantee that, if a task is forking while being migrated, the new child |
88 | * is either visible in the source cgroup after the parent's migration is |
89 | * complete or put into the target cgroup. No task |
90 | * can slip out of migration through forking. |
91 | * |
92 | * Return: %0 on success or a negative errno code on failure |
93 | */ |
94 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) |
95 | { |
96 | DEFINE_CGROUP_MGCTX(mgctx); |
97 | struct cgrp_cset_link *link; |
98 | struct css_task_iter it; |
99 | struct task_struct *task; |
100 | int ret; |
101 | |
102 | if (cgroup_on_dfl(cgrp: to)) |
103 | return -EINVAL; |
104 | |
105 | ret = cgroup_migrate_vet_dst(dst_cgrp: to); |
106 | if (ret) |
107 | return ret; |
108 | |
109 | cgroup_lock(); |
110 | |
111 | cgroup_attach_lock(lock_threadgroup: true); |
112 | |
113 | /* all tasks in @from are being moved, all csets are source */ |
114 | spin_lock_irq(lock: &css_set_lock); |
115 | list_for_each_entry(link, &from->cset_links, cset_link) |
116 | cgroup_migrate_add_src(src_cset: link->cset, dst_cgrp: to, mgctx: &mgctx); |
117 | spin_unlock_irq(lock: &css_set_lock); |
118 | |
119 | ret = cgroup_migrate_prepare_dst(mgctx: &mgctx); |
120 | if (ret) |
121 | goto out_err; |
122 | |
123 | /* |
124 | * Migrate tasks one-by-one until @from is empty. This fails iff |
125 | * ->can_attach() fails. |
126 | */ |
127 | do { |
128 | css_task_iter_start(css: &from->self, flags: 0, it: &it); |
129 | |
130 | do { |
131 | task = css_task_iter_next(it: &it); |
132 | } while (task && (task->flags & PF_EXITING)); |
133 | |
134 | if (task) |
135 | get_task_struct(t: task); |
136 | css_task_iter_end(it: &it); |
137 | |
138 | if (task) { |
139 | ret = cgroup_migrate(leader: task, threadgroup: false, mgctx: &mgctx); |
140 | if (!ret) |
141 | TRACE_CGROUP_PATH(transfer_tasks, to, task, false); |
142 | put_task_struct(t: task); |
143 | } |
144 | } while (task && !ret); |
145 | out_err: |
146 | cgroup_migrate_finish(mgctx: &mgctx); |
147 | cgroup_attach_unlock(lock_threadgroup: true); |
148 | cgroup_unlock(); |
149 | return ret; |
150 | } |
151 | |
152 | /* |
153 | * Stuff for reading the 'tasks'/'procs' files. |
154 | * |
155 | * Reading this file can return large amounts of data if a cgroup has |
156 | * *lots* of attached tasks. So it may need several calls to read(), |
157 | * but we cannot guarantee that the information we produce is correct |
158 | * unless we produce it entirely atomically. |
159 | * |
160 | */ |
161 | |
162 | /* which pidlist file are we talking about? */ |
163 | enum cgroup_filetype { |
164 | CGROUP_FILE_PROCS, |
165 | CGROUP_FILE_TASKS, |
166 | }; |
167 | |
168 | /* |
169 | * A pidlist is a list of pids that virtually represents the contents of one |
170 | * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, |
171 | * a pair (one each for procs, tasks) for each pid namespace that's relevant |
172 | * to the cgroup. |
173 | */ |
174 | struct cgroup_pidlist { |
175 | /* |
176 | * used to find which pidlist is wanted. doesn't change as long as |
177 | * this particular list stays in the list. |
178 | */ |
179 | struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; |
180 | /* array of xids */ |
181 | pid_t *list; |
182 | /* how many elements the above list has */ |
183 | int length; |
184 | /* each of these stored in a list by its cgroup */ |
185 | struct list_head links; |
186 | /* pointer to the cgroup we belong to, for list removal purposes */ |
187 | struct cgroup *owner; |
188 | /* for delayed destruction */ |
189 | struct delayed_work destroy_dwork; |
190 | }; |
191 | |
192 | /* |
193 | * Used to destroy all pidlists still waiting for their destroy timer. None |
194 | * should be left afterwards. |
195 | */ |
196 | void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) |
197 | { |
198 | struct cgroup_pidlist *l, *tmp_l; |
199 | |
200 | mutex_lock(&cgrp->pidlist_mutex); |
201 | list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) |
202 | mod_delayed_work(wq: cgroup_pidlist_destroy_wq, dwork: &l->destroy_dwork, delay: 0); |
203 | mutex_unlock(lock: &cgrp->pidlist_mutex); |
204 | |
205 | flush_workqueue(cgroup_pidlist_destroy_wq); |
206 | BUG_ON(!list_empty(&cgrp->pidlists)); |
207 | } |
208 | |
209 | static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) |
210 | { |
211 | struct delayed_work *dwork = to_delayed_work(work); |
212 | struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, |
213 | destroy_dwork); |
214 | struct cgroup_pidlist *tofree = NULL; |
215 | |
216 | mutex_lock(&l->owner->pidlist_mutex); |
217 | |
218 | /* |
219 | * Destroy iff we didn't get queued again. The state won't change |
220 | * as destroy_dwork can only be queued while locked. |
221 | */ |
222 | if (!delayed_work_pending(dwork)) { |
223 | list_del(entry: &l->links); |
224 | kvfree(addr: l->list); |
225 | put_pid_ns(ns: l->key.ns); |
226 | tofree = l; |
227 | } |
228 | |
229 | mutex_unlock(lock: &l->owner->pidlist_mutex); |
230 | kfree(objp: tofree); |
231 | } |
232 | |
233 | /* |
234 | * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries |
235 | * Returns the number of unique elements. |
236 | */ |
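/*
 * Worked example (illustrative): a sorted input of {3, 3, 5, 7, 7, 7} is
 * compacted in place so that its first three entries become {3, 5, 7},
 * and the function returns 3; entries past the returned count are no
 * longer meaningful.
 */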
237 | static int pidlist_uniq(pid_t *list, int length) |
238 | { |
239 | int src, dest = 1; |
240 | |
241 | /* |
242 | * we presume the 0th element is unique, so src starts at 1. trivial |
243 | * edge cases first; no work needs to be done for either |
244 | */ |
245 | if (length == 0 || length == 1) |
246 | return length; |
247 | /* src and dest walk down the list; dest counts unique elements */ |
248 | for (src = 1; src < length; src++) { |
249 | /* find next unique element */ |
250 | while (list[src] == list[src-1]) { |
251 | src++; |
252 | if (src == length) |
253 | goto after; |
254 | } |
255 | /* dest always points to where the next unique element goes */ |
256 | list[dest] = list[src]; |
257 | dest++; |
258 | } |
259 | after: |
260 | return dest; |
261 | } |
262 | |
263 | /* |
264 | * The two pid files - tasks and cgroup.procs - guarantee that the result |
265 | * is sorted, which forced this whole pidlist fiasco. As pid order is |
266 | * different per namespace, each namespace needs a differently sorted list, |
267 | * making it impossible to use, for example, a single rbtree of member tasks |
268 | * sorted by task pointer. As pidlists can be fairly large, allocating one |
269 | * per open file is dangerous, so cgroup had to implement a shared pool of |
270 | * pidlists keyed by cgroup and namespace. |
271 | */ |
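/*
 * Note: plain subtraction is a safe comparator below because pids are
 * positive and bounded by PID_MAX_LIMIT, far below INT_MAX, so the
 * difference cannot overflow.
 */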
272 | static int cmppid(const void *a, const void *b) |
273 | { |
274 | return *(pid_t *)a - *(pid_t *)b; |
275 | } |
276 | |
277 | static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, |
278 | enum cgroup_filetype type) |
279 | { |
280 | struct cgroup_pidlist *l; |
281 | /* don't need task_nsproxy() if we're looking at ourself */ |
282 | struct pid_namespace *ns = task_active_pid_ns(current); |
283 | |
284 | lockdep_assert_held(&cgrp->pidlist_mutex); |
285 | |
286 | list_for_each_entry(l, &cgrp->pidlists, links) |
287 | if (l->key.type == type && l->key.ns == ns) |
288 | return l; |
289 | return NULL; |
290 | } |
291 | |
292 | /* |
293 | * find the appropriate pidlist for our purpose (given procs vs tasks). |
294 | * The caller must hold cgrp->pidlist_mutex. Returns an existing matching |
295 | * pidlist if there is one, creates a new one otherwise, or returns NULL |
296 | * if we're out of memory. |
297 | */ |
298 | static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, |
299 | enum cgroup_filetype type) |
300 | { |
301 | struct cgroup_pidlist *l; |
302 | |
303 | lockdep_assert_held(&cgrp->pidlist_mutex); |
304 | |
305 | l = cgroup_pidlist_find(cgrp, type); |
306 | if (l) |
307 | return l; |
308 | |
309 | /* entry not found; create a new one */ |
310 | l = kzalloc(size: sizeof(struct cgroup_pidlist), GFP_KERNEL); |
311 | if (!l) |
312 | return l; |
313 | |
314 | INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); |
315 | l->key.type = type; |
316 | /* don't need task_nsproxy() if we're looking at ourself */ |
317 | l->key.ns = get_pid_ns(ns: task_active_pid_ns(current)); |
318 | l->owner = cgrp; |
319 | list_add(new: &l->links, head: &cgrp->pidlists); |
320 | return l; |
321 | } |
322 | |
323 | /* |
324 | * Load a cgroup's pidarray with either procs' tgids or tasks' pids |
325 | */ |
326 | static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, |
327 | struct cgroup_pidlist **lp) |
328 | { |
329 | pid_t *array; |
330 | int length; |
331 | int pid, n = 0; /* used for populating the array */ |
332 | struct css_task_iter it; |
333 | struct task_struct *tsk; |
334 | struct cgroup_pidlist *l; |
335 | |
336 | lockdep_assert_held(&cgrp->pidlist_mutex); |
337 | |
338 | /* |
339 | * If cgroup gets more users after we read count, we won't have |
340 | * enough space - tough. This race is indistinguishable to the |
341 | * caller from the case that the additional cgroup users didn't |
342 | * show up until sometime later on. |
343 | */ |
344 | length = cgroup_task_count(cgrp); |
345 | array = kvmalloc_array(n: length, size: sizeof(pid_t), GFP_KERNEL); |
346 | if (!array) |
347 | return -ENOMEM; |
348 | /* now, populate the array */ |
349 | css_task_iter_start(css: &cgrp->self, flags: 0, it: &it); |
350 | while ((tsk = css_task_iter_next(it: &it))) { |
351 | if (unlikely(n == length)) |
352 | break; |
353 | /* get tgid or pid for procs or tasks file respectively */ |
354 | if (type == CGROUP_FILE_PROCS) |
355 | pid = task_tgid_vnr(tsk); |
356 | else |
357 | pid = task_pid_vnr(tsk); |
358 | if (pid > 0) /* make sure to only use valid results */ |
359 | array[n++] = pid; |
360 | } |
361 | css_task_iter_end(it: &it); |
362 | length = n; |
363 | /* now sort & strip out duplicates (tgids or recycled thread PIDs) */ |
364 | sort(base: array, num: length, size: sizeof(pid_t), cmp_func: cmppid, NULL); |
365 | length = pidlist_uniq(list: array, length); |
366 | |
367 | l = cgroup_pidlist_find_create(cgrp, type); |
368 | if (!l) { |
369 | kvfree(addr: array); |
370 | return -ENOMEM; |
371 | } |
372 | |
373 | /* store array, freeing old if necessary */ |
374 | kvfree(addr: l->list); |
375 | l->list = array; |
376 | l->length = length; |
377 | *lp = l; |
378 | return 0; |
379 | } |
380 | |
381 | /* |
382 | * seq_file methods for the tasks/procs files. The seq_file position is the |
383 | * next pid to display; the seq_file iterator is a pointer to the pid |
384 | * in the cgroup->l->list array. |
385 | */ |
386 | |
387 | static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) |
388 | { |
389 | /* |
390 | * Initially we receive a position value that corresponds to |
391 | * one more than the last pid shown (or 0 on the first call or |
392 | * after a seek to the start). Use a binary-search to find the |
393 | * next pid to display, if any |
394 | */ |
395 | struct kernfs_open_file *of = s->private; |
396 | struct cgroup_file_ctx *ctx = of->priv; |
397 | struct cgroup *cgrp = seq_css(seq: s)->cgroup; |
398 | struct cgroup_pidlist *l; |
399 | enum cgroup_filetype type = seq_cft(seq: s)->private; |
400 | int index = 0, pid = *pos; |
401 | int *iter, ret; |
402 | |
403 | mutex_lock(&cgrp->pidlist_mutex); |
404 | |
405 | /* |
406 | * !NULL @ctx->procs1.pidlist indicates that this isn't the first |
407 | * start() after open. If the matching pidlist is around, we can use |
408 | * that. Look for it. Note that @ctx->procs1.pidlist can't be used |
409 | * directly. It could already have been destroyed. |
410 | */ |
411 | if (ctx->procs1.pidlist) |
412 | ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); |
413 | |
414 | /* |
415 | * Either this is the first start() after open or the matching |
416 | * pidlist has been destroyed in between. Create a new one. |
417 | */ |
418 | if (!ctx->procs1.pidlist) { |
419 | ret = pidlist_array_load(cgrp, type, lp: &ctx->procs1.pidlist); |
420 | if (ret) |
421 | return ERR_PTR(error: ret); |
422 | } |
423 | l = ctx->procs1.pidlist; |
424 | |
425 | if (pid) { |
426 | int end = l->length; |
427 | |
428 | while (index < end) { |
429 | int mid = (index + end) / 2; |
430 | if (l->list[mid] == pid) { |
431 | index = mid; |
432 | break; |
433 | } else if (l->list[mid] < pid) |
434 | index = mid + 1; |
435 | else |
436 | end = mid; |
437 | } |
438 | } |
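/*
 * Note: the binary search above behaves as a lower bound. If @pid is no
 * longer present (e.g. the task exited between reads), @index now refers
 * to the next larger pid, so the listing resumes there.
 */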
439 | /* If we're off the end of the array, we're done */ |
440 | if (index >= l->length) |
441 | return NULL; |
442 | /* Update the abstract position to be the actual pid that we found */ |
443 | iter = l->list + index; |
444 | *pos = *iter; |
445 | return iter; |
446 | } |
447 | |
448 | static void cgroup_pidlist_stop(struct seq_file *s, void *v) |
449 | { |
450 | struct kernfs_open_file *of = s->private; |
451 | struct cgroup_file_ctx *ctx = of->priv; |
452 | struct cgroup_pidlist *l = ctx->procs1.pidlist; |
453 | |
454 | if (l) |
455 | mod_delayed_work(wq: cgroup_pidlist_destroy_wq, dwork: &l->destroy_dwork, |
456 | CGROUP_PIDLIST_DESTROY_DELAY); |
457 | mutex_unlock(lock: &seq_css(seq: s)->cgroup->pidlist_mutex); |
458 | } |
459 | |
460 | static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) |
461 | { |
462 | struct kernfs_open_file *of = s->private; |
463 | struct cgroup_file_ctx *ctx = of->priv; |
464 | struct cgroup_pidlist *l = ctx->procs1.pidlist; |
465 | pid_t *p = v; |
466 | pid_t *end = l->list + l->length; |
467 | /* |
468 | * Advance to the next pid in the array. If this goes off the |
469 | * end, we're done |
470 | */ |
471 | p++; |
472 | if (p >= end) { |
473 | (*pos)++; |
474 | return NULL; |
475 | } else { |
476 | *pos = *p; |
477 | return p; |
478 | } |
479 | } |
480 | |
481 | static int cgroup_pidlist_show(struct seq_file *s, void *v) |
482 | { |
483 | seq_printf(m: s, fmt: "%d\n", *(int *)v); |
484 | |
485 | return 0; |
486 | } |
487 | |
488 | static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, |
489 | char *buf, size_t nbytes, loff_t off, |
490 | bool threadgroup) |
491 | { |
492 | struct cgroup *cgrp; |
493 | struct task_struct *task; |
494 | const struct cred *cred, *tcred; |
495 | ssize_t ret; |
496 | bool locked; |
497 | |
498 | cgrp = cgroup_kn_lock_live(kn: of->kn, drain_offline: false); |
499 | if (!cgrp) |
500 | return -ENODEV; |
501 | |
502 | task = cgroup_procs_write_start(buf, threadgroup, locked: &locked); |
503 | ret = PTR_ERR_OR_ZERO(ptr: task); |
504 | if (ret) |
505 | goto out_unlock; |
506 | |
507 | /* |
508 | * Even if we're attaching all tasks in the thread group, we only need |
509 | * to check permissions on one of them. Check permissions using the |
510 | * credentials from file open to protect against inherited fd attacks. |
511 | */ |
512 | cred = of->file->f_cred; |
513 | tcred = get_task_cred(task); |
514 | if (!uid_eq(left: cred->euid, GLOBAL_ROOT_UID) && |
515 | !uid_eq(cred->euid, tcred->uid) && |
516 | !uid_eq(cred->euid, tcred->suid)) |
517 | ret = -EACCES; |
518 | put_cred(tcred); |
519 | if (ret) |
520 | goto out_finish; |
521 | |
522 | ret = cgroup_attach_task(cgrp, task, threadgroup); |
523 | |
524 | out_finish: |
525 | cgroup_procs_write_finish(task, locked); |
526 | out_unlock: |
527 | cgroup_kn_unlock(of->kn); |
528 | |
529 | return ret ?: nbytes; |
530 | } |
531 | |
532 | static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, |
533 | char *buf, size_t nbytes, loff_t off) |
534 | { |
535 | return __cgroup1_procs_write(of, buf, nbytes, off, threadgroup: true); |
536 | } |
537 | |
538 | static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, |
539 | char *buf, size_t nbytes, loff_t off) |
540 | { |
541 | return __cgroup1_procs_write(of, buf, nbytes, off, threadgroup: false); |
542 | } |
543 | |
544 | static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, |
545 | char *buf, size_t nbytes, loff_t off) |
546 | { |
547 | struct cgroup *cgrp; |
548 | struct cgroup_file_ctx *ctx; |
549 | |
550 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
551 | |
552 | /* |
553 | * Release agent gets called with all capabilities, so |
554 | * require capabilities to set the release agent. |
555 | */ |
556 | ctx = of->priv; |
557 | if ((ctx->ns->user_ns != &init_user_ns) || |
558 | !file_ns_capable(file: of->file, ns: &init_user_ns, CAP_SYS_ADMIN)) |
559 | return -EPERM; |
560 | |
561 | cgrp = cgroup_kn_lock_live(kn: of->kn, drain_offline: false); |
562 | if (!cgrp) |
563 | return -ENODEV; |
564 | spin_lock(lock: &release_agent_path_lock); |
565 | strscpy(cgrp->root->release_agent_path, strstrip(buf), |
566 | sizeof(cgrp->root->release_agent_path)); |
567 | spin_unlock(lock: &release_agent_path_lock); |
568 | cgroup_kn_unlock(kn: of->kn); |
569 | return nbytes; |
570 | } |
571 | |
572 | static int cgroup_release_agent_show(struct seq_file *seq, void *v) |
573 | { |
574 | struct cgroup *cgrp = seq_css(seq)->cgroup; |
575 | |
576 | spin_lock(lock: &release_agent_path_lock); |
577 | seq_puts(m: seq, s: cgrp->root->release_agent_path); |
578 | spin_unlock(lock: &release_agent_path_lock); |
579 | seq_putc(m: seq, c: '\n'); |
580 | return 0; |
581 | } |
582 | |
583 | static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) |
584 | { |
585 | seq_puts(m: seq, s: "0\n"); |
586 | return 0; |
587 | } |
588 | |
589 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, |
590 | struct cftype *cft) |
591 | { |
592 | return notify_on_release(cgrp: css->cgroup); |
593 | } |
594 | |
595 | static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, |
596 | struct cftype *cft, u64 val) |
597 | { |
598 | if (val) |
599 | set_bit(nr: CGRP_NOTIFY_ON_RELEASE, addr: &css->cgroup->flags); |
600 | else |
601 | clear_bit(nr: CGRP_NOTIFY_ON_RELEASE, addr: &css->cgroup->flags); |
602 | return 0; |
603 | } |
604 | |
605 | static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, |
606 | struct cftype *cft) |
607 | { |
608 | return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); |
609 | } |
610 | |
611 | static int cgroup_clone_children_write(struct cgroup_subsys_state *css, |
612 | struct cftype *cft, u64 val) |
613 | { |
614 | if (val) |
615 | set_bit(nr: CGRP_CPUSET_CLONE_CHILDREN, addr: &css->cgroup->flags); |
616 | else |
617 | clear_bit(nr: CGRP_CPUSET_CLONE_CHILDREN, addr: &css->cgroup->flags); |
618 | return 0; |
619 | } |
620 | |
621 | /* cgroup core interface files for the legacy hierarchies */ |
622 | struct cftype cgroup1_base_files[] = { |
623 | { |
624 | .name = "cgroup.procs", |
625 | .seq_start = cgroup_pidlist_start, |
626 | .seq_next = cgroup_pidlist_next, |
627 | .seq_stop = cgroup_pidlist_stop, |
628 | .seq_show = cgroup_pidlist_show, |
629 | .private = CGROUP_FILE_PROCS, |
630 | .write = cgroup1_procs_write, |
631 | }, |
632 | { |
633 | .name = "cgroup.clone_children", |
634 | .read_u64 = cgroup_clone_children_read, |
635 | .write_u64 = cgroup_clone_children_write, |
636 | }, |
637 | { |
638 | .name = "cgroup.sane_behavior", |
639 | .flags = CFTYPE_ONLY_ON_ROOT, |
640 | .seq_show = cgroup_sane_behavior_show, |
641 | }, |
642 | { |
643 | .name = "tasks", |
644 | .seq_start = cgroup_pidlist_start, |
645 | .seq_next = cgroup_pidlist_next, |
646 | .seq_stop = cgroup_pidlist_stop, |
647 | .seq_show = cgroup_pidlist_show, |
648 | .private = CGROUP_FILE_TASKS, |
649 | .write = cgroup1_tasks_write, |
650 | }, |
651 | { |
652 | .name = "notify_on_release", |
653 | .read_u64 = cgroup_read_notify_on_release, |
654 | .write_u64 = cgroup_write_notify_on_release, |
655 | }, |
656 | { |
657 | .name = "release_agent", |
658 | .flags = CFTYPE_ONLY_ON_ROOT, |
659 | .seq_show = cgroup_release_agent_show, |
660 | .write = cgroup_release_agent_write, |
661 | .max_write_len = PATH_MAX - 1, |
662 | }, |
663 | { } /* terminate */ |
664 | }; |
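/*
 * Illustrative userspace view of these legacy files (the mount point and
 * cgroup names below are examples only):
 *
 *   echo 1234 > /sys/fs/cgroup/cpu/mygrp/cgroup.procs   # move whole thread group
 *   echo 1234 > /sys/fs/cgroup/cpu/mygrp/tasks          # move a single thread
 *   echo 1 > /sys/fs/cgroup/cpu/mygrp/notify_on_release # ask for release events
 */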
665 | |
666 | /* Display information about each subsystem and each hierarchy */ |
667 | int proc_cgroupstats_show(struct seq_file *m, void *v) |
668 | { |
669 | struct cgroup_subsys *ss; |
670 | int i; |
671 | |
672 | seq_puts(m, s: "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); |
673 | /* |
674 | * Grab the subsystems state racily. No need to add avenue to |
675 | * cgroup_mutex contention. |
676 | */ |
677 | |
678 | for_each_subsys(ss, i) |
679 | seq_printf(m, fmt: "%s\t%d\t%d\t%d\n", |
680 | ss->legacy_name, ss->root->hierarchy_id, |
681 | atomic_read(v: &ss->root->nr_cgrps), |
682 | cgroup_ssid_enabled(ssid: i)); |
683 | |
684 | return 0; |
685 | } |
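/*
 * The above backs /proc/cgroups. Example output (values are illustrative):
 *
 *   #subsys_name  hierarchy  num_cgroups  enabled
 *   cpuset        2          4            1
 *   cpu           3          12           1
 *   memory        0          1            1
 */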
686 | |
687 | /** |
688 | * cgroupstats_build - build and fill cgroupstats |
689 | * @stats: cgroupstats to fill information into |
690 | * @dentry: A dentry entry belonging to the cgroup for which stats have |
691 | * been requested. |
692 | * |
693 | * Build and fill cgroupstats so that taskstats can export it to user |
694 | * space. |
695 | * |
696 | * Return: %0 on success or a negative errno code on failure |
697 | */ |
698 | int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) |
699 | { |
700 | struct kernfs_node *kn = kernfs_node_from_dentry(dentry); |
701 | struct cgroup *cgrp; |
702 | struct css_task_iter it; |
703 | struct task_struct *tsk; |
704 | |
705 | /* the kernfs_node should belong to cgroupfs and be a directory */ |
706 | if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || |
707 | kernfs_type(kn) != KERNFS_DIR) |
708 | return -EINVAL; |
709 | |
710 | /* |
711 | * We aren't being called from kernfs and there's no guarantee on |
712 | * @kn->priv's validity. For this and css_tryget_online_from_dir(), |
713 | * @kn->priv is RCU safe. Let's do the RCU dancing. |
714 | */ |
715 | rcu_read_lock(); |
716 | cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); |
717 | if (!cgrp || !cgroup_tryget(cgrp)) { |
718 | rcu_read_unlock(); |
719 | return -ENOENT; |
720 | } |
721 | rcu_read_unlock(); |
722 | |
723 | css_task_iter_start(css: &cgrp->self, flags: 0, it: &it); |
724 | while ((tsk = css_task_iter_next(it: &it))) { |
725 | switch (READ_ONCE(tsk->__state)) { |
726 | case TASK_RUNNING: |
727 | stats->nr_running++; |
728 | break; |
729 | case TASK_INTERRUPTIBLE: |
730 | stats->nr_sleeping++; |
731 | break; |
732 | case TASK_UNINTERRUPTIBLE: |
733 | stats->nr_uninterruptible++; |
734 | break; |
735 | case TASK_STOPPED: |
736 | stats->nr_stopped++; |
737 | break; |
738 | default: |
739 | if (tsk->in_iowait) |
740 | stats->nr_io_wait++; |
741 | break; |
742 | } |
743 | } |
744 | css_task_iter_end(it: &it); |
745 | |
746 | cgroup_put(cgrp); |
747 | return 0; |
748 | } |
749 | |
750 | void cgroup1_check_for_release(struct cgroup *cgrp) |
751 | { |
752 | if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && |
753 | !css_has_online_children(css: &cgrp->self) && !cgroup_is_dead(cgrp)) |
754 | schedule_work(work: &cgrp->release_agent_work); |
755 | } |
756 | |
757 | /* |
758 | * Notify userspace when a cgroup is released, by running the |
759 | * configured release agent with the name of the cgroup (path |
760 | * relative to the root of cgroup file system) as the argument. |
761 | * |
762 | * Most likely, this user command will try to rmdir this cgroup. |
763 | * |
764 | * This races with the possibility that some other task will be |
765 | * attached to this cgroup before it is removed, or that some other |
766 | * user task will 'mkdir' a child cgroup of this cgroup. That's ok. |
767 | * The presumed 'rmdir' will fail quietly if this cgroup is no longer |
768 | * unused, and this cgroup will be reprieved from its death sentence, |
769 | * to continue to serve a useful existence. Next time it's released, |
770 | * we will get notified again, if it still has 'notify_on_release' set. |
771 | * |
772 | * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which |
773 | * means only wait until the task is successfully execve()'d. The |
774 | * separate release agent task is forked by call_usermodehelper(), |
775 | * then control in this thread returns here, without waiting for the |
776 | * release agent task. We don't bother to wait because the caller of |
777 | * this routine has no use for the exit status of the release agent |
778 | * task, so no sense holding our caller up for that. |
779 | */ |
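/*
 * A minimal example agent, assuming the hierarchy is mounted at
 * /sys/fs/cgroup/mine (illustrative only, not shipped with the kernel):
 *
 *   #!/bin/sh
 *   # $1 is the released cgroup's path relative to the hierarchy root
 *   rmdir "/sys/fs/cgroup/mine$1"
 */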
780 | void cgroup1_release_agent(struct work_struct *work) |
781 | { |
782 | struct cgroup *cgrp = |
783 | container_of(work, struct cgroup, release_agent_work); |
784 | char *pathbuf, *agentbuf; |
785 | char *argv[3], *envp[3]; |
786 | int ret; |
787 | |
788 | /* snoop agent path and exit early if empty */ |
789 | if (!cgrp->root->release_agent_path[0]) |
790 | return; |
791 | |
792 | /* prepare argument buffers */ |
793 | pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); |
794 | agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); |
795 | if (!pathbuf || !agentbuf) |
796 | goto out_free; |
797 | |
798 | spin_lock(lock: &release_agent_path_lock); |
799 | strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); |
800 | spin_unlock(lock: &release_agent_path_lock); |
801 | if (!agentbuf[0]) |
802 | goto out_free; |
803 | |
804 | ret = cgroup_path_ns(cgrp, buf: pathbuf, PATH_MAX, ns: &init_cgroup_ns); |
805 | if (ret < 0) |
806 | goto out_free; |
807 | |
808 | argv[0] = agentbuf; |
809 | argv[1] = pathbuf; |
810 | argv[2] = NULL; |
811 | |
812 | /* minimal command environment */ |
813 | envp[0] = "HOME=/"; |
814 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
815 | envp[2] = NULL; |
816 | |
817 | call_usermodehelper(path: argv[0], argv, envp, UMH_WAIT_EXEC); |
818 | out_free: |
819 | kfree(objp: agentbuf); |
820 | kfree(objp: pathbuf); |
821 | } |
822 | |
823 | /* |
824 | * cgroup1_rename - Only allow simple rename of directories in place. |
825 | */ |
826 | static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, |
827 | const char *new_name_str) |
828 | { |
829 | struct cgroup *cgrp = kn->priv; |
830 | int ret; |
831 | |
832 | /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ |
833 | if (strchr(new_name_str, '\n')) |
834 | return -EINVAL; |
835 | |
836 | if (kernfs_type(kn) != KERNFS_DIR) |
837 | return -ENOTDIR; |
838 | if (kn->parent != new_parent) |
839 | return -EIO; |
840 | |
841 | /* |
842 | * We're gonna grab cgroup_mutex which nests outside kernfs |
843 | * active_ref. kernfs_rename() doesn't require active_ref |
844 | * protection. Break them before grabbing cgroup_mutex. |
845 | */ |
846 | kernfs_break_active_protection(kn: new_parent); |
847 | kernfs_break_active_protection(kn); |
848 | |
849 | cgroup_lock(); |
850 | |
851 | ret = kernfs_rename(kn, new_parent, new_name: new_name_str); |
852 | if (!ret) |
853 | TRACE_CGROUP_PATH(rename, cgrp); |
854 | |
855 | cgroup_unlock(); |
856 | |
857 | kernfs_unbreak_active_protection(kn); |
858 | kernfs_unbreak_active_protection(kn: new_parent); |
859 | return ret; |
860 | } |
861 | |
862 | static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) |
863 | { |
864 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
865 | struct cgroup_subsys *ss; |
866 | int ssid; |
867 | |
868 | for_each_subsys(ss, ssid) |
869 | if (root->subsys_mask & (1 << ssid)) |
870 | seq_show_option(m: seq, name: ss->legacy_name, NULL); |
871 | if (root->flags & CGRP_ROOT_NOPREFIX) |
872 | seq_puts(m: seq, s: ",noprefix"); |
873 | if (root->flags & CGRP_ROOT_XATTR) |
874 | seq_puts(m: seq, s: ",xattr"); |
875 | if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) |
876 | seq_puts(m: seq, s: ",cpuset_v2_mode"); |
877 | if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) |
878 | seq_puts(m: seq, s: ",favordynmods"); |
879 | |
880 | spin_lock(lock: &release_agent_path_lock); |
881 | if (strlen(root->release_agent_path)) |
882 | seq_show_option(m: seq, name: "release_agent", |
883 | value: root->release_agent_path); |
884 | spin_unlock(lock: &release_agent_path_lock); |
885 | |
886 | if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) |
887 | seq_puts(m: seq, s: ",clone_children"); |
888 | if (strlen(root->name)) |
889 | seq_show_option(m: seq, name: "name", value: root->name); |
890 | return 0; |
891 | } |
892 | |
893 | enum cgroup1_param { |
894 | Opt_all, |
895 | Opt_clone_children, |
896 | Opt_cpuset_v2_mode, |
897 | Opt_name, |
898 | Opt_none, |
899 | Opt_noprefix, |
900 | Opt_release_agent, |
901 | Opt_xattr, |
902 | Opt_favordynmods, |
903 | Opt_nofavordynmods, |
904 | }; |
905 | |
906 | const struct fs_parameter_spec cgroup1_fs_parameters[] = { |
907 | fsparam_flag ("all", Opt_all), |
908 | fsparam_flag ("clone_children", Opt_clone_children), |
909 | fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), |
910 | fsparam_string("name", Opt_name), |
911 | fsparam_flag ("none", Opt_none), |
912 | fsparam_flag ("noprefix", Opt_noprefix), |
913 | fsparam_string("release_agent", Opt_release_agent), |
914 | fsparam_flag ("xattr", Opt_xattr), |
915 | fsparam_flag ("favordynmods", Opt_favordynmods), |
916 | fsparam_flag ("nofavordynmods", Opt_nofavordynmods), |
917 | {} |
918 | }; |
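/*
 * Example mounts these parameters correspond to (mount points, controller
 * choices and the agent path are illustrative):
 *
 *   mount -t cgroup -o cpu,cpuacct cgroup /sys/fs/cgroup/cpu,cpuacct
 *   mount -t cgroup -o none,name=mygrp cgroup /sys/fs/cgroup/mygrp
 *   mount -t cgroup -o all,release_agent=/sbin/cgroup_agent cgroup /mnt/cg
 */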
919 | |
920 | int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) |
921 | { |
922 | struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
923 | struct cgroup_subsys *ss; |
924 | struct fs_parse_result result; |
925 | int opt, i; |
926 | |
927 | opt = fs_parse(fc, desc: cgroup1_fs_parameters, param, result: &result); |
928 | if (opt == -ENOPARAM) { |
929 | int ret; |
930 | |
931 | ret = vfs_parse_fs_param_source(fc, param); |
932 | if (ret != -ENOPARAM) |
933 | return ret; |
934 | for_each_subsys(ss, i) { |
935 | if (strcmp(param->key, ss->legacy_name)) |
936 | continue; |
937 | if (!cgroup_ssid_enabled(ssid: i) || cgroup1_ssid_disabled(ssid: i)) |
938 | return invalfc(fc, "Disabled controller '%s'", |
939 | param->key); |
940 | ctx->subsys_mask |= (1 << i); |
941 | return 0; |
942 | } |
943 | return invalfc(fc, "Unknown subsys name '%s'", param->key); |
944 | } |
945 | if (opt < 0) |
946 | return opt; |
947 | |
948 | switch (opt) { |
949 | case Opt_none: |
950 | /* Explicitly have no subsystems */ |
951 | ctx->none = true; |
952 | break; |
953 | case Opt_all: |
954 | ctx->all_ss = true; |
955 | break; |
956 | case Opt_noprefix: |
957 | ctx->flags |= CGRP_ROOT_NOPREFIX; |
958 | break; |
959 | case Opt_clone_children: |
960 | ctx->cpuset_clone_children = true; |
961 | break; |
962 | case Opt_cpuset_v2_mode: |
963 | ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; |
964 | break; |
965 | case Opt_xattr: |
966 | ctx->flags |= CGRP_ROOT_XATTR; |
967 | break; |
968 | case Opt_favordynmods: |
969 | ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; |
970 | break; |
971 | case Opt_nofavordynmods: |
972 | ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; |
973 | break; |
974 | case Opt_release_agent: |
975 | /* Specifying two release agents is forbidden */ |
976 | if (ctx->release_agent) |
977 | return invalfc(fc, "release_agent respecified"); |
978 | /* |
979 | * Release agent gets called with all capabilities, so |
980 | * require capabilities to set the release agent. |
981 | */ |
982 | if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) |
983 | return invalfc(fc, "Setting release_agent not allowed"); |
984 | ctx->release_agent = param->string; |
985 | param->string = NULL; |
986 | break; |
987 | case Opt_name: |
988 | /* blocked by boot param? */ |
989 | if (cgroup_no_v1_named) |
990 | return -ENOENT; |
991 | /* Can't specify an empty name */ |
992 | if (!param->size) |
993 | return invalfc(fc, "Empty name"); |
994 | if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) |
995 | return invalfc(fc, "Name too long"); |
996 | /* Must match [\w.-]+ */ |
997 | for (i = 0; i < param->size; i++) { |
998 | char c = param->string[i]; |
999 | if (isalnum(c)) |
1000 | continue; |
1001 | if ((c == '.') || (c == '-') || (c == '_')) |
1002 | continue; |
1003 | return invalfc(fc, "Invalid name"); |
1004 | } |
1005 | /* Specifying two names is forbidden */ |
1006 | if (ctx->name) |
1007 | return invalfc(fc, "name respecified"); |
1008 | ctx->name = param->string; |
1009 | param->string = NULL; |
1010 | break; |
1011 | } |
1012 | return 0; |
1013 | } |
1014 | |
1015 | static int check_cgroupfs_options(struct fs_context *fc) |
1016 | { |
1017 | struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
1018 | u16 mask = U16_MAX; |
1019 | u16 enabled = 0; |
1020 | struct cgroup_subsys *ss; |
1021 | int i; |
1022 | |
1023 | #ifdef CONFIG_CPUSETS |
1024 | mask = ~((u16)1 << cpuset_cgrp_id); |
1025 | #endif |
1026 | for_each_subsys(ss, i) |
1027 | if (cgroup_ssid_enabled(ssid: i) && !cgroup1_ssid_disabled(ssid: i)) |
1028 | enabled |= 1 << i; |
1029 | |
1030 | ctx->subsys_mask &= enabled; |
1031 | |
1032 | /* |
1033 | * In the absence of 'none', 'name=' and subsystem name options, |
1034 | * let's default to 'all'. |
1035 | */ |
1036 | if (!ctx->subsys_mask && !ctx->none && !ctx->name) |
1037 | ctx->all_ss = true; |
1038 | |
1039 | if (ctx->all_ss) { |
1040 | /* Mutually exclusive option 'all' + subsystem name */ |
1041 | if (ctx->subsys_mask) |
1042 | return invalfc(fc, "subsys name conflicts with all"); |
1043 | /* 'all' => select all the subsystems */ |
1044 | ctx->subsys_mask = enabled; |
1045 | } |
1046 | |
1047 | /* |
1048 | * We either have to specify by name or by subsystems. (So all |
1049 | * empty hierarchies must have a name). |
1050 | */ |
1051 | if (!ctx->subsys_mask && !ctx->name) |
1052 | return invalfc(fc, "Need name or subsystem set"); |
1053 | |
1054 | /* |
1055 | * Option noprefix was introduced just for backward compatibility |
1056 | * with the old cpuset, so we allow noprefix only if mounting just |
1057 | * the cpuset subsystem. |
1058 | */ |
1059 | if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) |
1060 | return invalfc(fc, "noprefix used incorrectly"); |
1061 | |
1062 | /* Can't specify "none" and some subsystems */ |
1063 | if (ctx->subsys_mask && ctx->none) |
1064 | return invalfc(fc, "none used incorrectly"); |
1065 | |
1066 | return 0; |
1067 | } |
1068 | |
1069 | int cgroup1_reconfigure(struct fs_context *fc) |
1070 | { |
1071 | struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
1072 | struct kernfs_root *kf_root = kernfs_root_from_sb(sb: fc->root->d_sb); |
1073 | struct cgroup_root *root = cgroup_root_from_kf(kf_root); |
1074 | int ret = 0; |
1075 | u16 added_mask, removed_mask; |
1076 | |
1077 | cgroup_lock_and_drain_offline(cgrp: &cgrp_dfl_root.cgrp); |
1078 | |
1079 | /* See what subsystems are wanted */ |
1080 | ret = check_cgroupfs_options(fc); |
1081 | if (ret) |
1082 | goto out_unlock; |
1083 | |
1084 | if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) |
1085 | pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", |
1086 | task_tgid_nr(current), current->comm); |
1087 | |
1088 | added_mask = ctx->subsys_mask & ~root->subsys_mask; |
1089 | removed_mask = root->subsys_mask & ~ctx->subsys_mask; |
1090 | |
1091 | /* Don't allow flags or name to change at remount */ |
1092 | if ((ctx->flags ^ root->flags) || |
1093 | (ctx->name && strcmp(ctx->name, root->name))) { |
1094 | errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", |
1095 | ctx->flags, ctx->name ?: "", root->flags, root->name); |
1096 | ret = -EINVAL; |
1097 | goto out_unlock; |
1098 | } |
1099 | |
1100 | /* remounting is not allowed for populated hierarchies */ |
1101 | if (!list_empty(head: &root->cgrp.self.children)) { |
1102 | ret = -EBUSY; |
1103 | goto out_unlock; |
1104 | } |
1105 | |
1106 | ret = rebind_subsystems(dst_root: root, ss_mask: added_mask); |
1107 | if (ret) |
1108 | goto out_unlock; |
1109 | |
1110 | WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); |
1111 | |
1112 | if (ctx->release_agent) { |
1113 | spin_lock(lock: &release_agent_path_lock); |
1114 | strcpy(p: root->release_agent_path, q: ctx->release_agent); |
1115 | spin_unlock(lock: &release_agent_path_lock); |
1116 | } |
1117 | |
1118 | trace_cgroup_remount(root); |
1119 | |
1120 | out_unlock: |
1121 | cgroup_unlock(); |
1122 | return ret; |
1123 | } |
1124 | |
1125 | struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { |
1126 | .rename = cgroup1_rename, |
1127 | .show_options = cgroup1_show_options, |
1128 | .mkdir = cgroup_mkdir, |
1129 | .rmdir = cgroup_rmdir, |
1130 | .show_path = cgroup_show_path, |
1131 | }; |
1132 | |
1133 | /* |
1134 | * The guts of cgroup1 mount - find or create cgroup_root to use. |
1135 | * Called with cgroup_mutex held; returns 0 on success, -E... on |
1136 | * error and positive - in case when the candidate is busy dying. |
1137 | * On success it stashes a reference to cgroup_root into given |
1138 | * cgroup_fs_context; that reference is *NOT* counting towards the |
1139 | * cgroup_root refcount. |
1140 | */ |
1141 | static int cgroup1_root_to_use(struct fs_context *fc) |
1142 | { |
1143 | struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
1144 | struct cgroup_root *root; |
1145 | struct cgroup_subsys *ss; |
1146 | int i, ret; |
1147 | |
1148 | /* First find the desired set of subsystems */ |
1149 | ret = check_cgroupfs_options(fc); |
1150 | if (ret) |
1151 | return ret; |
1152 | |
1153 | /* |
1154 | * Destruction of cgroup root is asynchronous, so subsystems may |
1155 | * still be dying after the previous unmount. Let's drain the |
1156 | * dying subsystems. We just need to ensure that the ones |
1157 | * unmounted previously finish dying and don't care about new ones |
1158 | * starting. Testing ref liveliness is good enough. |
1159 | */ |
1160 | for_each_subsys(ss, i) { |
1161 | if (!(ctx->subsys_mask & (1 << i)) || |
1162 | ss->root == &cgrp_dfl_root) |
1163 | continue; |
1164 | |
1165 | if (!percpu_ref_tryget_live(ref: &ss->root->cgrp.self.refcnt)) |
1166 | return 1; /* restart */ |
1167 | cgroup_put(cgrp: &ss->root->cgrp); |
1168 | } |
1169 | |
1170 | for_each_root(root) { |
1171 | bool name_match = false; |
1172 | |
1173 | if (root == &cgrp_dfl_root) |
1174 | continue; |
1175 | |
1176 | /* |
1177 | * If we asked for a name then it must match. Also, if |
1178 | * name matches but subsys_mask doesn't, we should fail. |
1179 | * Remember whether name matched. |
1180 | */ |
1181 | if (ctx->name) { |
1182 | if (strcmp(ctx->name, root->name)) |
1183 | continue; |
1184 | name_match = true; |
1185 | } |
1186 | |
1187 | /* |
1188 | * If we asked for subsystems (or explicitly for no |
1189 | * subsystems) then they must match. |
1190 | */ |
1191 | if ((ctx->subsys_mask || ctx->none) && |
1192 | (ctx->subsys_mask != root->subsys_mask)) { |
1193 | if (!name_match) |
1194 | continue; |
1195 | return -EBUSY; |
1196 | } |
1197 | |
1198 | if (root->flags ^ ctx->flags) |
1199 | pr_warn("new mount options do not match the existing superblock, will be ignored\n"); |
1200 | |
1201 | ctx->root = root; |
1202 | return 0; |
1203 | } |
1204 | |
1205 | /* |
1206 | * No such thing, create a new one. name= matching without subsys |
1207 | * specification is allowed for already existing hierarchies but we |
1208 | * can't create new one without subsys specification. |
1209 | */ |
1210 | if (!ctx->subsys_mask && !ctx->none) |
1211 | return invalfc(fc, "No subsys list or none specified"); |
1212 | |
1213 | /* Hierarchies may only be created in the initial cgroup namespace. */ |
1214 | if (ctx->ns != &init_cgroup_ns) |
1215 | return -EPERM; |
1216 | |
1217 | root = kzalloc(size: sizeof(*root), GFP_KERNEL); |
1218 | if (!root) |
1219 | return -ENOMEM; |
1220 | |
1221 | ctx->root = root; |
1222 | init_cgroup_root(ctx); |
1223 | |
1224 | ret = cgroup_setup_root(root, ss_mask: ctx->subsys_mask); |
1225 | if (!ret) |
1226 | cgroup_favor_dynmods(root, favor: ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); |
1227 | else |
1228 | cgroup_free_root(root); |
1229 | |
1230 | return ret; |
1231 | } |
1232 | |
1233 | int cgroup1_get_tree(struct fs_context *fc) |
1234 | { |
1235 | struct cgroup_fs_context *ctx = cgroup_fc2context(fc); |
1236 | int ret; |
1237 | |
1238 | /* Check if the caller has permission to mount. */ |
1239 | if (!ns_capable(ns: ctx->ns->user_ns, CAP_SYS_ADMIN)) |
1240 | return -EPERM; |
1241 | |
1242 | cgroup_lock_and_drain_offline(cgrp: &cgrp_dfl_root.cgrp); |
1243 | |
1244 | ret = cgroup1_root_to_use(fc); |
1245 | if (!ret && !percpu_ref_tryget_live(ref: &ctx->root->cgrp.self.refcnt)) |
1246 | ret = 1; /* restart */ |
1247 | |
1248 | cgroup_unlock(); |
1249 | |
1250 | if (!ret) |
1251 | ret = cgroup_do_get_tree(fc); |
1252 | |
1253 | if (!ret && percpu_ref_is_dying(ref: &ctx->root->cgrp.self.refcnt)) { |
1254 | fc_drop_locked(fc); |
1255 | ret = 1; |
1256 | } |
1257 | |
1258 | if (unlikely(ret > 0)) { |
1259 | msleep(msecs: 10); |
1260 | return restart_syscall(); |
1261 | } |
1262 | return ret; |
1263 | } |
1264 | |
1265 | /** |
1266 | * task_get_cgroup1 - Acquires the associated cgroup of a task within a |
1267 | * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its |
1268 | * hierarchy ID. |
1269 | * @tsk: The target task |
1270 | * @hierarchy_id: The ID of a cgroup1 hierarchy |
1271 | * |
1272 | * On success, the cgroup is returned. On failure, ERR_PTR is returned. |
1273 | * We limit it to cgroup1 only. |
1274 | */ |
1275 | struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) |
1276 | { |
1277 | struct cgroup *cgrp = ERR_PTR(error: -ENOENT); |
1278 | struct cgroup_root *root; |
1279 | unsigned long flags; |
1280 | |
1281 | rcu_read_lock(); |
1282 | for_each_root(root) { |
1283 | /* cgroup1 only */ |
1284 | if (root == &cgrp_dfl_root) |
1285 | continue; |
1286 | if (root->hierarchy_id != hierarchy_id) |
1287 | continue; |
1288 | spin_lock_irqsave(&css_set_lock, flags); |
1289 | cgrp = task_cgroup_from_root(task: tsk, root); |
1290 | if (!cgrp || !cgroup_tryget(cgrp)) |
1291 | cgrp = ERR_PTR(error: -ENOENT); |
1292 | spin_unlock_irqrestore(lock: &css_set_lock, flags); |
1293 | break; |
1294 | } |
1295 | rcu_read_unlock(); |
1296 | return cgrp; |
1297 | } |
1298 | |
1299 | static int __init cgroup1_wq_init(void) |
1300 | { |
1301 | /* |
1302 | * Used to destroy pidlists; kept separate so it can serve as a flush |
1303 | * domain. Cap @max_active at 1 too. |
1304 | */ |
1305 | cgroup_pidlist_destroy_wq = alloc_workqueue(fmt: "cgroup_pidlist_destroy", |
1306 | flags: 0, max_active: 1); |
1307 | BUG_ON(!cgroup_pidlist_destroy_wq); |
1308 | return 0; |
1309 | } |
1310 | core_initcall(cgroup1_wq_init); |
1311 | |
1312 | static int __init cgroup_no_v1(char *str) |
1313 | { |
1314 | struct cgroup_subsys *ss; |
1315 | char *token; |
1316 | int i; |
1317 | |
1318 | while ((token = strsep(&str, ",")) != NULL) { |
1319 | if (!*token) |
1320 | continue; |
1321 | |
1322 | if (!strcmp(token, "all")) { |
1323 | cgroup_no_v1_mask = U16_MAX; |
1324 | continue; |
1325 | } |
1326 | |
1327 | if (!strcmp(token, "named")) { |
1328 | cgroup_no_v1_named = true; |
1329 | continue; |
1330 | } |
1331 | |
1332 | for_each_subsys(ss, i) { |
1333 | if (strcmp(token, ss->name) && |
1334 | strcmp(token, ss->legacy_name)) |
1335 | continue; |
1336 | |
1337 | cgroup_no_v1_mask |= 1 << i; |
1338 | } |
1339 | } |
1340 | return 1; |
1341 | } |
1342 | __setup("cgroup_no_v1=", cgroup_no_v1); |
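/*
 * Example kernel command line usage (controller names are illustrative):
 *
 *   cgroup_no_v1=memory,cpu   disable the v1 interface of these controllers
 *   cgroup_no_v1=all          disable all v1 controllers
 *   cgroup_no_v1=named        disable mounting of named v1 hierarchies
 */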
1343 |