1 | /* |
2 | * kernel/cpuset.c |
3 | * |
4 | * Processor and Memory placement constraints for sets of tasks. |
5 | * |
6 | * Copyright (C) 2003 BULL SA. |
7 | * Copyright (C) 2004-2007 Silicon Graphics, Inc. |
8 | * Copyright (C) 2006 Google, Inc |
9 | * |
10 | * Portions derived from Patrick Mochel's sysfs code. |
11 | * sysfs is Copyright (c) 2001-3 Patrick Mochel |
12 | * |
13 | * 2003-10-10 Written by Simon Derr. |
14 | * 2003-10-22 Updates by Stephen Hemminger. |
15 | * 2004 May-July Rework by Paul Jackson. |
16 | * 2006 Rework by Paul Menage to use generic cgroups |
17 | * 2008 Rework of the scheduler domains and CPU hotplug handling |
18 | * by Max Krasnyansky |
19 | * |
20 | * This file is subject to the terms and conditions of the GNU General Public |
21 | * License. See the file COPYING in the main directory of the Linux |
22 | * distribution for more details. |
23 | */ |
24 | |
25 | #include <linux/cpu.h> |
26 | #include <linux/cpumask.h> |
27 | #include <linux/cpuset.h> |
28 | #include <linux/delay.h> |
29 | #include <linux/init.h> |
30 | #include <linux/interrupt.h> |
31 | #include <linux/kernel.h> |
32 | #include <linux/mempolicy.h> |
33 | #include <linux/mm.h> |
34 | #include <linux/memory.h> |
35 | #include <linux/export.h> |
36 | #include <linux/rcupdate.h> |
37 | #include <linux/sched.h> |
38 | #include <linux/sched/deadline.h> |
39 | #include <linux/sched/mm.h> |
40 | #include <linux/sched/task.h> |
41 | #include <linux/security.h> |
42 | #include <linux/spinlock.h> |
43 | #include <linux/oom.h> |
44 | #include <linux/sched/isolation.h> |
45 | #include <linux/cgroup.h> |
46 | #include <linux/wait.h> |
47 | #include <linux/workqueue.h> |
48 | |
49 | DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); |
50 | DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); |
51 | |
52 | /* |
53 | * There could be abnormal cpuset configurations for cpu or memory |
54 | * node binding, add this key to provide a quick low-cost judgment |
55 | * of the situation. |
56 | */ |
57 | DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); |
58 | |
59 | /* See "Frequency meter" comments, below. */ |
60 | |
61 | struct fmeter { |
62 | int cnt; /* unprocessed events count */ |
63 | int val; /* most recent output value */ |
64 | time64_t time; /* clock (secs) when val computed */ |
65 | spinlock_t lock; /* guards read or write of above */ |
66 | }; |
67 | |
68 | /* |
69 | * Invalid partition error code |
70 | */ |
71 | enum prs_errcode { |
72 | PERR_NONE = 0, |
73 | PERR_INVCPUS, |
74 | PERR_INVPARENT, |
75 | PERR_NOTPART, |
76 | PERR_NOTEXCL, |
77 | PERR_NOCPUS, |
78 | PERR_HOTPLUG, |
79 | PERR_CPUSEMPTY, |
80 | PERR_HKEEPING, |
81 | }; |
82 | |
83 | static const char * const perr_strings[] = { |
84 | [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive" , |
85 | [PERR_INVPARENT] = "Parent is an invalid partition root" , |
86 | [PERR_NOTPART] = "Parent is not a partition root" , |
87 | [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive" , |
88 | [PERR_NOCPUS] = "Parent unable to distribute cpu downstream" , |
89 | [PERR_HOTPLUG] = "No cpu available due to hotplug" , |
90 | [PERR_CPUSEMPTY] = "cpuset.cpus is empty" , |
91 | [PERR_HKEEPING] = "partition config conflicts with housekeeping setup" , |
92 | }; |
93 | |
94 | struct cpuset { |
95 | struct cgroup_subsys_state css; |
96 | |
97 | unsigned long flags; /* "unsigned long" so bitops work */ |
98 | |
99 | /* |
100 | * On default hierarchy: |
101 | * |
102 | * The user-configured masks can only be changed by writing to |
103 | * cpuset.cpus and cpuset.mems, and won't be limited by the |
104 | * parent masks. |
105 | * |
 * The effective masks are the real masks that apply to the tasks
107 | * in the cpuset. They may be changed if the configured masks are |
108 | * changed or hotplug happens. |
109 | * |
110 | * effective_mask == configured_mask & parent's effective_mask, |
111 | * and if it ends up empty, it will inherit the parent's mask. |
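 *
 * An illustrative example (not from the original comment): with a
 * configured mask of 0-3 and a parent effective mask of 2-5, the
 * effective mask becomes 2-3; if the parent's effective mask were
 * 4-5 instead, the intersection would be empty and the cpuset would
 * inherit 4-5 from its parent.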
112 | * |
113 | * |
114 | * On legacy hierarchy: |
115 | * |
 * The user-configured masks are always the same as the effective masks.
117 | */ |
118 | |
	/* user-configured CPUs and Memory Nodes allowed to tasks */
120 | cpumask_var_t cpus_allowed; |
121 | nodemask_t mems_allowed; |
122 | |
	/* effective CPUs and Memory Nodes allowed to tasks */
124 | cpumask_var_t effective_cpus; |
125 | nodemask_t effective_mems; |
126 | |
127 | /* |
128 | * Exclusive CPUs dedicated to current cgroup (default hierarchy only) |
129 | * |
	 * These exclusive CPUs must be a subset of cpus_allowed. A parent
131 | * cgroup can only grant exclusive CPUs to one of its children. |
132 | * |
133 | * When the cgroup becomes a valid partition root, effective_xcpus |
134 | * defaults to cpus_allowed if not set. The effective_cpus of a valid |
135 | * partition root comes solely from its effective_xcpus and some of the |
136 | * effective_xcpus may be distributed to sub-partitions below & hence |
137 | * excluded from its effective_cpus. |
138 | */ |
139 | cpumask_var_t effective_xcpus; |
140 | |
141 | /* |
142 | * Exclusive CPUs as requested by the user (default hierarchy only) |
143 | */ |
144 | cpumask_var_t exclusive_cpus; |
145 | |
146 | /* |
	 * These are the old Memory Nodes that tasks took on.
148 | * |
149 | * - top_cpuset.old_mems_allowed is initialized to mems_allowed. |
150 | * - A new cpuset's old_mems_allowed is initialized when some |
151 | * task is moved into it. |
152 | * - old_mems_allowed is used in cpuset_migrate_mm() when we change |
153 | * cpuset.mems_allowed and have tasks' nodemask updated, and |
154 | * then old_mems_allowed is updated to mems_allowed. |
155 | */ |
156 | nodemask_t old_mems_allowed; |
157 | |
158 | struct fmeter fmeter; /* memory_pressure filter */ |
159 | |
160 | /* |
161 | * Tasks are being attached to this cpuset. Used to prevent |
162 | * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). |
163 | */ |
164 | int attach_in_progress; |
165 | |
166 | /* partition number for rebuild_sched_domains() */ |
167 | int pn; |
168 | |
169 | /* for custom sched domain */ |
170 | int relax_domain_level; |
171 | |
172 | /* number of valid sub-partitions */ |
173 | int nr_subparts; |
174 | |
175 | /* partition root state */ |
176 | int partition_root_state; |
177 | |
178 | /* |
179 | * Default hierarchy only: |
180 | * use_parent_ecpus - set if using parent's effective_cpus |
181 | * child_ecpus_count - # of children with use_parent_ecpus set |
182 | */ |
183 | int use_parent_ecpus; |
184 | int child_ecpus_count; |
185 | |
186 | /* |
187 | * number of SCHED_DEADLINE tasks attached to this cpuset, so that we |
188 | * know when to rebuild associated root domain bandwidth information. |
189 | */ |
190 | int nr_deadline_tasks; |
191 | int nr_migrate_dl_tasks; |
192 | u64 sum_migrate_dl_bw; |
193 | |
194 | /* Invalid partition error code, not lock protected */ |
195 | enum prs_errcode prs_err; |
196 | |
197 | /* Handle for cpuset.cpus.partition */ |
198 | struct cgroup_file partition_file; |
199 | |
	/* Remote partition sibling list anchored at remote_children */
201 | struct list_head remote_sibling; |
202 | }; |
203 | |
204 | /* |
205 | * Exclusive CPUs distributed out to sub-partitions of top_cpuset |
206 | */ |
207 | static cpumask_var_t subpartitions_cpus; |
208 | |
209 | /* |
210 | * Exclusive CPUs in isolated partitions |
211 | */ |
212 | static cpumask_var_t isolated_cpus; |
213 | |
214 | /* List of remote partition root children */ |
215 | static struct list_head remote_children; |
216 | |
217 | /* |
218 | * Partition root states: |
219 | * |
220 | * 0 - member (not a partition root) |
221 | * 1 - partition root |
222 | * 2 - partition root without load balancing (isolated) |
223 | * -1 - invalid partition root |
224 | * -2 - invalid isolated partition root |
225 | */ |
226 | #define PRS_MEMBER 0 |
227 | #define PRS_ROOT 1 |
228 | #define PRS_ISOLATED 2 |
229 | #define PRS_INVALID_ROOT -1 |
230 | #define PRS_INVALID_ISOLATED -2 |
231 | |
232 | static inline bool is_prs_invalid(int prs_state) |
233 | { |
234 | return prs_state < 0; |
235 | } |
236 | |
237 | /* |
238 | * Temporary cpumasks for working with partitions that are passed among |
239 | * functions to avoid memory allocation in inner functions. |
240 | */ |
241 | struct tmpmasks { |
242 | cpumask_var_t addmask, delmask; /* For partition root */ |
243 | cpumask_var_t new_cpus; /* For update_cpumasks_hier() */ |
244 | }; |
245 | |
246 | static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) |
247 | { |
248 | return css ? container_of(css, struct cpuset, css) : NULL; |
249 | } |
250 | |
251 | /* Retrieve the cpuset for a task */ |
252 | static inline struct cpuset *task_cs(struct task_struct *task) |
253 | { |
	return css_cs(task_css(task, cpuset_cgrp_id));
255 | } |
256 | |
257 | static inline struct cpuset *parent_cs(struct cpuset *cs) |
258 | { |
	return css_cs(cs->css.parent);
260 | } |
261 | |
262 | void inc_dl_tasks_cs(struct task_struct *p) |
263 | { |
	struct cpuset *cs = task_cs(p);
265 | |
266 | cs->nr_deadline_tasks++; |
267 | } |
268 | |
269 | void dec_dl_tasks_cs(struct task_struct *p) |
270 | { |
	struct cpuset *cs = task_cs(p);
272 | |
273 | cs->nr_deadline_tasks--; |
274 | } |
275 | |
276 | /* bits in struct cpuset flags field */ |
277 | typedef enum { |
278 | CS_ONLINE, |
279 | CS_CPU_EXCLUSIVE, |
280 | CS_MEM_EXCLUSIVE, |
281 | CS_MEM_HARDWALL, |
282 | CS_MEMORY_MIGRATE, |
283 | CS_SCHED_LOAD_BALANCE, |
284 | CS_SPREAD_PAGE, |
285 | CS_SPREAD_SLAB, |
286 | } cpuset_flagbits_t; |
287 | |
288 | /* convenient tests for these bits */ |
289 | static inline bool is_cpuset_online(struct cpuset *cs) |
290 | { |
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
292 | } |
293 | |
294 | static inline int is_cpu_exclusive(const struct cpuset *cs) |
295 | { |
296 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); |
297 | } |
298 | |
299 | static inline int is_mem_exclusive(const struct cpuset *cs) |
300 | { |
301 | return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); |
302 | } |
303 | |
304 | static inline int is_mem_hardwall(const struct cpuset *cs) |
305 | { |
306 | return test_bit(CS_MEM_HARDWALL, &cs->flags); |
307 | } |
308 | |
309 | static inline int is_sched_load_balance(const struct cpuset *cs) |
310 | { |
311 | return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
312 | } |
313 | |
314 | static inline int is_memory_migrate(const struct cpuset *cs) |
315 | { |
316 | return test_bit(CS_MEMORY_MIGRATE, &cs->flags); |
317 | } |
318 | |
319 | static inline int is_spread_page(const struct cpuset *cs) |
320 | { |
321 | return test_bit(CS_SPREAD_PAGE, &cs->flags); |
322 | } |
323 | |
324 | static inline int is_spread_slab(const struct cpuset *cs) |
325 | { |
326 | return test_bit(CS_SPREAD_SLAB, &cs->flags); |
327 | } |
328 | |
329 | static inline int is_partition_valid(const struct cpuset *cs) |
330 | { |
331 | return cs->partition_root_state > 0; |
332 | } |
333 | |
334 | static inline int is_partition_invalid(const struct cpuset *cs) |
335 | { |
336 | return cs->partition_root_state < 0; |
337 | } |
338 | |
339 | /* |
340 | * Callers should hold callback_lock to modify partition_root_state. |
341 | */ |
342 | static inline void make_partition_invalid(struct cpuset *cs) |
343 | { |
344 | if (cs->partition_root_state > 0) |
345 | cs->partition_root_state = -cs->partition_root_state; |
346 | } |
347 | |
348 | /* |
 * Send notification event whenever partition_root_state changes.
350 | */ |
351 | static inline void notify_partition_change(struct cpuset *cs, int old_prs) |
352 | { |
353 | if (old_prs == cs->partition_root_state) |
354 | return; |
	cgroup_file_notify(&cs->partition_file);
356 | |
357 | /* Reset prs_err if not invalid */ |
358 | if (is_partition_valid(cs)) |
359 | WRITE_ONCE(cs->prs_err, PERR_NONE); |
360 | } |
361 | |
362 | static struct cpuset top_cpuset = { |
363 | .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | |
364 | (1 << CS_MEM_EXCLUSIVE)), |
365 | .partition_root_state = PRS_ROOT, |
366 | .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), |
367 | }; |
368 | |
369 | /** |
370 | * cpuset_for_each_child - traverse online children of a cpuset |
371 | * @child_cs: loop cursor pointing to the current child |
372 | * @pos_css: used for iteration |
373 | * @parent_cs: target cpuset to walk children of |
374 | * |
375 | * Walk @child_cs through the online children of @parent_cs. Must be used |
376 | * with RCU read locked. |
377 | */ |
378 | #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ |
379 | css_for_each_child((pos_css), &(parent_cs)->css) \ |
380 | if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) |
381 | |
382 | /** |
383 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants |
384 | * @des_cs: loop cursor pointing to the current descendant |
385 | * @pos_css: used for iteration |
 * @root_cs: target cpuset to walk descendants of
387 | * |
388 | * Walk @des_cs through the online descendants of @root_cs. Must be used |
389 | * with RCU read locked. The caller may modify @pos_css by calling |
390 | * css_rightmost_descendant() to skip subtree. @root_cs is included in the |
391 | * iteration and the first node to be visited. |
392 | */ |
393 | #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ |
394 | css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ |
395 | if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) |
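
/*
 * An illustrative usage sketch (assumed, not part of the original file):
 * both iteration macros must run under rcu_read_lock(), e.g.
 *
 *	rcu_read_lock();
 *	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 *		if (cpumask_empty(cp->cpus_allowed))
 *			pos_css = css_rightmost_descendant(pos_css);
 *	}
 *	rcu_read_unlock();
 *
 * update_domain_attr_tree() below follows this exact pattern.
 */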
396 | |
397 | /* |
398 | * There are two global locks guarding cpuset structures - cpuset_mutex and |
399 | * callback_lock. We also require taking task_lock() when dereferencing a |
400 | * task's cpuset pointer. See "The task_lock() exception", at the end of this |
401 | * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems |
402 | * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset |
403 | * structures. Note that cpuset_mutex needs to be a mutex as it is used in |
404 | * paths that rely on priority inheritance (e.g. scheduler - on RT) for |
405 | * correctness. |
406 | * |
407 | * A task must hold both locks to modify cpusets. If a task holds |
408 | * cpuset_mutex, it blocks others, ensuring that it is the only task able to |
409 | * also acquire callback_lock and be able to modify cpusets. It can perform |
410 | * various checks on the cpuset structure first, knowing nothing will change. |
411 | * It can also allocate memory while just holding cpuset_mutex. While it is |
412 | * performing these checks, various callback routines can briefly acquire |
413 | * callback_lock to query cpusets. Once it is ready to make the changes, it |
414 | * takes callback_lock, blocking everyone else. |
415 | * |
416 | * Calls to the kernel memory allocator can not be made while holding |
417 | * callback_lock, as that would risk double tripping on callback_lock |
418 | * from one of the callbacks into the cpuset code from within |
419 | * __alloc_pages(). |
420 | * |
421 | * If a task is only holding callback_lock, then it has read-only |
422 | * access to cpusets. |
423 | * |
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by other tasks, so we use alloc_lock in the task_struct to protect
 * them.
427 | * |
428 | * The cpuset_common_file_read() handlers only hold callback_lock across |
429 | * small pieces of code, such as when reading out possibly multi-word |
430 | * cpumasks and nodemasks. |
431 | * |
432 | * Accessing a task's cpuset should be done in accordance with the |
433 | * guidelines for accessing subsystem state in kernel/cgroup.c |
434 | */ |
435 | |
436 | static DEFINE_MUTEX(cpuset_mutex); |
437 | |
438 | void cpuset_lock(void) |
439 | { |
440 | mutex_lock(&cpuset_mutex); |
441 | } |
442 | |
443 | void cpuset_unlock(void) |
444 | { |
	mutex_unlock(&cpuset_mutex);
446 | } |
447 | |
448 | static DEFINE_SPINLOCK(callback_lock); |
449 | |
450 | static struct workqueue_struct *cpuset_migrate_mm_wq; |
451 | |
452 | /* |
453 | * CPU / memory hotplug is handled asynchronously. |
454 | */ |
455 | static void cpuset_hotplug_workfn(struct work_struct *work); |
456 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); |
457 | |
458 | static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); |
459 | |
460 | static inline void check_insane_mems_config(nodemask_t *nodes) |
461 | { |
462 | if (!cpusets_insane_config() && |
463 | movable_only_nodes(nodes)) { |
464 | static_branch_enable(&cpusets_insane_config_key); |
465 | pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n" |
466 | "Cpuset allocations might fail even with a lot of memory available.\n" , |
467 | nodemask_pr_args(nodes)); |
468 | } |
469 | } |
470 | |
471 | /* |
472 | * Cgroup v2 behavior is used on the "cpus" and "mems" control files when |
473 | * on default hierarchy or when the cpuset_v2_mode flag is set by mounting |
474 | * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. |
475 | * With v2 behavior, "cpus" and "mems" are always what the users have |
476 | * requested and won't be changed by hotplug events. Only the effective |
477 | * cpus or mems will be affected. |
478 | */ |
479 | static inline bool is_in_v2_mode(void) |
480 | { |
481 | return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || |
482 | (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); |
483 | } |
484 | |
485 | /** |
486 | * partition_is_populated - check if partition has tasks |
487 | * @cs: partition root to be checked |
488 | * @excluded_child: a child cpuset to be excluded in task checking |
489 | * Return: true if there are tasks, false otherwise |
490 | * |
491 | * It is assumed that @cs is a valid partition root. @excluded_child should |
492 | * be non-NULL when this cpuset is going to become a partition itself. |
493 | */ |
494 | static inline bool partition_is_populated(struct cpuset *cs, |
495 | struct cpuset *excluded_child) |
496 | { |
497 | struct cgroup_subsys_state *css; |
498 | struct cpuset *child; |
499 | |
500 | if (cs->css.cgroup->nr_populated_csets) |
501 | return true; |
502 | if (!excluded_child && !cs->nr_subparts) |
		return cgroup_is_populated(cs->css.cgroup);
504 | |
505 | rcu_read_lock(); |
506 | cpuset_for_each_child(child, css, cs) { |
507 | if (child == excluded_child) |
508 | continue; |
		if (is_partition_valid(child))
			continue;
		if (cgroup_is_populated(child->css.cgroup)) {
512 | rcu_read_unlock(); |
513 | return true; |
514 | } |
515 | } |
516 | rcu_read_unlock(); |
517 | return false; |
518 | } |
519 | |
520 | /* |
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
522 | * are online and are capable of running the task. If none are found, |
523 | * walk up the cpuset hierarchy until we find one that does have some |
524 | * appropriate cpus. |
525 | * |
526 | * One way or another, we guarantee to return some non-empty subset |
527 | * of cpu_online_mask. |
528 | * |
529 | * Call with callback_lock or cpuset_mutex held. |
530 | */ |
531 | static void guarantee_online_cpus(struct task_struct *tsk, |
532 | struct cpumask *pmask) |
533 | { |
534 | const struct cpumask *possible_mask = task_cpu_possible_mask(tsk); |
535 | struct cpuset *cs; |
536 | |
537 | if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask))) |
		cpumask_copy(pmask, cpu_online_mask);
539 | |
540 | rcu_read_lock(); |
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
544 | cs = parent_cs(cs); |
545 | if (unlikely(!cs)) { |
546 | /* |
547 | * The top cpuset doesn't have any online cpu as a |
548 | * consequence of a race between cpuset_hotplug_work |
549 | * and cpu hotplug notifier. But we know the top |
550 | * cpuset's effective_cpus is on its way to be |
551 | * identical to cpu_online_mask. |
552 | */ |
553 | goto out_unlock; |
554 | } |
555 | } |
	cpumask_and(pmask, pmask, cs->effective_cpus);
557 | |
558 | out_unlock: |
559 | rcu_read_unlock(); |
560 | } |
561 | |
562 | /* |
563 | * Return in *pmask the portion of a cpusets's mems_allowed that |
564 | * are online, with memory. If none are online with memory, walk |
565 | * up the cpuset hierarchy until we find one that does have some |
566 | * online mems. The top cpuset always has some mems online. |
567 | * |
568 | * One way or another, we guarantee to return some non-empty subset |
569 | * of node_states[N_MEMORY]. |
570 | * |
571 | * Call with callback_lock or cpuset_mutex held. |
572 | */ |
573 | static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) |
574 | { |
575 | while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) |
576 | cs = parent_cs(cs); |
577 | nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); |
578 | } |
579 | |
580 | /* |
581 | * update task's spread flag if cpuset's page/slab spread flag is set |
582 | * |
583 | * Call with callback_lock or cpuset_mutex held. The check can be skipped |
584 | * if on default hierarchy. |
585 | */ |
586 | static void cpuset_update_task_spread_flags(struct cpuset *cs, |
587 | struct task_struct *tsk) |
588 | { |
589 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) |
590 | return; |
591 | |
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
601 | } |
602 | |
603 | /* |
604 | * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? |
605 | * |
606 | * One cpuset is a subset of another if all its allowed CPUs and |
607 | * Memory Nodes are a subset of the other, and its exclusive flags |
608 | * are only set if the other's are set. Call holding cpuset_mutex. |
609 | */ |
610 | |
611 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
612 | { |
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
617 | } |
618 | |
619 | /** |
 * alloc_cpumasks - allocate up to four cpumasks for a cpuset
 * @cs: the cpuset that has cpumasks to be allocated.
622 | * @tmp: the tmpmasks structure pointer |
623 | * Return: 0 if successful, -ENOMEM otherwise. |
624 | * |
625 | * Only one of the two input arguments should be non-NULL. |
626 | */ |
627 | static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) |
628 | { |
629 | cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; |
630 | |
631 | if (cs) { |
632 | pmask1 = &cs->cpus_allowed; |
633 | pmask2 = &cs->effective_cpus; |
634 | pmask3 = &cs->effective_xcpus; |
635 | pmask4 = &cs->exclusive_cpus; |
636 | } else { |
637 | pmask1 = &tmp->new_cpus; |
638 | pmask2 = &tmp->addmask; |
639 | pmask3 = &tmp->delmask; |
640 | pmask4 = NULL; |
641 | } |
642 | |
	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
		goto free_three;

	return 0;

free_three:
	free_cpumask_var(*pmask3);
free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
665 | } |
666 | |
667 | /** |
 * free_cpumasks - free cpumasks in a cpuset and/or a tmpmasks structure
 * @cs: the cpuset whose cpumasks are to be freed.
670 | * @tmp: the tmpmasks structure pointer |
671 | */ |
672 | static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) |
673 | { |
674 | if (cs) { |
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->effective_xcpus);
		free_cpumask_var(cs->exclusive_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
684 | } |
685 | } |
686 | |
687 | /** |
688 | * alloc_trial_cpuset - allocate a trial cpuset |
689 | * @cs: the cpuset that the trial cpuset duplicates |
690 | */ |
691 | static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) |
692 | { |
693 | struct cpuset *trial; |
694 | |
	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
696 | if (!trial) |
697 | return NULL; |
698 | |
	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
701 | return NULL; |
702 | } |
703 | |
	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
	cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
708 | return trial; |
709 | } |
710 | |
711 | /** |
712 | * free_cpuset - free the cpuset |
713 | * @cs: the cpuset to be freed |
714 | */ |
715 | static inline void free_cpuset(struct cpuset *cs) |
716 | { |
717 | free_cpumasks(cs, NULL); |
	kfree(cs);
719 | } |
720 | |
721 | static inline struct cpumask *fetch_xcpus(struct cpuset *cs) |
722 | { |
	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
		cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
						   : cs->effective_xcpus;
726 | } |
727 | |
728 | /* |
729 | * cpusets_are_exclusive() - check if two cpusets are exclusive |
730 | * |
731 | * Return true if exclusive, false if not |
732 | */ |
733 | static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2) |
734 | { |
	struct cpumask *xcpus1 = fetch_xcpus(cs1);
	struct cpumask *xcpus2 = fetch_xcpus(cs2);

	if (cpumask_intersects(xcpus1, xcpus2))
739 | return false; |
740 | return true; |
741 | } |
742 | |
743 | /* |
744 | * validate_change_legacy() - Validate conditions specific to legacy (v1) |
745 | * behavior. |
746 | */ |
747 | static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) |
748 | { |
749 | struct cgroup_subsys_state *css; |
750 | struct cpuset *c, *par; |
751 | int ret; |
752 | |
753 | WARN_ON_ONCE(!rcu_read_lock_held()); |
754 | |
755 | /* Each of our child cpusets must be a subset of us */ |
756 | ret = -EBUSY; |
757 | cpuset_for_each_child(c, css, cur) |
		if (!is_cpuset_subset(c, trial))
759 | goto out; |
760 | |
761 | /* On legacy hierarchy, we must be a subset of our parent cpuset. */ |
762 | ret = -EACCES; |
	par = parent_cs(cur);
	if (par && !is_cpuset_subset(trial, par))
765 | goto out; |
766 | |
767 | ret = 0; |
768 | out: |
769 | return ret; |
770 | } |
771 | |
772 | /* |
773 | * validate_change() - Used to validate that any proposed cpuset change |
774 | * follows the structural rules for cpusets. |
775 | * |
776 | * If we replaced the flag and mask values of the current cpuset |
777 | * (cur) with those values in the trial cpuset (trial), would |
778 | * our various subset and exclusive rules still be valid? Presumes |
779 | * cpuset_mutex held. |
780 | * |
781 | * 'cur' is the address of an actual, in-use cpuset. Operations |
782 | * such as list traversal that depend on the actual address of the |
783 | * cpuset in the list must use cur below, not trial. |
784 | * |
785 | * 'trial' is the address of bulk structure copy of cur, with |
786 | * perhaps one or more of the fields cpus_allowed, mems_allowed, |
787 | * or flags changed to new, trial values. |
788 | * |
789 | * Return 0 if valid, -errno if not. |
790 | */ |
791 | |
792 | static int validate_change(struct cpuset *cur, struct cpuset *trial) |
793 | { |
794 | struct cgroup_subsys_state *css; |
795 | struct cpuset *c, *par; |
796 | int ret = 0; |
797 | |
798 | rcu_read_lock(); |
799 | |
800 | if (!is_in_v2_mode()) |
801 | ret = validate_change_legacy(cur, trial); |
802 | if (ret) |
803 | goto out; |
804 | |
805 | /* Remaining checks don't apply to root cpuset */ |
806 | if (cur == &top_cpuset) |
807 | goto out; |
808 | |
	par = parent_cs(cur);
810 | |
811 | /* |
812 | * Cpusets with tasks - existing or newly being attached - can't |
813 | * be changed to have empty cpus_allowed or mems_allowed. |
814 | */ |
815 | ret = -ENOSPC; |
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
819 | goto out; |
820 | if (!nodes_empty(cur->mems_allowed) && |
821 | nodes_empty(trial->mems_allowed)) |
822 | goto out; |
823 | } |
824 | |
825 | /* |
826 | * We can't shrink if we won't have enough room for SCHED_DEADLINE |
827 | * tasks. |
828 | */ |
829 | ret = -EBUSY; |
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
833 | goto out; |
834 | |
835 | /* |
836 | * If either I or some sibling (!= me) is exclusive, we can't |
837 | * overlap |
838 | */ |
839 | ret = -EINVAL; |
840 | cpuset_for_each_child(c, css, par) { |
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur) {
			if (!cpusets_are_exclusive(trial, c))
				goto out;
		}
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
847 | c != cur && |
848 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) |
849 | goto out; |
850 | } |
851 | |
852 | ret = 0; |
853 | out: |
854 | rcu_read_unlock(); |
855 | return ret; |
856 | } |
857 | |
858 | #ifdef CONFIG_SMP |
859 | /* |
860 | * Helper routine for generate_sched_domains(). |
861 | * Do cpusets a, b have overlapping effective cpus_allowed masks? |
862 | */ |
863 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
864 | { |
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
866 | } |
867 | |
868 | static void |
869 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) |
870 | { |
871 | if (dattr->relax_domain_level < c->relax_domain_level) |
872 | dattr->relax_domain_level = c->relax_domain_level; |
873 | return; |
874 | } |
875 | |
876 | static void update_domain_attr_tree(struct sched_domain_attr *dattr, |
877 | struct cpuset *root_cs) |
878 | { |
879 | struct cpuset *cp; |
880 | struct cgroup_subsys_state *pos_css; |
881 | |
882 | rcu_read_lock(); |
883 | cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { |
884 | /* skip the whole subtree if @cp doesn't have any CPU */ |
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
892 | } |
893 | rcu_read_unlock(); |
894 | } |
895 | |
896 | /* Must be called with cpuset_mutex held. */ |
897 | static inline int nr_cpusets(void) |
898 | { |
899 | /* jump label reference count + the top-level cpuset */ |
	return static_key_count(&cpusets_enabled_key.key) + 1;
901 | } |
902 | |
903 | /* |
904 | * generate_sched_domains() |
905 | * |
 * This function builds a partial partition of the system's CPUs.
907 | * A 'partial partition' is a set of non-overlapping subsets whose |
908 | * union is a subset of that set. |
909 | * The output of this function needs to be passed to kernel/sched/core.c |
910 | * partition_sched_domains() routine, which will rebuild the scheduler's |
911 | * load balancing domains (sched domains) as specified by that partial |
912 | * partition. |
913 | * |
914 | * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst |
915 | * for a background explanation of this. |
916 | * |
917 | * Does not return errors, on the theory that the callers of this |
918 | * routine would rather not worry about failures to rebuild sched |
919 | * domains when operating in the severe memory shortage situations |
920 | * that could cause allocation failures below. |
921 | * |
922 | * Must be called with cpuset_mutex held. |
923 | * |
924 | * The three key local variables below are: |
925 | * cp - cpuset pointer, used (together with pos_css) to perform a |
926 | * top-down scan of all cpusets. For our purposes, rebuilding |
927 | * the schedulers sched domains, we can ignore !is_sched_load_ |
928 | * balance cpusets. |
929 | * csa - (for CpuSet Array) Array of pointers to all the cpusets |
930 | * that need to be load balanced, for convenient iterative |
931 | * access by the subsequent code that finds the best partition, |
 * i.e. the set of domains (subsets) of CPUs such that the
933 | * cpus_allowed of every cpuset marked is_sched_load_balance |
934 | * is a subset of one of these domains, while there are as |
935 | * many such domains as possible, each as small as possible. |
936 | * doms - Conversion of 'csa' to an array of cpumasks, for passing to |
937 | * the kernel/sched/core.c routine partition_sched_domains() in a |
938 | * convenient format, that can be easily compared to the prior |
939 | * value to determine what partition elements (sched domains) |
940 | * were changed (added or removed.) |
941 | * |
942 | * Finding the best partition (set of domains): |
943 | * The triple nested loops below over i, j, k scan over the |
944 | * load balanced cpusets (using the array of cpuset pointers in |
945 | * csa[]) looking for pairs of cpusets that have overlapping |
946 | * cpus_allowed, but which don't have the same 'pn' partition |
 * number, and merges them into the same partition. It keeps
948 | * looping on the 'restart' label until it can no longer find |
949 | * any such pairs. |
950 | * |
951 | * The union of the cpus_allowed masks from the set of |
952 | * all cpusets having the same 'pn' value then form the one |
953 | * element of the partition (one sched domain) to be passed to |
954 | * partition_sched_domains(). |
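 *
 * A small worked example (illustrative, not from the original text):
 * suppose the load-balanced cpusets are A (cpus 0-3), B (cpus 2-5) and
 * C (cpus 6-7). A and B overlap, so they are merged into one partition
 * number; C overlaps neither. The result is ndoms == 2, with sched
 * domains {0-5} and {6-7}.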
955 | */ |
956 | static int generate_sched_domains(cpumask_var_t **domains, |
957 | struct sched_domain_attr **attributes) |
958 | { |
959 | struct cpuset *cp; /* top-down scan of cpusets */ |
960 | struct cpuset **csa; /* array of all cpuset ptrs */ |
961 | int csn; /* how many cpuset ptrs in csa so far */ |
962 | int i, j, k; /* indices for partition finding loops */ |
963 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ |
964 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
965 | int ndoms = 0; /* number of sched domains in result */ |
966 | int nslot; /* next empty doms[] struct cpumask slot */ |
967 | struct cgroup_subsys_state *pos_css; |
	bool root_load_balance = is_sched_load_balance(&top_cpuset);
969 | |
970 | doms = NULL; |
971 | dattr = NULL; |
972 | csa = NULL; |
973 | |
974 | /* Special case for the 99% of systems with one, full, sched domain */ |
975 | if (root_load_balance && !top_cpuset.nr_subparts) { |
976 | ndoms = 1; |
977 | doms = alloc_sched_domains(ndoms); |
978 | if (!doms) |
979 | goto done; |
980 | |
		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_TYPE_DOMAIN));
988 | |
989 | goto done; |
990 | } |
991 | |
	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
993 | if (!csa) |
994 | goto done; |
995 | csn = 0; |
996 | |
997 | rcu_read_lock(); |
998 | if (root_load_balance) |
999 | csa[csn++] = &top_cpuset; |
1000 | cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { |
1001 | if (cp == &top_cpuset) |
1002 | continue; |
1003 | /* |
1004 | * Continue traversing beyond @cp iff @cp has some CPUs and |
1005 | * isn't load balancing. The former is obvious. The |
1006 | * latter: All child cpusets contain a subset of the |
1007 | * parent's cpus, so just skip them, and then we call |
1008 | * update_domain_attr_tree() to calc relax_domain_level of |
1009 | * the corresponding sched domain. |
1010 | * |
1011 | * If root is load-balancing, we can skip @cp if it |
1012 | * is a subset of the root's effective_cpus. |
1013 | */ |
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
			continue;

		if (root_load_balance &&
		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree if not a partition root */
		if (!is_partition_valid(cp))
			pos_css = css_rightmost_descendant(pos_css);
1031 | } |
1032 | rcu_read_unlock(); |
1033 | |
1034 | for (i = 0; i < csn; i++) |
1035 | csa[i]->pn = i; |
1036 | ndoms = csn; |
1037 | |
1038 | restart: |
1039 | /* Find the best partition (set of sched domains) */ |
1040 | for (i = 0; i < csn; i++) { |
1041 | struct cpuset *a = csa[i]; |
1042 | int apn = a->pn; |
1043 | |
1044 | for (j = 0; j < csn; j++) { |
1045 | struct cpuset *b = csa[j]; |
1046 | int bpn = b->pn; |
1047 | |
1048 | if (apn != bpn && cpusets_overlap(a, b)) { |
1049 | for (k = 0; k < csn; k++) { |
1050 | struct cpuset *c = csa[k]; |
1051 | |
1052 | if (c->pn == bpn) |
1053 | c->pn = apn; |
1054 | } |
1055 | ndoms--; /* one less element */ |
1056 | goto restart; |
1057 | } |
1058 | } |
1059 | } |
1060 | |
1061 | /* |
1062 | * Now we know how many domains to create. |
1063 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. |
1064 | */ |
1065 | doms = alloc_sched_domains(ndoms); |
1066 | if (!doms) |
1067 | goto done; |
1068 | |
1069 | /* |
1070 | * The rest of the code, including the scheduler, can deal with |
1071 | * dattr==NULL case. No need to abort if alloc fails. |
1072 | */ |
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);
1075 | |
1076 | for (nslot = 0, i = 0; i < csn; i++) { |
1077 | struct cpuset *a = csa[i]; |
1078 | struct cpumask *dp; |
1079 | int apn = a->pn; |
1080 | |
1081 | if (apn < 0) { |
1082 | /* Skip completed partitions */ |
1083 | continue; |
1084 | } |
1085 | |
1086 | dp = doms[nslot]; |
1087 | |
1088 | if (nslot == ndoms) { |
1089 | static int warnings = 10; |
1090 | if (warnings) { |
1091 | pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n" , |
1092 | nslot, ndoms, csn, i, apn); |
1093 | warnings--; |
1094 | } |
1095 | continue; |
1096 | } |
1097 | |
		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);
1109 | |
1110 | /* Done with this partition */ |
1111 | b->pn = -1; |
1112 | } |
1113 | } |
1114 | nslot++; |
1115 | } |
1116 | BUG_ON(nslot != ndoms); |
1117 | |
1118 | done: |
	kfree(csa);
1120 | |
1121 | /* |
1122 | * Fallback to the default domain if kmalloc() failed. |
1123 | * See comments in partition_sched_domains(). |
1124 | */ |
1125 | if (doms == NULL) |
1126 | ndoms = 1; |
1127 | |
1128 | *domains = doms; |
1129 | *attributes = dattr; |
1130 | return ndoms; |
1131 | } |
1132 | |
1133 | static void dl_update_tasks_root_domain(struct cpuset *cs) |
1134 | { |
1135 | struct css_task_iter it; |
1136 | struct task_struct *task; |
1137 | |
1138 | if (cs->nr_deadline_tasks == 0) |
1139 | return; |
1140 | |
	css_task_iter_start(&cs->css, 0, &it);

	while ((task = css_task_iter_next(&it)))
		dl_add_task_root_domain(task);

	css_task_iter_end(&it);
1147 | } |
1148 | |
1149 | static void dl_rebuild_rd_accounting(void) |
1150 | { |
1151 | struct cpuset *cs = NULL; |
1152 | struct cgroup_subsys_state *pos_css; |
1153 | |
1154 | lockdep_assert_held(&cpuset_mutex); |
1155 | lockdep_assert_cpus_held(); |
1156 | lockdep_assert_held(&sched_domains_mutex); |
1157 | |
1158 | rcu_read_lock(); |
1159 | |
1160 | /* |
1161 | * Clear default root domain DL accounting, it will be computed again |
1162 | * if a task belongs to it. |
1163 | */ |
	dl_clear_root_domain(&def_root_domain);
1165 | |
1166 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
1167 | |
		if (cpumask_empty(cs->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		css_get(&cs->css);

		rcu_read_unlock();

		dl_update_tasks_root_domain(cs);

		rcu_read_lock();
		css_put(&cs->css);
1181 | } |
1182 | rcu_read_unlock(); |
1183 | } |
1184 | |
1185 | static void |
1186 | partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
1187 | struct sched_domain_attr *dattr_new) |
1188 | { |
1189 | mutex_lock(&sched_domains_mutex); |
1190 | partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); |
1191 | dl_rebuild_rd_accounting(); |
	mutex_unlock(&sched_domains_mutex);
1193 | } |
1194 | |
1195 | /* |
1196 | * Rebuild scheduler domains. |
1197 | * |
1198 | * If the flag 'sched_load_balance' of any cpuset with non-empty |
1199 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset |
1200 | * which has that flag enabled, or if any cpuset with a non-empty |
1201 | * 'cpus' is removed, then call this routine to rebuild the |
1202 | * scheduler's dynamic sched domains. |
1203 | * |
1204 | * Call with cpuset_mutex held. Takes cpus_read_lock(). |
1205 | */ |
1206 | static void rebuild_sched_domains_locked(void) |
1207 | { |
1208 | struct cgroup_subsys_state *pos_css; |
1209 | struct sched_domain_attr *attr; |
1210 | cpumask_var_t *doms; |
1211 | struct cpuset *cs; |
1212 | int ndoms; |
1213 | |
1214 | lockdep_assert_cpus_held(); |
1215 | lockdep_assert_held(&cpuset_mutex); |
1216 | |
1217 | /* |
1218 | * If we have raced with CPU hotplug, return early to avoid |
1219 | * passing doms with offlined cpu to partition_sched_domains(). |
1220 | * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. |
1221 | * |
1222 | * With no CPUs in any subpartitions, top_cpuset's effective CPUs |
1223 | * should be the same as the active CPUs, so checking only top_cpuset |
1224 | * is enough to detect racing CPU offlines. |
1225 | */ |
	if (cpumask_empty(subpartitions_cpus) &&
	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
1228 | return; |
1229 | |
1230 | /* |
1231 | * With subpartition CPUs, however, the effective CPUs of a partition |
1232 | * root should be only a subset of the active CPUs. Since a CPU in any |
1233 | * partition root could be offlined, all must be checked. |
1234 | */ |
1235 | if (top_cpuset.nr_subparts) { |
1236 | rcu_read_lock(); |
1237 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
1238 | if (!is_partition_valid(cs)) { |
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			if (!cpumask_subset(cs->effective_cpus,
					    cpu_active_mask)) {
1244 | rcu_read_unlock(); |
1245 | return; |
1246 | } |
1247 | } |
1248 | rcu_read_unlock(); |
1249 | } |
1250 | |
1251 | /* Generate domain masks and attrs */ |
	ndoms = generate_sched_domains(&doms, &attr);
1253 | |
1254 | /* Have scheduler rebuild the domains */ |
	partition_and_rebuild_sched_domains(ndoms, doms, attr);
1256 | } |
1257 | #else /* !CONFIG_SMP */ |
1258 | static void rebuild_sched_domains_locked(void) |
1259 | { |
1260 | } |
1261 | #endif /* CONFIG_SMP */ |
1262 | |
1263 | void rebuild_sched_domains(void) |
1264 | { |
1265 | cpus_read_lock(); |
1266 | mutex_lock(&cpuset_mutex); |
1267 | rebuild_sched_domains_locked(); |
	mutex_unlock(&cpuset_mutex);
1269 | cpus_read_unlock(); |
1270 | } |
1271 | |
1272 | /** |
1273 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. |
1274 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
1275 | * @new_cpus: the temp variable for the new effective_cpus mask |
1276 | * |
1277 | * Iterate through each task of @cs updating its cpus_allowed to the |
1278 | * effective cpuset's. As this function is called with cpuset_mutex held, |
1279 | * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() |
1280 | * is used instead of effective_cpus to make sure all offline CPUs are also |
1281 | * included as hotplug code won't update cpumasks for tasks in top_cpuset. |
1282 | */ |
1283 | static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus) |
1284 | { |
1285 | struct css_task_iter it; |
1286 | struct task_struct *task; |
1287 | bool top_cs = cs == &top_cpuset; |
1288 | |
	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		const struct cpumask *possible_mask = task_cpu_possible_mask(task);

		if (top_cs) {
			/*
			 * Percpu kthreads in top_cpuset are ignored
			 */
			if (kthread_is_per_cpu(task))
				continue;
			cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
		} else {
			cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
		}
		set_cpus_allowed_ptr(task, new_cpus);
	}
	css_task_iter_end(&it);
1306 | } |
1307 | |
1308 | /** |
1309 | * compute_effective_cpumask - Compute the effective cpumask of the cpuset |
1310 | * @new_cpus: the temp variable for the new effective_cpus mask |
 * @cs: the cpuset that needs to recompute the new effective_cpus mask
1312 | * @parent: the parent cpuset |
1313 | * |
1314 | * The result is valid only if the given cpuset isn't a partition root. |
1315 | */ |
1316 | static void compute_effective_cpumask(struct cpumask *new_cpus, |
1317 | struct cpuset *cs, struct cpuset *parent) |
1318 | { |
	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1320 | } |
1321 | |
1322 | /* |
1323 | * Commands for update_parent_effective_cpumask |
1324 | */ |
1325 | enum partition_cmd { |
1326 | partcmd_enable, /* Enable partition root */ |
1327 | partcmd_enablei, /* Enable isolated partition root */ |
1328 | partcmd_disable, /* Disable partition root */ |
1329 | partcmd_update, /* Update parent's effective_cpus */ |
1330 | partcmd_invalidate, /* Make partition invalid */ |
1331 | }; |
1332 | |
1333 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
1334 | int turning_on); |
1335 | static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, |
1336 | struct tmpmasks *tmp); |
1337 | |
1338 | /* |
1339 | * Update partition exclusive flag |
1340 | * |
1341 | * Return: 0 if successful, an error code otherwise |
1342 | */ |
1343 | static int update_partition_exclusive(struct cpuset *cs, int new_prs) |
1344 | { |
1345 | bool exclusive = (new_prs > 0); |
1346 | |
1347 | if (exclusive && !is_cpu_exclusive(cs)) { |
		if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
			return PERR_NOTEXCL;
	} else if (!exclusive && is_cpu_exclusive(cs)) {
		/* Turning off CS_CPU_EXCLUSIVE will not return error */
		update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1353 | } |
1354 | return 0; |
1355 | } |
1356 | |
1357 | /* |
1358 | * Update partition load balance flag and/or rebuild sched domain |
1359 | * |
1360 | * Changing load balance flag will automatically call |
1361 | * rebuild_sched_domains_locked(). |
1362 | * This function is for cgroup v2 only. |
1363 | */ |
1364 | static void update_partition_sd_lb(struct cpuset *cs, int old_prs) |
1365 | { |
1366 | int new_prs = cs->partition_root_state; |
1367 | bool rebuild_domains = (new_prs > 0) || (old_prs > 0); |
1368 | bool new_lb; |
1369 | |
1370 | /* |
1371 | * If cs is not a valid partition root, the load balance state |
1372 | * will follow its parent. |
1373 | */ |
1374 | if (new_prs > 0) { |
1375 | new_lb = (new_prs != PRS_ISOLATED); |
1376 | } else { |
		new_lb = is_sched_load_balance(parent_cs(cs));
1378 | } |
1379 | if (new_lb != !!is_sched_load_balance(cs)) { |
1380 | rebuild_domains = true; |
1381 | if (new_lb) |
			set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
		else
			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1385 | } |
1386 | |
1387 | if (rebuild_domains) |
1388 | rebuild_sched_domains_locked(); |
1389 | } |
1390 | |
1391 | /* |
1392 | * tasks_nocpu_error - Return true if tasks will have no effective_cpus |
1393 | */ |
1394 | static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs, |
1395 | struct cpumask *xcpus) |
1396 | { |
1397 | /* |
1398 | * A populated partition (cs or parent) can't have empty effective_cpus |
1399 | */ |
	return (cpumask_subset(parent->effective_cpus, xcpus) &&
		partition_is_populated(parent, cs)) ||
	       (!cpumask_intersects(xcpus, cpu_active_mask) &&
		partition_is_populated(cs, NULL));
1404 | } |
1405 | |
1406 | static void reset_partition_data(struct cpuset *cs) |
1407 | { |
1408 | struct cpuset *parent = parent_cs(cs); |
1409 | |
1410 | if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) |
1411 | return; |
1412 | |
1413 | lockdep_assert_held(&callback_lock); |
1414 | |
1415 | cs->nr_subparts = 0; |
	if (cpumask_empty(cs->exclusive_cpus)) {
		cpumask_clear(cs->effective_xcpus);
		if (is_cpu_exclusive(cs))
			clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
	}
	if (!cpumask_and(cs->effective_cpus,
			 parent->effective_cpus, cs->cpus_allowed)) {
		cs->use_parent_ecpus = true;
		parent->child_ecpus_count++;
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
1426 | } |
1427 | } |
1428 | |
1429 | /* |
1430 | * partition_xcpus_newstate - Exclusive CPUs state change |
1431 | * @old_prs: old partition_root_state |
1432 | * @new_prs: new partition_root_state |
1433 | * @xcpus: exclusive CPUs with state change |
1434 | */ |
1435 | static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus) |
1436 | { |
1437 | WARN_ON_ONCE(old_prs == new_prs); |
1438 | if (new_prs == PRS_ISOLATED) |
		cpumask_or(isolated_cpus, isolated_cpus, xcpus);
	else
		cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
1442 | } |
1443 | |
1444 | /* |
1445 | * partition_xcpus_add - Add new exclusive CPUs to partition |
1446 | * @new_prs: new partition_root_state |
1447 | * @parent: parent cpuset |
1448 | * @xcpus: exclusive CPUs to be added |
1449 | * Return: true if isolated_cpus modified, false otherwise |
1450 | * |
1451 | * Remote partition if parent == NULL |
1452 | */ |
1453 | static bool partition_xcpus_add(int new_prs, struct cpuset *parent, |
1454 | struct cpumask *xcpus) |
1455 | { |
1456 | bool isolcpus_updated; |
1457 | |
1458 | WARN_ON_ONCE(new_prs < 0); |
1459 | lockdep_assert_held(&callback_lock); |
1460 | if (!parent) |
1461 | parent = &top_cpuset; |
1462 | |
1463 | |
	if (parent == &top_cpuset)
		cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);

	isolcpus_updated = (new_prs != parent->partition_root_state);
	if (isolcpus_updated)
		partition_xcpus_newstate(parent->partition_root_state, new_prs,
					 xcpus);

	cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
1473 | return isolcpus_updated; |
1474 | } |
1475 | |
1476 | /* |
1477 | * partition_xcpus_del - Remove exclusive CPUs from partition |
1478 | * @old_prs: old partition_root_state |
1479 | * @parent: parent cpuset |
1480 | * @xcpus: exclusive CPUs to be removed |
1481 | * Return: true if isolated_cpus modified, false otherwise |
1482 | * |
1483 | * Remote partition if parent == NULL |
1484 | */ |
1485 | static bool partition_xcpus_del(int old_prs, struct cpuset *parent, |
1486 | struct cpumask *xcpus) |
1487 | { |
1488 | bool isolcpus_updated; |
1489 | |
1490 | WARN_ON_ONCE(old_prs < 0); |
1491 | lockdep_assert_held(&callback_lock); |
1492 | if (!parent) |
1493 | parent = &top_cpuset; |
1494 | |
1495 | if (parent == &top_cpuset) |
		cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);

	isolcpus_updated = (old_prs != parent->partition_root_state);
	if (isolcpus_updated)
		partition_xcpus_newstate(old_prs, parent->partition_root_state,
					 xcpus);

	cpumask_and(xcpus, xcpus, cpu_active_mask);
	cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
1505 | return isolcpus_updated; |
1506 | } |
1507 | |
1508 | static void update_unbound_workqueue_cpumask(bool isolcpus_updated) |
1509 | { |
1510 | int ret; |
1511 | |
1512 | lockdep_assert_cpus_held(); |
1513 | |
1514 | if (!isolcpus_updated) |
1515 | return; |
1516 | |
	ret = workqueue_unbound_exclude_cpumask(isolated_cpus);
1518 | WARN_ON_ONCE(ret < 0); |
1519 | } |
1520 | |
1521 | /** |
1522 | * cpuset_cpu_is_isolated - Check if the given CPU is isolated |
1523 | * @cpu: the CPU number to be checked |
1524 | * Return: true if CPU is used in an isolated partition, false otherwise |
1525 | */ |
1526 | bool cpuset_cpu_is_isolated(int cpu) |
1527 | { |
	return cpumask_test_cpu(cpu, isolated_cpus);
1529 | } |
1530 | EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); |
1531 | |
1532 | /* |
1533 | * compute_effective_exclusive_cpumask - compute effective exclusive CPUs |
1534 | * @cs: cpuset |
1535 | * @xcpus: effective exclusive CPUs value to be set |
1536 | * Return: true if xcpus is not empty, false otherwise. |
1537 | * |
1538 | * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set), |
1539 | * it must be a subset of cpus_allowed and parent's effective_xcpus. |
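 *
 * For example (an illustrative sketch, not from the original comment):
 * with exclusive_cpus = 2-7, cpus_allowed = 0-5 and a parent
 * effective_xcpus of 4-15, the computed effective exclusive CPUs
 * would be 4-5.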
1540 | */ |
1541 | static bool compute_effective_exclusive_cpumask(struct cpuset *cs, |
1542 | struct cpumask *xcpus) |
1543 | { |
1544 | struct cpuset *parent = parent_cs(cs); |
1545 | |
1546 | if (!xcpus) |
1547 | xcpus = cs->effective_xcpus; |
1548 | |
	if (!cpumask_empty(cs->exclusive_cpus))
		cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
	else
		cpumask_copy(xcpus, cs->cpus_allowed);

	return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
1555 | } |
1556 | |
1557 | static inline bool is_remote_partition(struct cpuset *cs) |
1558 | { |
	return !list_empty(&cs->remote_sibling);
1560 | } |
1561 | |
1562 | static inline bool is_local_partition(struct cpuset *cs) |
1563 | { |
1564 | return is_partition_valid(cs) && !is_remote_partition(cs); |
1565 | } |
1566 | |
1567 | /* |
1568 | * remote_partition_enable - Enable current cpuset as a remote partition root |
1569 | * @cs: the cpuset to update |
1570 | * @new_prs: new partition_root_state |
 * @tmp: temporary masks
1572 | * Return: 1 if successful, 0 if error |
1573 | * |
1574 | * Enable the current cpuset to become a remote partition root taking CPUs |
1575 | * directly from the top cpuset. cpuset_mutex must be held by the caller. |
1576 | */ |
1577 | static int remote_partition_enable(struct cpuset *cs, int new_prs, |
1578 | struct tmpmasks *tmp) |
1579 | { |
1580 | bool isolcpus_updated; |
1581 | |
1582 | /* |
1583 | * The user must have sysadmin privilege. |
1584 | */ |
1585 | if (!capable(CAP_SYS_ADMIN)) |
1586 | return 0; |
1587 | |
1588 | /* |
1589 | * The requested exclusive_cpus must not be allocated to other |
1590 | * partitions and it can't use up all the root's effective_cpus. |
1591 | * |
1592 | * Note that if there is any local partition root above it or |
1593 | * remote partition root underneath it, its exclusive_cpus must |
1594 | * have overlapped with subpartitions_cpus. |
1595 | */ |
	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
	if (cpumask_empty(tmp->new_cpus) ||
	    cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
		return 0;

	spin_lock_irq(&callback_lock);
	isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
	list_add(&cs->remote_sibling, &remote_children);
1605 | if (cs->use_parent_ecpus) { |
1606 | struct cpuset *parent = parent_cs(cs); |
1607 | |
1608 | cs->use_parent_ecpus = false; |
1609 | parent->child_ecpus_count--; |
1610 | } |
	spin_unlock_irq(&callback_lock);
1612 | update_unbound_workqueue_cpumask(isolcpus_updated); |
1613 | |
1614 | /* |
1615 | * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. |
1616 | */ |
1617 | update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus); |
1618 | update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp); |
1619 | return 1; |
1620 | } |
1621 | |
1622 | /* |
1623 | * remote_partition_disable - Remove current cpuset from remote partition list |
1624 | * @cs: the cpuset to update |
 * @tmp: temporary masks
1626 | * |
1627 | * The effective_cpus is also updated. |
1628 | * |
1629 | * cpuset_mutex must be held by the caller. |
1630 | */ |
1631 | static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) |
1632 | { |
1633 | bool isolcpus_updated; |
1634 | |
	compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
1636 | WARN_ON_ONCE(!is_remote_partition(cs)); |
1637 | WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); |
1638 | |
	spin_lock_irq(&callback_lock);
	list_del_init(&cs->remote_sibling);
	isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
					       NULL, tmp->new_cpus);
	cs->partition_root_state = -cs->partition_root_state;
	if (!cs->prs_err)
		cs->prs_err = PERR_INVCPUS;
	reset_partition_data(cs);
	spin_unlock_irq(&callback_lock);
1648 | update_unbound_workqueue_cpumask(isolcpus_updated); |
1649 | |
1650 | /* |
1651 | * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. |
1652 | */ |
1653 | update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus); |
1654 | update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp); |
1655 | } |
1656 | |
1657 | /* |
1658 | * remote_cpus_update - cpus_exclusive change of remote partition |
1659 | * @cs: the cpuset to be updated |
1660 | * @newmask: the new effective_xcpus mask |
 * @tmp: temporary masks
1662 | * |
1663 | * top_cpuset and subpartitions_cpus will be updated or partition can be |
1664 | * invalidated. |
1665 | */ |
1666 | static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, |
1667 | struct tmpmasks *tmp) |
1668 | { |
1669 | bool adding, deleting; |
1670 | int prs = cs->partition_root_state; |
1671 | int isolcpus_updated = 0; |
1672 | |
1673 | if (WARN_ON_ONCE(!is_remote_partition(cs))) |
1674 | return; |
1675 | |
1676 | WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); |
1677 | |
	if (cpumask_empty(newmask))
		goto invalidate;

	adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
	deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
1683 | |
1684 | /* |
1685 | * Additions of remote CPUs is only allowed if those CPUs are |
1686 | * not allocated to other partitions and there are effective_cpus |
1687 | * left in the top cpuset. |
1688 | */ |
1689 | if (adding && (!capable(CAP_SYS_ADMIN) || |
1690 | cpumask_intersects(src1p: tmp->addmask, src2p: subpartitions_cpus) || |
1691 | cpumask_subset(src1p: top_cpuset.effective_cpus, src2p: tmp->addmask))) |
1692 | goto invalidate; |
1693 | |
	spin_lock_irq(&callback_lock);
	if (adding)
		isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
	if (deleting)
		isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
	spin_unlock_irq(&callback_lock);
1700 | update_unbound_workqueue_cpumask(isolcpus_updated); |
1701 | |
1702 | /* |
1703 | * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. |
1704 | */ |
1705 | update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus); |
1706 | update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp); |
1707 | return; |
1708 | |
1709 | invalidate: |
1710 | remote_partition_disable(cs, tmp); |
1711 | } |
1712 | |
1713 | /* |
1714 | * remote_partition_check - check if a child remote partition needs update |
1715 | * @cs: the cpuset to be updated |
1716 | * @newmask: the new effective_xcpus mask |
1717 | * @delmask: temporary mask for deletion (not in tmp) |
 * @tmp: temporary masks
1719 | * |
1720 | * This should be called before the given cs has updated its cpus_allowed |
1721 | * and/or effective_xcpus. |
1722 | */ |
1723 | static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, |
1724 | struct cpumask *delmask, struct tmpmasks *tmp) |
1725 | { |
1726 | struct cpuset *child, *next; |
1727 | int disable_cnt = 0; |
1728 | |
1729 | /* |
1730 | * Compute the effective exclusive CPUs that will be deleted. |
1731 | */ |
	if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
	    !cpumask_intersects(delmask, subpartitions_cpus))
1734 | return; /* No deletion of exclusive CPUs in partitions */ |
1735 | |
1736 | /* |
1737 | * Searching the remote children list to look for those that will |
1738 | * be impacted by the deletion of exclusive CPUs. |
1739 | * |
1740 | * Since a cpuset must be removed from the remote children list |
1741 | * before it can go offline and holding cpuset_mutex will prevent |
1742 | * any change in cpuset status. RCU read lock isn't needed. |
1743 | */ |
1744 | lockdep_assert_held(&cpuset_mutex); |
1745 | list_for_each_entry_safe(child, next, &remote_children, remote_sibling) |
		if (cpumask_intersects(child->effective_cpus, delmask)) {
			remote_partition_disable(child, tmp);
1748 | disable_cnt++; |
1749 | } |
1750 | if (disable_cnt) |
1751 | rebuild_sched_domains_locked(); |
1752 | } |
1753 | |
1754 | /* |
1755 | * prstate_housekeeping_conflict - check for partition & housekeeping conflicts |
1756 | * @prstate: partition root state to be checked |
1757 | * @new_cpus: cpu mask |
1758 | * Return: true if there is conflict, false otherwise |
1759 | * |
1760 | * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in |
1761 | * an isolated partition. |
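 *
 * For example, a CPU excluded from HK_TYPE_DOMAIN by the "isolcpus=" boot
 * parameter may only be placed in an isolated partition; requesting a
 * "root" partition with such a CPU is reported as a conflict.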
1762 | */ |
1763 | static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) |
1764 | { |
	const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
	bool all_in_hk = cpumask_subset(new_cpus, hk_domain);
1767 | |
1768 | if (!all_in_hk && (prstate != PRS_ISOLATED)) |
1769 | return true; |
1770 | |
1771 | return false; |
1772 | } |
1773 | |
1774 | /** |
1775 | * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset |
1776 | * @cs: The cpuset that requests change in partition root state |
1777 | * @cmd: Partition root state change command |
1778 | * @newmask: Optional new cpumask for partcmd_update |
1779 | * @tmp: Temporary addmask and delmask |
1780 | * Return: 0 or a partition root state error code |
1781 | * |
1782 | * For partcmd_enable*, the cpuset is being transformed from a non-partition |
1783 | * root to a partition root. The effective_xcpus (cpus_allowed if |
1784 | * effective_xcpus not set) mask of the given cpuset will be taken away from |
1785 | * parent's effective_cpus. The function will return 0 if all the CPUs listed |
1786 | * in effective_xcpus can be granted or an error code will be returned. |
1787 | * |
1788 | * For partcmd_disable, the cpuset is being transformed from a partition |
1789 | * root back to a non-partition root. Any CPUs in effective_xcpus will be |
1790 | * given back to parent's effective_cpus. 0 will always be returned. |
1791 | * |
1792 | * For partcmd_update, if the optional newmask is specified, the cpu list is |
1793 | * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is |
1794 | * assumed to remain the same. The cpuset should either be a valid or invalid |
1795 | * partition root. The partition root state may change from valid to invalid |
1796 | * or vice versa. An error code will be returned if transitioning from |
1797 | * invalid to valid violates the exclusivity rule. |
1798 | * |
1799 | * For partcmd_invalidate, the current partition will be made invalid. |
1800 | * |
1801 | * The partcmd_enable* and partcmd_disable commands are used by |
1802 | * update_prstate(). An error code may be returned and the caller will check |
1803 | * for error. |
1804 | * |
1805 | * The partcmd_update command is used by update_cpumasks_hier() with newmask |
1806 | * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used |
1807 | * by update_cpumask() with NULL newmask. In both cases, the callers won't |
1808 | * check for error and so partition_root_state and prs_error will be updated |
1809 | * directly. |
1810 | */ |
1811 | static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, |
1812 | struct cpumask *newmask, |
1813 | struct tmpmasks *tmp) |
1814 | { |
1815 | struct cpuset *parent = parent_cs(cs); |
1816 | int adding; /* Adding cpus to parent's effective_cpus */ |
1817 | int deleting; /* Deleting cpus from parent's effective_cpus */ |
1818 | int old_prs, new_prs; |
1819 | int part_error = PERR_NONE; /* Partition error? */ |
1820 | int subparts_delta = 0; |
1821 | struct cpumask *xcpus; /* cs effective_xcpus */ |
1822 | int isolcpus_updated = 0; |
1823 | bool nocpu; |
1824 | |
1825 | lockdep_assert_held(&cpuset_mutex); |
1826 | |
1827 | /* |
1828 | * new_prs will only be changed for the partcmd_update and |
1829 | * partcmd_invalidate commands. |
1830 | */ |
1831 | adding = deleting = false; |
1832 | old_prs = new_prs = cs->partition_root_state; |
	xcpus = !cpumask_empty(cs->exclusive_cpus)
		? cs->effective_xcpus : cs->cpus_allowed;
1835 | |
1836 | if (cmd == partcmd_invalidate) { |
		if (is_prs_invalid(old_prs))
1838 | return 0; |
1839 | |
1840 | /* |
1841 | * Make the current partition invalid. |
1842 | */ |
		if (is_partition_valid(parent))
			adding = cpumask_and(tmp->addmask,
					     xcpus, parent->effective_xcpus);
1846 | if (old_prs > 0) { |
1847 | new_prs = -old_prs; |
1848 | subparts_delta--; |
1849 | } |
1850 | goto write_error; |
1851 | } |
1852 | |
1853 | /* |
1854 | * The parent must be a partition root. |
1855 | * The new cpumask, if present, or the current cpus_allowed must |
1856 | * not be empty. |
1857 | */ |
	if (!is_partition_valid(parent)) {
		return is_partition_invalid(parent)
			? PERR_INVPARENT : PERR_NOTPART;
	}
	if (!newmask && cpumask_empty(cs->cpus_allowed))
1863 | return PERR_CPUSEMPTY; |
1864 | |
1865 | nocpu = tasks_nocpu_error(parent, cs, xcpus); |
1866 | |
1867 | if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { |
1868 | /* |
1869 | * Enabling partition root is not allowed if its |
1870 | * effective_xcpus is empty or doesn't overlap with |
1871 | * parent's effective_xcpus. |
1872 | */ |
		if (cpumask_empty(xcpus) ||
		    !cpumask_intersects(xcpus, parent->effective_xcpus))
			return PERR_INVCPUS;

		if (prstate_housekeeping_conflict(new_prs, xcpus))
1878 | return PERR_HKEEPING; |
1879 | |
1880 | /* |
1881 | * A parent can be left with no CPU as long as there is no |
1882 | * task directly associated with the parent partition. |
1883 | */ |
1884 | if (nocpu) |
1885 | return PERR_NOCPUS; |
1886 | |
		cpumask_copy(tmp->delmask, xcpus);
1888 | deleting = true; |
1889 | subparts_delta++; |
1890 | new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; |
1891 | } else if (cmd == partcmd_disable) { |
1892 | /* |
1893 | * May need to add cpus to parent's effective_cpus for |
1894 | * valid partition root. |
1895 | */ |
		adding = !is_prs_invalid(old_prs) &&
			 cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
1898 | if (adding) |
1899 | subparts_delta--; |
1900 | new_prs = PRS_MEMBER; |
1901 | } else if (newmask) { |
1902 | /* |
1903 | * Empty cpumask is not allowed |
1904 | */ |
		if (cpumask_empty(newmask)) {
1906 | part_error = PERR_CPUSEMPTY; |
1907 | goto write_error; |
1908 | } |
1909 | |
1910 | /* |
1911 | * partcmd_update with newmask: |
1912 | * |
1913 | * Compute add/delete mask to/from effective_cpus |
1914 | * |
1915 | * For valid partition: |
1916 | * addmask = exclusive_cpus & ~newmask |
1917 | * & parent->effective_xcpus |
1918 | * delmask = newmask & ~exclusive_cpus |
1919 | * & parent->effective_xcpus |
1920 | * |
1921 | * For invalid partition: |
1922 | * delmask = newmask & parent->effective_xcpus |
1923 | */ |
		if (is_prs_invalid(old_prs)) {
			adding = false;
			deleting = cpumask_and(tmp->delmask,
					       newmask, parent->effective_xcpus);
		} else {
			cpumask_andnot(tmp->addmask, xcpus, newmask);
			adding = cpumask_and(tmp->addmask, tmp->addmask,
					     parent->effective_xcpus);

			cpumask_andnot(tmp->delmask, newmask, xcpus);
			deleting = cpumask_and(tmp->delmask, tmp->delmask,
					       parent->effective_xcpus);
1936 | } |
1937 | /* |
1938 | * Make partition invalid if parent's effective_cpus could |
1939 | * become empty and there are tasks in the parent. |
1940 | */ |
		if (nocpu && (!adding ||
			      !cpumask_intersects(tmp->addmask, cpu_active_mask))) {
			part_error = PERR_NOCPUS;
			deleting = false;
			adding = cpumask_and(tmp->addmask,
					     xcpus, parent->effective_xcpus);
1947 | } |
1948 | } else { |
1949 | /* |
1950 | * partcmd_update w/o newmask |
1951 | * |
1952 | * delmask = effective_xcpus & parent->effective_cpus |
1953 | * |
1954 | * This can be called from: |
1955 | * 1) update_cpumasks_hier() |
1956 | * 2) cpuset_hotplug_update_tasks() |
1957 | * |
1958 | * Check to see if it can be transitioned from valid to |
1959 | * invalid partition or vice versa. |
1960 | * |
1961 | * A partition error happens when parent has tasks and all |
1962 | * its effective CPUs will have to be distributed out. |
1963 | */ |
1964 | WARN_ON_ONCE(!is_partition_valid(parent)); |
1965 | if (nocpu) { |
1966 | part_error = PERR_NOCPUS; |
1967 | if (is_partition_valid(cs)) |
				adding = cpumask_and(tmp->addmask,
						     xcpus, parent->effective_xcpus);
		} else if (is_partition_invalid(cs) &&
			   cpumask_subset(xcpus, parent->effective_xcpus)) {
1972 | struct cgroup_subsys_state *css; |
1973 | struct cpuset *child; |
1974 | bool exclusive = true; |
1975 | |
1976 | /* |
1977 | * Convert invalid partition to valid has to |
1978 | * pass the cpu exclusivity test. |
1979 | */ |
1980 | rcu_read_lock(); |
1981 | cpuset_for_each_child(child, css, parent) { |
1982 | if (child == cs) |
1983 | continue; |
				if (!cpusets_are_exclusive(cs, child)) {
1985 | exclusive = false; |
1986 | break; |
1987 | } |
1988 | } |
1989 | rcu_read_unlock(); |
1990 | if (exclusive) |
				deleting = cpumask_and(tmp->delmask,
						       xcpus, parent->effective_cpus);
1993 | else |
1994 | part_error = PERR_NOTEXCL; |
1995 | } |
1996 | } |
1997 | |
1998 | write_error: |
1999 | if (part_error) |
2000 | WRITE_ONCE(cs->prs_err, part_error); |
2001 | |
2002 | if (cmd == partcmd_update) { |
2003 | /* |
2004 | * Check for possible transition between valid and invalid |
2005 | * partition root. |
2006 | */ |
2007 | switch (cs->partition_root_state) { |
2008 | case PRS_ROOT: |
2009 | case PRS_ISOLATED: |
2010 | if (part_error) { |
2011 | new_prs = -old_prs; |
2012 | subparts_delta--; |
2013 | } |
2014 | break; |
2015 | case PRS_INVALID_ROOT: |
2016 | case PRS_INVALID_ISOLATED: |
2017 | if (!part_error) { |
2018 | new_prs = -old_prs; |
2019 | subparts_delta++; |
2020 | } |
2021 | break; |
2022 | } |
2023 | } |
2024 | |
2025 | if (!adding && !deleting && (new_prs == old_prs)) |
2026 | return 0; |
2027 | |
2028 | /* |
2029 | * Transitioning between invalid to valid or vice versa may require |
2030 | * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update, |
2031 | * validate_change() has already been successfully called and |
2032 | * CPU lists in cs haven't been updated yet. So defer it to later. |
2033 | */ |
2034 | if ((old_prs != new_prs) && (cmd != partcmd_update)) { |
2035 | int err = update_partition_exclusive(cs, new_prs); |
2036 | |
2037 | if (err) |
2038 | return err; |
2039 | } |
2040 | |
2041 | /* |
2042 | * Change the parent's effective_cpus & effective_xcpus (top cpuset |
2043 | * only). |
2044 | * |
2045 | * Newly added CPUs will be removed from effective_cpus and |
2046 | * newly deleted ones will be added back to effective_cpus. |
2047 | */ |
	spin_lock_irq(&callback_lock);
2049 | if (old_prs != new_prs) { |
2050 | cs->partition_root_state = new_prs; |
2051 | if (new_prs <= 0) |
2052 | cs->nr_subparts = 0; |
2053 | } |
2054 | /* |
2055 | * Adding to parent's effective_cpus means deletion CPUs from cs |
2056 | * and vice versa. |
2057 | */ |
2058 | if (adding) |
2059 | isolcpus_updated += partition_xcpus_del(old_prs, parent, |
2060 | xcpus: tmp->addmask); |
2061 | if (deleting) |
2062 | isolcpus_updated += partition_xcpus_add(new_prs, parent, |
2063 | xcpus: tmp->delmask); |
2064 | |
	if (is_partition_valid(parent)) {
2066 | parent->nr_subparts += subparts_delta; |
2067 | WARN_ON_ONCE(parent->nr_subparts < 0); |
2068 | } |
	spin_unlock_irq(&callback_lock);
2070 | update_unbound_workqueue_cpumask(isolcpus_updated); |
2071 | |
2072 | if ((old_prs != new_prs) && (cmd == partcmd_update)) |
2073 | update_partition_exclusive(cs, new_prs); |
2074 | |
2075 | if (adding || deleting) { |
		update_tasks_cpumask(parent, tmp->addmask);
		update_sibling_cpumasks(parent, cs, tmp);
2078 | } |
2079 | |
2080 | /* |
2081 | * For partcmd_update without newmask, it is being called from |
2082 | * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. |
2083 | * Update the load balance flag and scheduling domain if |
2084 | * cpus_read_trylock() is successful. |
2085 | */ |
2086 | if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { |
2087 | update_partition_sd_lb(cs, old_prs); |
2088 | cpus_read_unlock(); |
2089 | } |
2090 | |
2091 | notify_partition_change(cs, old_prs); |
2092 | return 0; |
2093 | } |
2094 | |
2095 | /** |
2096 | * compute_partition_effective_cpumask - compute effective_cpus for partition |
2097 | * @cs: partition root cpuset |
2098 | * @new_ecpus: previously computed effective_cpus to be updated |
2099 | * |
2100 | * Compute the effective_cpus of a partition root by scanning effective_xcpus |
2101 | * of child partition roots and excluding their effective_xcpus. |
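 *
 * In set terms (illustrative):
 *	effective_cpus = (effective_xcpus & cpu_active_mask)
 *			 - union of the surviving children's effective_xcpus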
2102 | * |
2103 | * This has the side effect of invalidating valid child partition roots, |
2104 | * if necessary. Since it is called from either cpuset_hotplug_update_tasks() |
2105 | * or update_cpumasks_hier() where parent and children are modified |
2106 | * successively, we don't need to call update_parent_effective_cpumask() |
2107 | * and the child's effective_cpus will be updated in later iterations. |
2108 | * |
2109 | * Note that rcu_read_lock() is assumed to be held. |
2110 | */ |
2111 | static void compute_partition_effective_cpumask(struct cpuset *cs, |
2112 | struct cpumask *new_ecpus) |
2113 | { |
2114 | struct cgroup_subsys_state *css; |
2115 | struct cpuset *child; |
2116 | bool populated = partition_is_populated(cs, NULL); |
2117 | |
2118 | /* |
2119 | * Check child partition roots to see if they should be |
2120 | * invalidated when |
2121 | * 1) child effective_xcpus not a subset of new |
2122 | * excluisve_cpus |
2123 | * 2) All the effective_cpus will be used up and cp |
2124 | * has tasks |
2125 | */ |
2126 | compute_effective_exclusive_cpumask(cs, xcpus: new_ecpus); |
2127 | cpumask_and(dstp: new_ecpus, src1p: new_ecpus, cpu_active_mask); |
2128 | |
2129 | rcu_read_lock(); |
2130 | cpuset_for_each_child(child, css, cs) { |
		if (!is_partition_valid(child))
2132 | continue; |
2133 | |
2134 | child->prs_err = 0; |
		if (!cpumask_subset(child->effective_xcpus,
				    cs->effective_xcpus))
			child->prs_err = PERR_INVCPUS;
		else if (populated &&
			 cpumask_subset(new_ecpus, child->effective_xcpus))
			child->prs_err = PERR_NOCPUS;
2141 | |
2142 | if (child->prs_err) { |
2143 | int old_prs = child->partition_root_state; |
2144 | |
2145 | /* |
2146 | * Invalidate child partition |
2147 | */ |
			spin_lock_irq(&callback_lock);
			make_partition_invalid(child);
			cs->nr_subparts--;
			child->nr_subparts = 0;
			spin_unlock_irq(&callback_lock);
			notify_partition_change(child, old_prs);
2154 | continue; |
2155 | } |
		cpumask_andnot(new_ecpus, new_ecpus,
			       child->effective_xcpus);
2158 | } |
2159 | rcu_read_unlock(); |
2160 | } |
2161 | |
2162 | /* |
2163 | * update_cpumasks_hier() flags |
2164 | */ |
2165 | #define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ |
2166 | #define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ |
2167 | |
2168 | /* |
2169 | * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree |
2170 | * @cs: the cpuset to consider |
2171 | * @tmp: temp variables for calculating effective_cpus & partition setup |
 * @flags: HIER_* flags; HIER_CHECKALL means don't skip any descendant cpusets
 *
 * When configured cpumask is changed, the effective cpumasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
2178 | * |
2179 | * Called with cpuset_mutex held |
2180 | */ |
2181 | static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, |
2182 | int flags) |
2183 | { |
2184 | struct cpuset *cp; |
2185 | struct cgroup_subsys_state *pos_css; |
2186 | bool need_rebuild_sched_domains = false; |
2187 | int old_prs, new_prs; |
2188 | |
2189 | rcu_read_lock(); |
2190 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { |
		struct cpuset *parent = parent_cs(cp);
		bool remote = is_remote_partition(cp);
2193 | bool update_parent = false; |
2194 | |
2195 | /* |
2196 | * Skip descendent remote partition that acquires CPUs |
2197 | * directly from top cpuset unless it is cs. |
2198 | */ |
2199 | if (remote && (cp != cs)) { |
2200 | pos_css = css_rightmost_descendant(pos: pos_css); |
2201 | continue; |
2202 | } |
2203 | |
2204 | /* |
2205 | * Update effective_xcpus if exclusive_cpus set. |
2206 | * The case when exclusive_cpus isn't set is handled later. |
2207 | */ |
		if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
			spin_lock_irq(&callback_lock);
			compute_effective_exclusive_cpumask(cp, NULL);
			spin_unlock_irq(&callback_lock);
		}

		old_prs = new_prs = cp->partition_root_state;
		if (remote || (is_partition_valid(parent) &&
			       is_partition_valid(cp)))
			compute_partition_effective_cpumask(cp, tmp->new_cpus);
		else
			compute_effective_cpumask(tmp->new_cpus, cp, parent);
2220 | |
2221 | /* |
2222 | * A partition with no effective_cpus is allowed as long as |
2223 | * there is no task associated with it. Call |
2224 | * update_parent_effective_cpumask() to check it. |
2225 | */ |
		if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
2227 | update_parent = true; |
2228 | goto update_parent_effective; |
2229 | } |
2230 | |
2231 | /* |
2232 | * If it becomes empty, inherit the effective mask of the |
2233 | * parent, which is guaranteed to have some CPUs unless |
2234 | * it is a partition root that has explicitly distributed |
2235 | * out all its CPUs. |
2236 | */ |
		if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) {
			cpumask_copy(tmp->new_cpus, parent->effective_cpus);
2239 | if (!cp->use_parent_ecpus) { |
2240 | cp->use_parent_ecpus = true; |
2241 | parent->child_ecpus_count++; |
2242 | } |
2243 | } else if (cp->use_parent_ecpus) { |
2244 | cp->use_parent_ecpus = false; |
2245 | WARN_ON_ONCE(!parent->child_ecpus_count); |
2246 | parent->child_ecpus_count--; |
2247 | } |
2248 | |
2249 | if (remote) |
2250 | goto get_css; |
2251 | |
2252 | /* |
2253 | * Skip the whole subtree if |
2254 | * 1) the cpumask remains the same, |
2255 | * 2) has no partition root state, |
2256 | * 3) HIER_CHECKALL flag not set, and |
2257 | * 4) for v2 load balance state same as its parent. |
2258 | */ |
		if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
		    cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
		    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
		     (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
			pos_css = css_rightmost_descendant(pos_css);
2264 | continue; |
2265 | } |
2266 | |
2267 | update_parent_effective: |
2268 | /* |
2269 | * update_parent_effective_cpumask() should have been called |
2270 | * for cs already in update_cpumask(). We should also call |
2271 | * update_tasks_cpumask() again for tasks in the parent |
2272 | * cpuset if the parent's effective_cpus changes. |
2273 | */ |
2274 | if ((cp != cs) && old_prs) { |
2275 | switch (parent->partition_root_state) { |
2276 | case PRS_ROOT: |
2277 | case PRS_ISOLATED: |
2278 | update_parent = true; |
2279 | break; |
2280 | |
2281 | default: |
2282 | /* |
2283 | * When parent is not a partition root or is |
2284 | * invalid, child partition roots become |
2285 | * invalid too. |
2286 | */ |
				if (is_partition_valid(cp))
					new_prs = -cp->partition_root_state;
2289 | WRITE_ONCE(cp->prs_err, |
2290 | is_partition_invalid(parent) |
2291 | ? PERR_INVPARENT : PERR_NOTPART); |
2292 | break; |
2293 | } |
2294 | } |
2295 | get_css: |
		if (!css_tryget_online(&cp->css))
2297 | continue; |
2298 | rcu_read_unlock(); |
2299 | |
2300 | if (update_parent) { |
			update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
2302 | /* |
2303 | * The cpuset partition_root_state may become |
2304 | * invalid. Capture it. |
2305 | */ |
2306 | new_prs = cp->partition_root_state; |
2307 | } |
2308 | |
		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, tmp->new_cpus);
		cp->partition_root_state = new_prs;
		/*
		 * Make sure effective_xcpus is properly set for a valid
		 * partition root.
		 */
		if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus))
			cpumask_and(cp->effective_xcpus,
				    cp->cpus_allowed, parent->effective_xcpus);
		else if (new_prs < 0)
			reset_partition_data(cp);
		spin_unlock_irq(&callback_lock);
2322 | |
		notify_partition_change(cp, old_prs);
2324 | |
2325 | WARN_ON(!is_in_v2_mode() && |
2326 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); |
2327 | |
		update_tasks_cpumask(cp, cp->effective_cpus);
2329 | |
2330 | /* |
2331 | * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE |
2332 | * from parent if current cpuset isn't a valid partition root |
2333 | * and their load balance states differ. |
2334 | */ |
		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
		    !is_partition_valid(cp) &&
		    (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
			if (is_sched_load_balance(parent))
				set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
			else
				clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
2342 | } |
2343 | |
2344 | /* |
2345 | * On legacy hierarchy, if the effective cpumask of any non- |
2346 | * empty cpuset is changed, we need to rebuild sched domains. |
2347 | * On default hierarchy, the cpuset needs to be a partition |
2348 | * root as well. |
2349 | */ |
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp) &&
		    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
		     is_partition_valid(cp)))
2354 | need_rebuild_sched_domains = true; |
2355 | |
2356 | rcu_read_lock(); |
		css_put(&cp->css);
2358 | } |
2359 | rcu_read_unlock(); |
2360 | |
2361 | if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) |
2362 | rebuild_sched_domains_locked(); |
2363 | } |
2364 | |
2365 | /** |
2366 | * update_sibling_cpumasks - Update siblings cpumasks |
2367 | * @parent: Parent cpuset |
2368 | * @cs: Current cpuset |
2369 | * @tmp: Temp variables |
2370 | */ |
2371 | static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, |
2372 | struct tmpmasks *tmp) |
2373 | { |
2374 | struct cpuset *sibling; |
2375 | struct cgroup_subsys_state *pos_css; |
2376 | |
2377 | lockdep_assert_held(&cpuset_mutex); |
2378 | |
2379 | /* |
2380 | * Check all its siblings and call update_cpumasks_hier() |
2381 | * if their effective_cpus will need to be changed. |
2382 | * |
	 * With the addition of effective_xcpus, which is a subset of
	 * cpus_allowed, it is possible that a change in parent's effective_cpus
	 * due to a change in a child partition's effective_xcpus will impact
	 * its siblings even if they do not inherit parent's effective_cpus
	 * directly.
2388 | * |
2389 | * The update_cpumasks_hier() function may sleep. So we have to |
2390 | * release the RCU read lock before calling it. HIER_NO_SD_REBUILD |
2391 | * flag is used to suppress rebuild of sched domains as the callers |
2392 | * will take care of that. |
2393 | */ |
2394 | rcu_read_lock(); |
2395 | cpuset_for_each_child(sibling, pos_css, parent) { |
2396 | if (sibling == cs) |
2397 | continue; |
2398 | if (!sibling->use_parent_ecpus && |
		    !is_partition_valid(sibling)) {
			compute_effective_cpumask(tmp->new_cpus, sibling,
						  parent);
			if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
				continue;
		}
		if (!css_tryget_online(&sibling->css))
			continue;

		rcu_read_unlock();
		update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
		rcu_read_lock();
		css_put(&sibling->css);
2412 | } |
2413 | rcu_read_unlock(); |
2414 | } |
2415 | |
2416 | /** |
2417 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
2418 | * @cs: the cpuset to consider |
2419 | * @trialcs: trial cpuset |
2420 | * @buf: buffer of cpu numbers written to this cpuset |
2421 | */ |
2422 | static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, |
2423 | const char *buf) |
2424 | { |
2425 | int retval; |
2426 | struct tmpmasks tmp; |
2427 | struct cpuset *parent = parent_cs(cs); |
2428 | bool invalidate = false; |
2429 | int hier_flags = 0; |
2430 | int old_prs = cs->partition_root_state; |
2431 | |
2432 | /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ |
2433 | if (cs == &top_cpuset) |
2434 | return -EACCES; |
2435 | |
2436 | /* |
2437 | * An empty cpus_allowed is ok only if the cpuset has no tasks. |
2438 | * Since cpulist_parse() fails on an empty mask, we special case |
2439 | * that parsing. The validate_change() call ensures that cpusets |
2440 | * with tasks have cpus. |
2441 | */ |
2442 | if (!*buf) { |
		cpumask_clear(trialcs->cpus_allowed);
		cpumask_clear(trialcs->effective_xcpus);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
2447 | if (retval < 0) |
2448 | return retval; |
2449 | |
		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
2452 | return -EINVAL; |
2453 | |
2454 | /* |
2455 | * When exclusive_cpus isn't explicitly set, it is constrainted |
2456 | * by cpus_allowed and parent's effective_xcpus. Otherwise, |
2457 | * trialcs->effective_xcpus is used as a temporary cpumask |
2458 | * for checking validity of the partition root. |
2459 | */ |
2460 | if (!cpumask_empty(srcp: trialcs->exclusive_cpus) || is_partition_valid(cs)) |
2461 | compute_effective_exclusive_cpumask(cs: trialcs, NULL); |
2462 | } |
2463 | |
2464 | /* Nothing to do if the cpus didn't change */ |
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
2466 | return 0; |
2467 | |
	if (alloc_cpumasks(NULL, &tmp))
2469 | return -ENOMEM; |
2470 | |
2471 | if (old_prs) { |
		if (is_partition_valid(cs) &&
		    cpumask_empty(trialcs->effective_xcpus)) {
			invalidate = true;
			cs->prs_err = PERR_INVCPUS;
		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
			invalidate = true;
			cs->prs_err = PERR_HKEEPING;
		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2480 | invalidate = true; |
2481 | cs->prs_err = PERR_NOCPUS; |
2482 | } |
2483 | } |
2484 | |
2485 | /* |
2486 | * Check all the descendants in update_cpumasks_hier() if |
2487 | * effective_xcpus is to be changed. |
2488 | */ |
	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
2490 | hier_flags = HIER_CHECKALL; |
2491 | |
	retval = validate_change(cs, trialcs);
2493 | |
2494 | if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { |
2495 | struct cgroup_subsys_state *css; |
2496 | struct cpuset *cp; |
2497 | |
2498 | /* |
2499 | * The -EINVAL error code indicates that partition sibling |
2500 | * CPU exclusivity rule has been violated. We still allow |
2501 | * the cpumask change to proceed while invalidating the |
2502 | * partition. However, any conflicting sibling partitions |
2503 | * have to be marked as invalid too. |
2504 | */ |
2505 | invalidate = true; |
2506 | rcu_read_lock(); |
2507 | cpuset_for_each_child(cp, css, parent) { |
			struct cpumask *xcpus = fetch_xcpus(trialcs);

			if (is_partition_valid(cp) &&
			    cpumask_intersects(xcpus, cp->effective_xcpus)) {
				rcu_read_unlock();
				update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp);
2514 | rcu_read_lock(); |
2515 | } |
2516 | } |
2517 | rcu_read_unlock(); |
2518 | retval = 0; |
2519 | } |
2520 | |
2521 | if (retval < 0) |
2522 | goto out_free; |
2523 | |
2524 | if (is_partition_valid(cs) || |
2525 | (is_partition_invalid(cs) && !invalidate)) { |
2526 | struct cpumask *xcpus = trialcs->effective_xcpus; |
2527 | |
		if (cpumask_empty(xcpus) && is_partition_invalid(cs))
2529 | xcpus = trialcs->cpus_allowed; |
2530 | |
2531 | /* |
2532 | * Call remote_cpus_update() to handle valid remote partition |
2533 | */ |
2534 | if (is_remote_partition(cs)) |
			remote_cpus_update(cs, xcpus, &tmp);
		else if (invalidate)
			update_parent_effective_cpumask(cs, partcmd_invalidate,
							NULL, &tmp);
		else
			update_parent_effective_cpumask(cs, partcmd_update,
							xcpus, &tmp);
	} else if (!cpumask_empty(cs->exclusive_cpus)) {
2543 | /* |
2544 | * Use trialcs->effective_cpus as a temp cpumask |
2545 | */ |
		remote_partition_check(cs, trialcs->effective_xcpus,
				       trialcs->effective_cpus, &tmp);
2548 | } |
2549 | |
	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
	if ((old_prs > 0) && !is_partition_valid(cs))
		reset_partition_data(cs);
	spin_unlock_irq(&callback_lock);
2556 | |
2557 | /* effective_cpus/effective_xcpus will be updated here */ |
	update_cpumasks_hier(cs, &tmp, hier_flags);
2559 | |
2560 | /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ |
2561 | if (cs->partition_root_state) |
2562 | update_partition_sd_lb(cs, old_prs); |
2563 | out_free: |
	free_cpumasks(NULL, &tmp);
2565 | return retval; |
2566 | } |
2567 | |
2568 | /** |
2569 | * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset |
2570 | * @cs: the cpuset to consider |
2571 | * @trialcs: trial cpuset |
2572 | * @buf: buffer of cpu numbers written to this cpuset |
2573 | * |
2574 | * The tasks' cpumask will be updated if cs is a valid partition root. |
2575 | */ |
2576 | static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, |
2577 | const char *buf) |
2578 | { |
2579 | int retval; |
2580 | struct tmpmasks tmp; |
2581 | struct cpuset *parent = parent_cs(cs); |
2582 | bool invalidate = false; |
2583 | int hier_flags = 0; |
2584 | int old_prs = cs->partition_root_state; |
2585 | |
2586 | if (!*buf) { |
		cpumask_clear(trialcs->exclusive_cpus);
		cpumask_clear(trialcs->effective_xcpus);
	} else {
		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
		if (retval < 0)
			return retval;
		if (!is_cpu_exclusive(cs))
			set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
2595 | } |
2596 | |
2597 | /* Nothing to do if the CPUs didn't change */ |
	if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
2599 | return 0; |
2600 | |
2601 | if (*buf) |
		compute_effective_exclusive_cpumask(trialcs, NULL);
2603 | |
2604 | /* |
2605 | * Check all the descendants in update_cpumasks_hier() if |
2606 | * effective_xcpus is to be changed. |
2607 | */ |
	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
2609 | hier_flags = HIER_CHECKALL; |
2610 | |
	retval = validate_change(cs, trialcs);
2612 | if (retval) |
2613 | return retval; |
2614 | |
	if (alloc_cpumasks(NULL, &tmp))
2616 | return -ENOMEM; |
2617 | |
2618 | if (old_prs) { |
		if (cpumask_empty(trialcs->effective_xcpus)) {
			invalidate = true;
			cs->prs_err = PERR_INVCPUS;
		} else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) {
			invalidate = true;
			cs->prs_err = PERR_HKEEPING;
		} else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) {
2626 | invalidate = true; |
2627 | cs->prs_err = PERR_NOCPUS; |
2628 | } |
2629 | |
		if (is_remote_partition(cs)) {
			if (invalidate)
				remote_partition_disable(cs, &tmp);
			else
				remote_cpus_update(cs, trialcs->effective_xcpus,
						   &tmp);
		} else if (invalidate) {
			update_parent_effective_cpumask(cs, partcmd_invalidate,
							NULL, &tmp);
		} else {
			update_parent_effective_cpumask(cs, partcmd_update,
							trialcs->effective_xcpus, &tmp);
2642 | } |
	} else if (!cpumask_empty(trialcs->exclusive_cpus)) {
2644 | /* |
2645 | * Use trialcs->effective_cpus as a temp cpumask |
2646 | */ |
		remote_partition_check(cs, trialcs->effective_xcpus,
				       trialcs->effective_cpus, &tmp);
2649 | } |
	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
	cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
	if ((old_prs > 0) && !is_partition_valid(cs))
		reset_partition_data(cs);
	spin_unlock_irq(&callback_lock);
2656 | |
2657 | /* |
2658 | * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus |
2659 | * of the subtree when it is a valid partition root or effective_xcpus |
2660 | * is updated. |
2661 | */ |
2662 | if (is_partition_valid(cs) || hier_flags) |
		update_cpumasks_hier(cs, &tmp, hier_flags);
2664 | |
2665 | /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */ |
2666 | if (cs->partition_root_state) |
2667 | update_partition_sd_lb(cs, old_prs); |
2668 | |
	free_cpumasks(NULL, &tmp);
2670 | return 0; |
2671 | } |
2672 | |
2673 | /* |
2674 | * Migrate memory region from one set of nodes to another. This is |
2675 | * performed asynchronously as it can be called from process migration path |
2676 | * holding locks involved in process management. All mm migrations are |
2677 | * performed in the queued order and can be waited for by flushing |
2678 | * cpuset_migrate_mm_wq. |
2679 | */ |
2680 | |
2681 | struct cpuset_migrate_mm_work { |
2682 | struct work_struct work; |
2683 | struct mm_struct *mm; |
2684 | nodemask_t from; |
2685 | nodemask_t to; |
2686 | }; |
2687 | |
2688 | static void cpuset_migrate_mm_workfn(struct work_struct *work) |
2689 | { |
2690 | struct cpuset_migrate_mm_work *mwork = |
2691 | container_of(work, struct cpuset_migrate_mm_work, work); |
2692 | |
2693 | /* on a wq worker, no need to worry about %current's mems_allowed */ |
	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
	mmput(mwork->mm);
	kfree(mwork);
2697 | } |
2698 | |
2699 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, |
2700 | const nodemask_t *to) |
2701 | { |
2702 | struct cpuset_migrate_mm_work *mwork; |
2703 | |
2704 | if (nodes_equal(*from, *to)) { |
2705 | mmput(mm); |
2706 | return; |
2707 | } |
2708 | |
	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
2710 | if (mwork) { |
2711 | mwork->mm = mm; |
2712 | mwork->from = *from; |
2713 | mwork->to = *to; |
2714 | INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); |
		queue_work(cpuset_migrate_mm_wq, &mwork->work);
2716 | } else { |
2717 | mmput(mm); |
2718 | } |
2719 | } |
2720 | |
2721 | static void cpuset_post_attach(void) |
2722 | { |
2723 | flush_workqueue(cpuset_migrate_mm_wq); |
2724 | } |
2725 | |
2726 | /* |
2727 | * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy |
2728 | * @tsk: the task to change |
2729 | * @newmems: new nodes that the task will be set |
2730 | * |
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind the task's mempolicy, if any. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
2735 | */ |
2736 | static void cpuset_change_task_nodemask(struct task_struct *tsk, |
2737 | nodemask_t *newmems) |
2738 | { |
	task_lock(tsk);
2740 | |
2741 | local_irq_disable(); |
2742 | write_seqcount_begin(&tsk->mems_allowed_seq); |
2743 | |
2744 | nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); |
	mpol_rebind_task(tsk, newmems);
2746 | tsk->mems_allowed = *newmems; |
2747 | |
2748 | write_seqcount_end(&tsk->mems_allowed_seq); |
2749 | local_irq_enable(); |
2750 | |
	task_unlock(tsk);
2752 | } |
2753 | |
2754 | static void *cpuset_being_rebound; |
2755 | |
2756 | /** |
2757 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. |
2758 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed |
2759 | * |
2760 | * Iterate through each task of @cs updating its mems_allowed to the |
2761 | * effective cpuset's. As this function is called with cpuset_mutex held, |
2762 | * cpuset membership stays stable. |
2763 | */ |
2764 | static void update_tasks_nodemask(struct cpuset *cs) |
2765 | { |
2766 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
2767 | struct css_task_iter it; |
2768 | struct task_struct *task; |
2769 | |
2770 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
2771 | |
	guarantee_online_mems(cs, &newmems);
2773 | |
2774 | /* |
2775 | * The mpol_rebind_mm() call takes mmap_lock, which we couldn't |
2776 | * take while holding tasklist_lock. Forks can happen - the |
2777 | * mpol_dup() cpuset_being_rebound check will catch such forks, |
2778 | * and rebind their vma mempolicies too. Because we still hold |
2779 | * the global cpuset_mutex, we know that no other rebind effort |
2780 | * will be contending for the global variable cpuset_being_rebound. |
2781 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
2782 | * is idempotent. Also migrate pages in each mm to new nodes. |
2783 | */ |
	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
2786 | struct mm_struct *mm; |
2787 | bool migrate; |
2788 | |
		cpuset_change_task_nodemask(task, &newmems);
2790 | |
2791 | mm = get_task_mm(task); |
2792 | if (!mm) |
2793 | continue; |
2794 | |
2795 | migrate = is_memory_migrate(cs); |
2796 | |
		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		else
			mmput(mm);
2802 | } |
	css_task_iter_end(&it);
2804 | |
2805 | /* |
2806 | * All the tasks' nodemasks have been updated, update |
2807 | * cs->old_mems_allowed. |
2808 | */ |
2809 | cs->old_mems_allowed = newmems; |
2810 | |
2811 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ |
2812 | cpuset_being_rebound = NULL; |
2813 | } |
2814 | |
2815 | /* |
2816 | * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree |
2817 | * @cs: the cpuset to consider |
2818 | * @new_mems: a temp variable for calculating new effective_mems |
2819 | * |
2820 | * When configured nodemask is changed, the effective nodemasks of this cpuset |
2821 | * and all its descendants need to be updated. |
2822 | * |
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
2824 | * |
2825 | * Called with cpuset_mutex held |
2826 | */ |
2827 | static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) |
2828 | { |
2829 | struct cpuset *cp; |
2830 | struct cgroup_subsys_state *pos_css; |
2831 | |
2832 | rcu_read_lock(); |
2833 | cpuset_for_each_descendant_pre(cp, pos_css, cs) { |
		struct cpuset *parent = parent_cs(cp);
2835 | |
2836 | nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); |
2837 | |
2838 | /* |
2839 | * If it becomes empty, inherit the effective mask of the |
2840 | * parent, which is guaranteed to have some MEMs. |
2841 | */ |
2842 | if (is_in_v2_mode() && nodes_empty(*new_mems)) |
2843 | *new_mems = parent->effective_mems; |
2844 | |
2845 | /* Skip the whole subtree if the nodemask remains the same. */ |
2846 | if (nodes_equal(*new_mems, cp->effective_mems)) { |
			pos_css = css_rightmost_descendant(pos_css);
2848 | continue; |
2849 | } |
2850 | |
		if (!css_tryget_online(&cp->css))
2852 | continue; |
2853 | rcu_read_unlock(); |
2854 | |
		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);
2858 | |
2859 | WARN_ON(!is_in_v2_mode() && |
2860 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); |
2861 | |
		update_tasks_nodemask(cp);
2863 | |
2864 | rcu_read_lock(); |
		css_put(&cp->css);
2866 | } |
2867 | rcu_read_unlock(); |
2868 | } |
2869 | |
2870 | /* |
2871 | * Handle user request to change the 'mems' memory placement |
2872 | * of a cpuset. Needs to validate the request, update the |
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the task's pages to the new memory.
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_lock, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
2882 | */ |
2883 | static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, |
2884 | const char *buf) |
2885 | { |
2886 | int retval; |
2887 | |
2888 | /* |
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
2890 | * it's read-only |
2891 | */ |
2892 | if (cs == &top_cpuset) { |
2893 | retval = -EACCES; |
2894 | goto done; |
2895 | } |
2896 | |
2897 | /* |
2898 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. |
2899 | * Since nodelist_parse() fails on an empty mask, we special case |
2900 | * that parsing. The validate_change() call ensures that cpusets |
2901 | * with tasks have memory. |
2902 | */ |
2903 | if (!*buf) { |
2904 | nodes_clear(trialcs->mems_allowed); |
2905 | } else { |
2906 | retval = nodelist_parse(buf, trialcs->mems_allowed); |
2907 | if (retval < 0) |
2908 | goto done; |
2909 | |
2910 | if (!nodes_subset(trialcs->mems_allowed, |
2911 | top_cpuset.mems_allowed)) { |
2912 | retval = -EINVAL; |
2913 | goto done; |
2914 | } |
2915 | } |
2916 | |
2917 | if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { |
2918 | retval = 0; /* Too easy - nothing to do */ |
2919 | goto done; |
2920 | } |
	retval = validate_change(cs, trialcs);
2922 | if (retval < 0) |
2923 | goto done; |
2924 | |
	check_insane_mems_config(&trialcs->mems_allowed);
2926 | |
	spin_lock_irq(&callback_lock);
	cs->mems_allowed = trialcs->mems_allowed;
	spin_unlock_irq(&callback_lock);
2930 | |
2931 | /* use trialcs->mems_allowed as a temp variable */ |
	update_nodemasks_hier(cs, &trialcs->mems_allowed);
2933 | done: |
2934 | return retval; |
2935 | } |
2936 | |
2937 | bool current_cpuset_is_being_rebound(void) |
2938 | { |
2939 | bool ret; |
2940 | |
2941 | rcu_read_lock(); |
2942 | ret = task_cs(current) == cpuset_being_rebound; |
2943 | rcu_read_unlock(); |
2944 | |
2945 | return ret; |
2946 | } |
2947 | |
2948 | static int update_relax_domain_level(struct cpuset *cs, s64 val) |
2949 | { |
2950 | #ifdef CONFIG_SMP |
2951 | if (val < -1 || val >= sched_domain_level_max) |
2952 | return -EINVAL; |
2953 | #endif |
2954 | |
2955 | if (val != cs->relax_domain_level) { |
2956 | cs->relax_domain_level = val; |
		if (!cpumask_empty(cs->cpus_allowed) &&
2958 | is_sched_load_balance(cs)) |
2959 | rebuild_sched_domains_locked(); |
2960 | } |
2961 | |
2962 | return 0; |
2963 | } |
2964 | |
2965 | /** |
2966 | * update_tasks_flags - update the spread flags of tasks in the cpuset. |
2967 | * @cs: the cpuset in which each task's spread flags needs to be changed |
2968 | * |
2969 | * Iterate through each task of @cs updating its spread flags. As this |
2970 | * function is called with cpuset_mutex held, cpuset membership stays |
2971 | * stable. |
2972 | */ |
2973 | static void update_tasks_flags(struct cpuset *cs) |
2974 | { |
2975 | struct css_task_iter it; |
2976 | struct task_struct *task; |
2977 | |
	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flags(cs, task);
	css_task_iter_end(&it);
2982 | } |
2983 | |
2984 | /* |
2985 | * update_flag - read a 0 or a 1 in a file and update associated flag |
2986 | * bit: the bit to update (see cpuset_flagbits_t) |
2987 | * cs: the cpuset to update |
2988 | * turning_on: whether the flag is being set or cleared |
2989 | * |
2990 | * Call with cpuset_mutex held. |
2991 | */ |
2992 | |
2993 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
2994 | int turning_on) |
2995 | { |
2996 | struct cpuset *trialcs; |
2997 | int balance_flag_changed; |
2998 | int spread_flag_changed; |
2999 | int err; |
3000 | |
3001 | trialcs = alloc_trial_cpuset(cs); |
3002 | if (!trialcs) |
3003 | return -ENOMEM; |
3004 | |
3005 | if (turning_on) |
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
3011 | if (err < 0) |
3012 | goto out; |
3013 | |
	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	spin_lock_irq(&callback_lock);
	cs->flags = trialcs->flags;
	spin_unlock_irq(&callback_lock);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
3025 | rebuild_sched_domains_locked(); |
3026 | |
3027 | if (spread_flag_changed) |
3028 | update_tasks_flags(cs); |
3029 | out: |
	free_cpuset(trialcs);
3031 | return err; |
3032 | } |
3033 | |
3034 | /** |
3035 | * update_prstate - update partition_root_state |
3036 | * @cs: the cpuset to update |
3037 | * @new_prs: new partition root state |
3038 | * Return: 0 if successful, != 0 if error |
3039 | * |
3040 | * Call with cpuset_mutex held. |
3041 | */ |
3042 | static int update_prstate(struct cpuset *cs, int new_prs) |
3043 | { |
3044 | int err = PERR_NONE, old_prs = cs->partition_root_state; |
3045 | struct cpuset *parent = parent_cs(cs); |
3046 | struct tmpmasks tmpmask; |
3047 | bool new_xcpus_state = false; |
3048 | |
3049 | if (old_prs == new_prs) |
3050 | return 0; |
3051 | |
3052 | /* |
3053 | * Treat a previously invalid partition root as if it is a "member". |
3054 | */ |
	if (new_prs && is_prs_invalid(old_prs))
3056 | old_prs = PRS_MEMBER; |
3057 | |
	if (alloc_cpumasks(NULL, &tmpmask))
3059 | return -ENOMEM; |
3060 | |
3061 | /* |
3062 | * Setup effective_xcpus if not properly set yet, it will be cleared |
3063 | * later if partition becomes invalid. |
3064 | */ |
	if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
		spin_lock_irq(&callback_lock);
		cpumask_and(cs->effective_xcpus,
			    cs->cpus_allowed, parent->effective_xcpus);
		spin_unlock_irq(&callback_lock);
3070 | } |
3071 | |
3072 | err = update_partition_exclusive(cs, new_prs); |
3073 | if (err) |
3074 | goto out; |
3075 | |
3076 | if (!old_prs) { |
3077 | enum partition_cmd cmd = (new_prs == PRS_ROOT) |
3078 | ? partcmd_enable : partcmd_enablei; |
3079 | |
3080 | /* |
3081 | * cpus_allowed cannot be empty. |
3082 | */ |
		if (cpumask_empty(cs->cpus_allowed)) {
3084 | err = PERR_CPUSEMPTY; |
3085 | goto out; |
3086 | } |
3087 | |
		err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
3089 | /* |
3090 | * If an attempt to become local partition root fails, |
3091 | * try to become a remote partition root instead. |
3092 | */ |
		if (err && remote_partition_enable(cs, new_prs, &tmpmask))
3094 | err = 0; |
3095 | } else if (old_prs && new_prs) { |
3096 | /* |
3097 | * A change in load balance state only, no change in cpumasks. |
3098 | */ |
3099 | new_xcpus_state = true; |
3100 | } else { |
3101 | /* |
3102 | * Switching back to member is always allowed even if it |
3103 | * disables child partitions. |
3104 | */ |
		if (is_remote_partition(cs))
			remote_partition_disable(cs, &tmpmask);
		else
			update_parent_effective_cpumask(cs, partcmd_disable,
							NULL, &tmpmask);
3110 | |
3111 | /* |
3112 | * Invalidation of child partitions will be done in |
3113 | * update_cpumasks_hier(). |
3114 | */ |
3115 | } |
3116 | out: |
3117 | /* |
3118 | * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error |
3119 | * happens. |
3120 | */ |
3121 | if (err) { |
3122 | new_prs = -new_prs; |
3123 | update_partition_exclusive(cs, new_prs); |
3124 | } |
3125 | |
3126 | spin_lock_irq(lock: &callback_lock); |
3127 | cs->partition_root_state = new_prs; |
3128 | WRITE_ONCE(cs->prs_err, err); |
3129 | if (!is_partition_valid(cs)) |
3130 | reset_partition_data(cs); |
3131 | else if (new_xcpus_state) |
3132 | partition_xcpus_newstate(old_prs, new_prs, xcpus: cs->effective_xcpus); |
3133 | spin_unlock_irq(lock: &callback_lock); |
3134 | update_unbound_workqueue_cpumask(isolcpus_updated: new_xcpus_state); |
3135 | |
3136 | /* Force update if switching back to member */ |
3137 | update_cpumasks_hier(cs, tmp: &tmpmask, flags: !new_prs ? HIER_CHECKALL : 0); |
3138 | |
3139 | /* Update sched domains and load balance flag */ |
3140 | update_partition_sd_lb(cs, old_prs); |
3141 | |
3142 | notify_partition_change(cs, old_prs); |
3143 | free_cpumasks(NULL, tmp: &tmpmask); |
3144 | return 0; |
3145 | } |
3146 | |
3147 | /* |
3148 | * Frequency meter - How fast is some event occurring? |
3149 | * |
3150 | * These routines manage a digitally filtered, constant time based, |
3151 | * event frequency meter. There are four routines: |
3152 | * fmeter_init() - initialize a frequency meter. |
3153 | * fmeter_markevent() - called each time the event happens. |
3154 | * fmeter_getrate() - returns the recent rate of such events. |
3155 | * fmeter_update() - internal routine used to update fmeter. |
3156 | * |
3157 | * A common data structure is passed to each of these routines, |
3158 | * which is used to keep track of the state required to manage the |
3159 | * frequency meter and its digital filter. |
3160 | * |
3161 | * The filter works on the number of events marked per unit time. |
3162 | * The filter is single-pole low-pass recursive (IIR). The time unit |
3163 | * is 1 second. Arithmetic is done using 32-bit integers scaled to |
3164 | * simulate 3 decimal digits of precision (multiplied by 1000). |
3165 | * |
3166 | * With an FM_COEF of 933, and a time base of 1 second, the filter |
3167 | * has a half-life of 10 seconds, meaning that if the events quit |
3168 | * happening, then the rate returned from the fmeter_getrate() |
3169 | * will be cut in half each 10 seconds, until it converges to zero. |
3170 | * |
3171 | * It is not worth doing a real infinitely recursive filter. If more |
3172 | * than FM_MAXTICKS ticks have elapsed since the last filter event, |
3173 | * just compute FM_MAXTICKS ticks worth, by which point the level |
3174 | * will be stable. |
3175 | * |
3176 | * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid |
3177 | * arithmetic overflow in the fmeter_update() routine. |
3178 | * |
3179 | * Given the simple 32 bit integer arithmetic used, this meter works |
3180 | * best for reporting rates between one per millisecond (msec) and |
3181 | * one per 32 (approx) seconds. At constant rates faster than one |
3182 | * per msec it maxes out at values just under 1,000,000. At constant |
3183 | * rates between one per msec, and one per second it will stabilize |
3184 | * to a value N*1000, where N is the rate of events per second. |
3185 | * At constant rates between one per second and one per 32 seconds, |
3186 | * it will be choppy, moving up on the seconds that have an event, |
3187 | * and then decaying until the next event. At rates slower than |
3188 | * about one in 32 seconds, it decays all the way back to zero between |
3189 | * each event. |
3190 | */ |
3191 | |
3192 | #define FM_COEF 933 /* coefficient for half-life of 10 secs */ |
3193 | #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ |
3194 | #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ |
3195 | #define FM_SCALE 1000 /* faux fixed point scale */ |
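
/*
 * Worked example (illustrative only, not used by the code): each idle
 * second scales the filter value by FM_COEF/FM_SCALE = 0.933, so ten
 * idle seconds scale it by 0.933^10 ~= 0.50 - the 10 second half-life
 * quoted above.  A steady rate of N events per second adds
 * ((FM_SCALE - FM_COEF) * N * FM_SCALE) / FM_SCALE = 67 * N per second
 * and settles where decay balances input, i.e. at roughly N * 1000,
 * matching the "N*1000" behaviour described in the comment block above.
 */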
3196 | |
3197 | /* Initialize a frequency meter */ |
3198 | static void fmeter_init(struct fmeter *fmp) |
3199 | { |
3200 | fmp->cnt = 0; |
3201 | fmp->val = 0; |
3202 | fmp->time = 0; |
3203 | spin_lock_init(&fmp->lock); |
3204 | } |
3205 | |
3206 | /* Internal meter update - process cnt events and update value */ |
3207 | static void fmeter_update(struct fmeter *fmp) |
3208 | { |
3209 | time64_t now; |
3210 | u32 ticks; |
3211 | |
3212 | now = ktime_get_seconds(); |
3213 | ticks = now - fmp->time; |
3214 | |
3215 | if (ticks == 0) |
3216 | return; |
3217 | |
3218 | ticks = min(FM_MAXTICKS, ticks); |
3219 | while (ticks-- > 0) |
3220 | fmp->val = (FM_COEF * fmp->val) / FM_SCALE; |
3221 | fmp->time = now; |
3222 | |
3223 | fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; |
3224 | fmp->cnt = 0; |
3225 | } |
3226 | |
3227 | /* Process any previous ticks, then bump cnt by one (times scale). */ |
3228 | static void fmeter_markevent(struct fmeter *fmp) |
3229 | { |
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
3234 | } |
3235 | |
3236 | /* Process any previous ticks, then return current value. */ |
3237 | static int fmeter_getrate(struct fmeter *fmp) |
3238 | { |
3239 | int val; |
3240 | |
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
3245 | return val; |
3246 | } |
3247 | |
3248 | static struct cpuset *cpuset_attach_old_cs; |
3249 | |
3250 | /* |
3251 | * Check to see if a cpuset can accept a new task |
3252 | * For v1, cpus_allowed and mems_allowed can't be empty. |
3253 | * For v2, effective_cpus can't be empty. |
3254 | * Note that in v1, effective_cpus = cpus_allowed. |
3255 | */ |
3256 | static int cpuset_can_attach_check(struct cpuset *cs) |
3257 | { |
	if (cpumask_empty(cs->effective_cpus) ||
	    (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
3260 | return -ENOSPC; |
3261 | return 0; |
3262 | } |
3263 | |
3264 | static void reset_migrate_dl_data(struct cpuset *cs) |
3265 | { |
3266 | cs->nr_migrate_dl_tasks = 0; |
3267 | cs->sum_migrate_dl_bw = 0; |
3268 | } |
3269 | |
3270 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
3271 | static int cpuset_can_attach(struct cgroup_taskset *tset) |
3272 | { |
3273 | struct cgroup_subsys_state *css; |
3274 | struct cpuset *cs, *oldcs; |
3275 | struct task_struct *task; |
3276 | bool cpus_updated, mems_updated; |
3277 | int ret; |
3278 | |
3279 | /* used later by cpuset_attach() */ |
3280 | cpuset_attach_old_cs = task_cs(task: cgroup_taskset_first(tset, dst_cssp: &css)); |
3281 | oldcs = cpuset_attach_old_cs; |
3282 | cs = css_cs(css); |
3283 | |
3284 | mutex_lock(&cpuset_mutex); |
3285 | |
3286 | /* Check to see if task is allowed in the cpuset */ |
3287 | ret = cpuset_can_attach_check(cs); |
3288 | if (ret) |
3289 | goto out_unlock; |
3290 | |
3291 | cpus_updated = !cpumask_equal(src1p: cs->effective_cpus, src2p: oldcs->effective_cpus); |
3292 | mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); |
3293 | |
3294 | cgroup_taskset_for_each(task, css, tset) { |
3295 | ret = task_can_attach(p: task); |
3296 | if (ret) |
3297 | goto out_unlock; |
3298 | |
3299 | /* |
		 * Skip the rights-over-task check in v2 when nothing changes;
		 * migration permission derives from hierarchy ownership in
		 * cgroup_procs_write_permission().
3303 | */ |
3304 | if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || |
3305 | (cpus_updated || mems_updated)) { |
3306 | ret = security_task_setscheduler(p: task); |
3307 | if (ret) |
3308 | goto out_unlock; |
3309 | } |
3310 | |
3311 | if (dl_task(p: task)) { |
3312 | cs->nr_migrate_dl_tasks++; |
3313 | cs->sum_migrate_dl_bw += task->dl.dl_bw; |
3314 | } |
3315 | } |
3316 | |
3317 | if (!cs->nr_migrate_dl_tasks) |
3318 | goto out_success; |
3319 | |
3320 | if (!cpumask_intersects(src1p: oldcs->effective_cpus, src2p: cs->effective_cpus)) { |
3321 | int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus); |
3322 | |
3323 | if (unlikely(cpu >= nr_cpu_ids)) { |
3324 | reset_migrate_dl_data(cs); |
3325 | ret = -EINVAL; |
3326 | goto out_unlock; |
3327 | } |
3328 | |
3329 | ret = dl_bw_alloc(cpu, dl_bw: cs->sum_migrate_dl_bw); |
3330 | if (ret) { |
3331 | reset_migrate_dl_data(cs); |
3332 | goto out_unlock; |
3333 | } |
3334 | } |
3335 | |
3336 | out_success: |
3337 | /* |
	 * Mark that an attach is in progress. This makes validate_change()
	 * fail changes which zero cpus/mems_allowed.
3340 | */ |
3341 | cs->attach_in_progress++; |
3342 | out_unlock: |
3343 | mutex_unlock(lock: &cpuset_mutex); |
3344 | return ret; |
3345 | } |
3346 | |
3347 | static void cpuset_cancel_attach(struct cgroup_taskset *tset) |
3348 | { |
3349 | struct cgroup_subsys_state *css; |
3350 | struct cpuset *cs; |
3351 | |
3352 | cgroup_taskset_first(tset, dst_cssp: &css); |
3353 | cs = css_cs(css); |
3354 | |
3355 | mutex_lock(&cpuset_mutex); |
3356 | cs->attach_in_progress--; |
3357 | if (!cs->attach_in_progress) |
3358 | wake_up(&cpuset_attach_wq); |
3359 | |
3360 | if (cs->nr_migrate_dl_tasks) { |
3361 | int cpu = cpumask_any(cs->effective_cpus); |
3362 | |
3363 | dl_bw_free(cpu, dl_bw: cs->sum_migrate_dl_bw); |
3364 | reset_migrate_dl_data(cs); |
3365 | } |
3366 | |
3367 | mutex_unlock(lock: &cpuset_mutex); |
3368 | } |
3369 | |
3370 | /* |
3371 | * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() |
 * but we can't allocate it dynamically there. Define it as a global and
 * allocate it in cpuset_init().
3374 | */ |
3375 | static cpumask_var_t cpus_attach; |
3376 | static nodemask_t cpuset_attach_nodemask_to; |
3377 | |
3378 | static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) |
3379 | { |
3380 | lockdep_assert_held(&cpuset_mutex); |
3381 | |
3382 | if (cs != &top_cpuset) |
3383 | guarantee_online_cpus(tsk: task, pmask: cpus_attach); |
3384 | else |
3385 | cpumask_andnot(dstp: cpus_attach, task_cpu_possible_mask(task), |
3386 | src2p: subpartitions_cpus); |
3387 | /* |
3388 | * can_attach beforehand should guarantee that this doesn't |
3389 | * fail. TODO: have a better way to handle failure here |
3390 | */ |
3391 | WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); |
3392 | |
3393 | cpuset_change_task_nodemask(tsk: task, newmems: &cpuset_attach_nodemask_to); |
3394 | cpuset_update_task_spread_flags(cs, tsk: task); |
3395 | } |
3396 | |
3397 | static void cpuset_attach(struct cgroup_taskset *tset) |
3398 | { |
3399 | struct task_struct *task; |
3400 | struct task_struct *leader; |
3401 | struct cgroup_subsys_state *css; |
3402 | struct cpuset *cs; |
3403 | struct cpuset *oldcs = cpuset_attach_old_cs; |
3404 | bool cpus_updated, mems_updated; |
3405 | |
3406 | cgroup_taskset_first(tset, dst_cssp: &css); |
3407 | cs = css_cs(css); |
3408 | |
3409 | lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ |
3410 | mutex_lock(&cpuset_mutex); |
3411 | cpus_updated = !cpumask_equal(src1p: cs->effective_cpus, |
3412 | src2p: oldcs->effective_cpus); |
3413 | mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); |
3414 | |
3415 | /* |
3416 | * In the default hierarchy, enabling cpuset in the child cgroups |
3417 | * will trigger a number of cpuset_attach() calls with no change |
3418 | * in effective cpus and mems. In that case, we can optimize out |
3419 | * by skipping the task iteration and update. |
3420 | */ |
3421 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
3422 | !cpus_updated && !mems_updated) { |
3423 | cpuset_attach_nodemask_to = cs->effective_mems; |
3424 | goto out; |
3425 | } |
3426 | |
3427 | guarantee_online_mems(cs, pmask: &cpuset_attach_nodemask_to); |
3428 | |
3429 | cgroup_taskset_for_each(task, css, tset) |
3430 | cpuset_attach_task(cs, task); |
3431 | |
3432 | /* |
	 * Change mm for all threadgroup leaders. This is expensive and may
	 * sleep, and should be moved outside the migration path proper. Skip it
3435 | * if there is no change in effective_mems and CS_MEMORY_MIGRATE is |
3436 | * not set. |
3437 | */ |
3438 | cpuset_attach_nodemask_to = cs->effective_mems; |
3439 | if (!is_memory_migrate(cs) && !mems_updated) |
3440 | goto out; |
3441 | |
3442 | cgroup_taskset_for_each_leader(leader, css, tset) { |
3443 | struct mm_struct *mm = get_task_mm(task: leader); |
3444 | |
3445 | if (mm) { |
3446 | mpol_rebind_mm(mm, new: &cpuset_attach_nodemask_to); |
3447 | |
3448 | /* |
			 * old_mems_allowed is the same as mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug. In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodemask to
			 * migrate the mm from.
3455 | */ |
3456 | if (is_memory_migrate(cs)) |
3457 | cpuset_migrate_mm(mm, from: &oldcs->old_mems_allowed, |
3458 | to: &cpuset_attach_nodemask_to); |
3459 | else |
3460 | mmput(mm); |
3461 | } |
3462 | } |
3463 | |
3464 | out: |
3465 | cs->old_mems_allowed = cpuset_attach_nodemask_to; |
3466 | |
3467 | if (cs->nr_migrate_dl_tasks) { |
3468 | cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks; |
3469 | oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks; |
3470 | reset_migrate_dl_data(cs); |
3471 | } |
3472 | |
3473 | cs->attach_in_progress--; |
3474 | if (!cs->attach_in_progress) |
3475 | wake_up(&cpuset_attach_wq); |
3476 | |
3477 | mutex_unlock(lock: &cpuset_mutex); |
3478 | } |
3479 | |
3480 | /* The various types of files and directories in a cpuset file system */ |
3481 | |
3482 | typedef enum { |
3483 | FILE_MEMORY_MIGRATE, |
3484 | FILE_CPULIST, |
3485 | FILE_MEMLIST, |
3486 | FILE_EFFECTIVE_CPULIST, |
3487 | FILE_EFFECTIVE_MEMLIST, |
3488 | FILE_SUBPARTS_CPULIST, |
3489 | FILE_EXCLUSIVE_CPULIST, |
3490 | FILE_EFFECTIVE_XCPULIST, |
3491 | FILE_ISOLATED_CPULIST, |
3492 | FILE_CPU_EXCLUSIVE, |
3493 | FILE_MEM_EXCLUSIVE, |
3494 | FILE_MEM_HARDWALL, |
3495 | FILE_SCHED_LOAD_BALANCE, |
3496 | FILE_PARTITION_ROOT, |
3497 | FILE_SCHED_RELAX_DOMAIN_LEVEL, |
3498 | FILE_MEMORY_PRESSURE_ENABLED, |
3499 | FILE_MEMORY_PRESSURE, |
3500 | FILE_SPREAD_PAGE, |
3501 | FILE_SPREAD_SLAB, |
3502 | } cpuset_filetype_t; |
3503 | |
3504 | static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, |
3505 | u64 val) |
3506 | { |
3507 | struct cpuset *cs = css_cs(css); |
3508 | cpuset_filetype_t type = cft->private; |
3509 | int retval = 0; |
3510 | |
3511 | cpus_read_lock(); |
3512 | mutex_lock(&cpuset_mutex); |
3513 | if (!is_cpuset_online(cs)) { |
3514 | retval = -ENODEV; |
3515 | goto out_unlock; |
3516 | } |
3517 | |
3518 | switch (type) { |
3519 | case FILE_CPU_EXCLUSIVE: |
3520 | retval = update_flag(bit: CS_CPU_EXCLUSIVE, cs, turning_on: val); |
3521 | break; |
3522 | case FILE_MEM_EXCLUSIVE: |
3523 | retval = update_flag(bit: CS_MEM_EXCLUSIVE, cs, turning_on: val); |
3524 | break; |
3525 | case FILE_MEM_HARDWALL: |
3526 | retval = update_flag(bit: CS_MEM_HARDWALL, cs, turning_on: val); |
3527 | break; |
3528 | case FILE_SCHED_LOAD_BALANCE: |
3529 | retval = update_flag(bit: CS_SCHED_LOAD_BALANCE, cs, turning_on: val); |
3530 | break; |
3531 | case FILE_MEMORY_MIGRATE: |
3532 | retval = update_flag(bit: CS_MEMORY_MIGRATE, cs, turning_on: val); |
3533 | break; |
3534 | case FILE_MEMORY_PRESSURE_ENABLED: |
3535 | cpuset_memory_pressure_enabled = !!val; |
3536 | break; |
3537 | case FILE_SPREAD_PAGE: |
3538 | retval = update_flag(bit: CS_SPREAD_PAGE, cs, turning_on: val); |
3539 | break; |
3540 | case FILE_SPREAD_SLAB: |
3541 | retval = update_flag(bit: CS_SPREAD_SLAB, cs, turning_on: val); |
3542 | break; |
3543 | default: |
3544 | retval = -EINVAL; |
3545 | break; |
3546 | } |
3547 | out_unlock: |
3548 | mutex_unlock(lock: &cpuset_mutex); |
3549 | cpus_read_unlock(); |
3550 | return retval; |
3551 | } |
3552 | |
3553 | static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, |
3554 | s64 val) |
3555 | { |
3556 | struct cpuset *cs = css_cs(css); |
3557 | cpuset_filetype_t type = cft->private; |
3558 | int retval = -ENODEV; |
3559 | |
3560 | cpus_read_lock(); |
3561 | mutex_lock(&cpuset_mutex); |
3562 | if (!is_cpuset_online(cs)) |
3563 | goto out_unlock; |
3564 | |
3565 | switch (type) { |
3566 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
3567 | retval = update_relax_domain_level(cs, val); |
3568 | break; |
3569 | default: |
3570 | retval = -EINVAL; |
3571 | break; |
3572 | } |
3573 | out_unlock: |
3574 | mutex_unlock(lock: &cpuset_mutex); |
3575 | cpus_read_unlock(); |
3576 | return retval; |
3577 | } |
3578 | |
3579 | /* |
3580 | * Common handling for a write to a "cpus" or "mems" file. |
3581 | */ |
3582 | static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, |
3583 | char *buf, size_t nbytes, loff_t off) |
3584 | { |
3585 | struct cpuset *cs = css_cs(css: of_css(of)); |
3586 | struct cpuset *trialcs; |
3587 | int retval = -ENODEV; |
3588 | |
3589 | buf = strstrip(str: buf); |
3590 | |
3591 | /* |
3592 | * CPU or memory hotunplug may leave @cs w/o any execution |
3593 | * resources, in which case the hotplug code asynchronously updates |
3594 | * configuration and transfers all tasks to the nearest ancestor |
3595 | * which can execute. |
3596 | * |
3597 | * As writes to "cpus" or "mems" may restore @cs's execution |
3598 | * resources, wait for the previously scheduled operations before |
	 * proceeding, so that we don't keep removing tasks added
3600 | * after execution capability is restored. |
3601 | * |
3602 | * cpuset_hotplug_work calls back into cgroup core via |
3603 | * cgroup_transfer_tasks() and waiting for it from a cgroupfs |
3604 | * operation like this one can lead to a deadlock through kernfs |
3605 | * active_ref protection. Let's break the protection. Losing the |
3606 | * protection is okay as we check whether @cs is online after |
3607 | * grabbing cpuset_mutex anyway. This only happens on the legacy |
3608 | * hierarchies. |
3609 | */ |
3610 | css_get(css: &cs->css); |
3611 | kernfs_break_active_protection(kn: of->kn); |
3612 | flush_work(work: &cpuset_hotplug_work); |
3613 | |
3614 | cpus_read_lock(); |
3615 | mutex_lock(&cpuset_mutex); |
3616 | if (!is_cpuset_online(cs)) |
3617 | goto out_unlock; |
3618 | |
3619 | trialcs = alloc_trial_cpuset(cs); |
3620 | if (!trialcs) { |
3621 | retval = -ENOMEM; |
3622 | goto out_unlock; |
3623 | } |
3624 | |
3625 | switch (of_cft(of)->private) { |
3626 | case FILE_CPULIST: |
3627 | retval = update_cpumask(cs, trialcs, buf); |
3628 | break; |
3629 | case FILE_EXCLUSIVE_CPULIST: |
3630 | retval = update_exclusive_cpumask(cs, trialcs, buf); |
3631 | break; |
3632 | case FILE_MEMLIST: |
3633 | retval = update_nodemask(cs, trialcs, buf); |
3634 | break; |
3635 | default: |
3636 | retval = -EINVAL; |
3637 | break; |
3638 | } |
3639 | |
3640 | free_cpuset(cs: trialcs); |
3641 | out_unlock: |
3642 | mutex_unlock(lock: &cpuset_mutex); |
3643 | cpus_read_unlock(); |
3644 | kernfs_unbreak_active_protection(kn: of->kn); |
3645 | css_put(css: &cs->css); |
3646 | flush_workqueue(cpuset_migrate_mm_wq); |
3647 | return retval ?: nbytes; |
3648 | } |
3649 | |
3650 | /* |
3651 | * These ascii lists should be read in a single call, by using a user |
3652 | * buffer large enough to hold the entire map. If read in smaller |
3653 | * chunks, there is no guarantee of atomicity. Since the display format |
3654 | * used, list of ranges of sequential numbers, is variable length, |
3655 | * and since these maps can change value dynamically, one could read |
3656 | * gibberish by doing partial reads while a list was changing. |
3657 | */ |
3658 | static int cpuset_common_seq_show(struct seq_file *sf, void *v) |
3659 | { |
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	int ret = 0;

	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
		break;
	case FILE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
		break;
	case FILE_EFFECTIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
		break;
	case FILE_EFFECTIVE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
		break;
	case FILE_EXCLUSIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
		break;
	case FILE_EFFECTIVE_XCPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
		break;
	case FILE_SUBPARTS_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
		break;
	case FILE_ISOLATED_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock_irq(&callback_lock);
3696 | return ret; |
3697 | } |
3698 | |
3699 | static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) |
3700 | { |
3701 | struct cpuset *cs = css_cs(css); |
3702 | cpuset_filetype_t type = cft->private; |
3703 | switch (type) { |
3704 | case FILE_CPU_EXCLUSIVE: |
3705 | return is_cpu_exclusive(cs); |
3706 | case FILE_MEM_EXCLUSIVE: |
3707 | return is_mem_exclusive(cs); |
3708 | case FILE_MEM_HARDWALL: |
3709 | return is_mem_hardwall(cs); |
3710 | case FILE_SCHED_LOAD_BALANCE: |
3711 | return is_sched_load_balance(cs); |
3712 | case FILE_MEMORY_MIGRATE: |
3713 | return is_memory_migrate(cs); |
3714 | case FILE_MEMORY_PRESSURE_ENABLED: |
3715 | return cpuset_memory_pressure_enabled; |
3716 | case FILE_MEMORY_PRESSURE: |
3717 | return fmeter_getrate(fmp: &cs->fmeter); |
3718 | case FILE_SPREAD_PAGE: |
3719 | return is_spread_page(cs); |
3720 | case FILE_SPREAD_SLAB: |
3721 | return is_spread_slab(cs); |
3722 | default: |
3723 | BUG(); |
3724 | } |
3725 | |
3726 | /* Unreachable but makes gcc happy */ |
3727 | return 0; |
3728 | } |
3729 | |
3730 | static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) |
3731 | { |
3732 | struct cpuset *cs = css_cs(css); |
3733 | cpuset_filetype_t type = cft->private; |
3734 | switch (type) { |
3735 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
3736 | return cs->relax_domain_level; |
3737 | default: |
3738 | BUG(); |
3739 | } |
3740 | |
3741 | /* Unreachable but makes gcc happy */ |
3742 | return 0; |
3743 | } |
3744 | |
3745 | static int sched_partition_show(struct seq_file *seq, void *v) |
3746 | { |
	struct cpuset *cs = css_cs(seq_css(seq));
	const char *err, *type = NULL;

	switch (cs->partition_root_state) {
	case PRS_ROOT:
		seq_puts(seq, "root\n");
		break;
	case PRS_ISOLATED:
		seq_puts(seq, "isolated\n");
		break;
	case PRS_MEMBER:
		seq_puts(seq, "member\n");
		break;
	case PRS_INVALID_ROOT:
		type = "root";
		fallthrough;
	case PRS_INVALID_ISOLATED:
		if (!type)
			type = "isolated";
		err = perr_strings[READ_ONCE(cs->prs_err)];
		if (err)
			seq_printf(seq, "%s invalid (%s)\n", type, err);
		else
			seq_printf(seq, "%s invalid\n", type);
3771 | break; |
3772 | } |
3773 | return 0; |
3774 | } |
3775 | |
3776 | static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, |
3777 | size_t nbytes, loff_t off) |
3778 | { |
	struct cpuset *cs = css_cs(of_css(of));
	int val;
	int retval = -ENODEV;

	buf = strstrip(buf);
3784 | |
3785 | /* |
3786 | * Convert "root" to ENABLED, and convert "member" to DISABLED. |
3787 | */ |
3788 | if (!strcmp(buf, "root" )) |
3789 | val = PRS_ROOT; |
3790 | else if (!strcmp(buf, "member" )) |
3791 | val = PRS_MEMBER; |
3792 | else if (!strcmp(buf, "isolated" )) |
3793 | val = PRS_ISOLATED; |
3794 | else |
3795 | return -EINVAL; |
3796 | |
	css_get(&cs->css);
	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	retval = update_prstate(cs, val);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
	css_put(&cs->css);
3808 | return retval ?: nbytes; |
3809 | } |
3810 | |
3811 | /* |
3812 | * for the common functions, 'private' gives the type of file |
3813 | */ |
3814 | |
3815 | static struct cftype legacy_files[] = { |
3816 | { |
3817 | .name = "cpus" , |
3818 | .seq_show = cpuset_common_seq_show, |
3819 | .write = cpuset_write_resmask, |
3820 | .max_write_len = (100U + 6 * NR_CPUS), |
3821 | .private = FILE_CPULIST, |
3822 | }, |
3823 | |
3824 | { |
3825 | .name = "mems" , |
3826 | .seq_show = cpuset_common_seq_show, |
3827 | .write = cpuset_write_resmask, |
3828 | .max_write_len = (100U + 6 * MAX_NUMNODES), |
3829 | .private = FILE_MEMLIST, |
3830 | }, |
3831 | |
3832 | { |
3833 | .name = "effective_cpus" , |
3834 | .seq_show = cpuset_common_seq_show, |
3835 | .private = FILE_EFFECTIVE_CPULIST, |
3836 | }, |
3837 | |
3838 | { |
3839 | .name = "effective_mems" , |
3840 | .seq_show = cpuset_common_seq_show, |
3841 | .private = FILE_EFFECTIVE_MEMLIST, |
3842 | }, |
3843 | |
3844 | { |
3845 | .name = "cpu_exclusive" , |
3846 | .read_u64 = cpuset_read_u64, |
3847 | .write_u64 = cpuset_write_u64, |
3848 | .private = FILE_CPU_EXCLUSIVE, |
3849 | }, |
3850 | |
3851 | { |
3852 | .name = "mem_exclusive" , |
3853 | .read_u64 = cpuset_read_u64, |
3854 | .write_u64 = cpuset_write_u64, |
3855 | .private = FILE_MEM_EXCLUSIVE, |
3856 | }, |
3857 | |
3858 | { |
3859 | .name = "mem_hardwall" , |
3860 | .read_u64 = cpuset_read_u64, |
3861 | .write_u64 = cpuset_write_u64, |
3862 | .private = FILE_MEM_HARDWALL, |
3863 | }, |
3864 | |
3865 | { |
3866 | .name = "sched_load_balance" , |
3867 | .read_u64 = cpuset_read_u64, |
3868 | .write_u64 = cpuset_write_u64, |
3869 | .private = FILE_SCHED_LOAD_BALANCE, |
3870 | }, |
3871 | |
3872 | { |
3873 | .name = "sched_relax_domain_level" , |
3874 | .read_s64 = cpuset_read_s64, |
3875 | .write_s64 = cpuset_write_s64, |
3876 | .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, |
3877 | }, |
3878 | |
3879 | { |
3880 | .name = "memory_migrate" , |
3881 | .read_u64 = cpuset_read_u64, |
3882 | .write_u64 = cpuset_write_u64, |
3883 | .private = FILE_MEMORY_MIGRATE, |
3884 | }, |
3885 | |
3886 | { |
3887 | .name = "memory_pressure" , |
3888 | .read_u64 = cpuset_read_u64, |
3889 | .private = FILE_MEMORY_PRESSURE, |
3890 | }, |
3891 | |
3892 | { |
3893 | .name = "memory_spread_page" , |
3894 | .read_u64 = cpuset_read_u64, |
3895 | .write_u64 = cpuset_write_u64, |
3896 | .private = FILE_SPREAD_PAGE, |
3897 | }, |
3898 | |
3899 | { |
3900 | /* obsolete, may be removed in the future */ |
3901 | .name = "memory_spread_slab" , |
3902 | .read_u64 = cpuset_read_u64, |
3903 | .write_u64 = cpuset_write_u64, |
3904 | .private = FILE_SPREAD_SLAB, |
3905 | }, |
3906 | |
3907 | { |
3908 | .name = "memory_pressure_enabled" , |
3909 | .flags = CFTYPE_ONLY_ON_ROOT, |
3910 | .read_u64 = cpuset_read_u64, |
3911 | .write_u64 = cpuset_write_u64, |
3912 | .private = FILE_MEMORY_PRESSURE_ENABLED, |
3913 | }, |
3914 | |
3915 | { } /* terminate */ |
3916 | }; |
3917 | |
3918 | /* |
3919 | * This is currently a minimal set for the default hierarchy. It can be |
3920 | * expanded later on by migrating more features and control files from v1. |
3921 | */ |
3922 | static struct cftype dfl_files[] = { |
3923 | { |
3924 | .name = "cpus" , |
3925 | .seq_show = cpuset_common_seq_show, |
3926 | .write = cpuset_write_resmask, |
3927 | .max_write_len = (100U + 6 * NR_CPUS), |
3928 | .private = FILE_CPULIST, |
3929 | .flags = CFTYPE_NOT_ON_ROOT, |
3930 | }, |
3931 | |
3932 | { |
3933 | .name = "mems" , |
3934 | .seq_show = cpuset_common_seq_show, |
3935 | .write = cpuset_write_resmask, |
3936 | .max_write_len = (100U + 6 * MAX_NUMNODES), |
3937 | .private = FILE_MEMLIST, |
3938 | .flags = CFTYPE_NOT_ON_ROOT, |
3939 | }, |
3940 | |
3941 | { |
3942 | .name = "cpus.effective" , |
3943 | .seq_show = cpuset_common_seq_show, |
3944 | .private = FILE_EFFECTIVE_CPULIST, |
3945 | }, |
3946 | |
3947 | { |
3948 | .name = "mems.effective" , |
3949 | .seq_show = cpuset_common_seq_show, |
3950 | .private = FILE_EFFECTIVE_MEMLIST, |
3951 | }, |
3952 | |
3953 | { |
3954 | .name = "cpus.partition" , |
3955 | .seq_show = sched_partition_show, |
3956 | .write = sched_partition_write, |
3957 | .private = FILE_PARTITION_ROOT, |
3958 | .flags = CFTYPE_NOT_ON_ROOT, |
3959 | .file_offset = offsetof(struct cpuset, partition_file), |
3960 | }, |
3961 | |
3962 | { |
3963 | .name = "cpus.exclusive" , |
3964 | .seq_show = cpuset_common_seq_show, |
3965 | .write = cpuset_write_resmask, |
3966 | .max_write_len = (100U + 6 * NR_CPUS), |
3967 | .private = FILE_EXCLUSIVE_CPULIST, |
3968 | .flags = CFTYPE_NOT_ON_ROOT, |
3969 | }, |
3970 | |
3971 | { |
3972 | .name = "cpus.exclusive.effective" , |
3973 | .seq_show = cpuset_common_seq_show, |
3974 | .private = FILE_EFFECTIVE_XCPULIST, |
3975 | .flags = CFTYPE_NOT_ON_ROOT, |
3976 | }, |
3977 | |
3978 | { |
3979 | .name = "cpus.subpartitions" , |
3980 | .seq_show = cpuset_common_seq_show, |
3981 | .private = FILE_SUBPARTS_CPULIST, |
3982 | .flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG, |
3983 | }, |
3984 | |
3985 | { |
3986 | .name = "cpus.isolated" , |
3987 | .seq_show = cpuset_common_seq_show, |
3988 | .private = FILE_ISOLATED_CPULIST, |
3989 | .flags = CFTYPE_ONLY_ON_ROOT, |
3990 | }, |
3991 | |
3992 | { } /* terminate */ |
3993 | }; |
3994 | |
3995 | |
3996 | /** |
3997 | * cpuset_css_alloc - Allocate a cpuset css |
3998 | * @parent_css: Parent css of the control group that the new cpuset will be |
3999 | * part of |
4000 | * Return: cpuset css on success, -ENOMEM on failure. |
4001 | * |
4002 | * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return |
4003 | * top cpuset css otherwise. |
4004 | */ |
4005 | static struct cgroup_subsys_state * |
4006 | cpuset_css_alloc(struct cgroup_subsys_state *parent_css) |
4007 | { |
4008 | struct cpuset *cs; |
4009 | |
4010 | if (!parent_css) |
4011 | return &top_cpuset.css; |
4012 | |
4013 | cs = kzalloc(size: sizeof(*cs), GFP_KERNEL); |
4014 | if (!cs) |
4015 | return ERR_PTR(error: -ENOMEM); |
4016 | |
4017 | if (alloc_cpumasks(cs, NULL)) { |
4018 | kfree(objp: cs); |
4019 | return ERR_PTR(error: -ENOMEM); |
4020 | } |
4021 | |
4022 | __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
4023 | nodes_clear(cs->mems_allowed); |
4024 | nodes_clear(cs->effective_mems); |
4025 | fmeter_init(fmp: &cs->fmeter); |
4026 | cs->relax_domain_level = -1; |
4027 | INIT_LIST_HEAD(list: &cs->remote_sibling); |
4028 | |
4029 | /* Set CS_MEMORY_MIGRATE for default hierarchy */ |
4030 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) |
4031 | __set_bit(CS_MEMORY_MIGRATE, &cs->flags); |
4032 | |
4033 | return &cs->css; |
4034 | } |
4035 | |
4036 | static int cpuset_css_online(struct cgroup_subsys_state *css) |
4037 | { |
4038 | struct cpuset *cs = css_cs(css); |
4039 | struct cpuset *parent = parent_cs(cs); |
4040 | struct cpuset *tmp_cs; |
4041 | struct cgroup_subsys_state *pos_css; |
4042 | |
4043 | if (!parent) |
4044 | return 0; |
4045 | |
4046 | cpus_read_lock(); |
4047 | mutex_lock(&cpuset_mutex); |
4048 | |
4049 | set_bit(nr: CS_ONLINE, addr: &cs->flags); |
4050 | if (is_spread_page(cs: parent)) |
4051 | set_bit(nr: CS_SPREAD_PAGE, addr: &cs->flags); |
4052 | if (is_spread_slab(cs: parent)) |
4053 | set_bit(nr: CS_SPREAD_SLAB, addr: &cs->flags); |
4054 | |
4055 | cpuset_inc(); |
4056 | |
4057 | spin_lock_irq(lock: &callback_lock); |
4058 | if (is_in_v2_mode()) { |
4059 | cpumask_copy(dstp: cs->effective_cpus, srcp: parent->effective_cpus); |
4060 | cs->effective_mems = parent->effective_mems; |
4061 | cs->use_parent_ecpus = true; |
4062 | parent->child_ecpus_count++; |
4063 | /* |
4064 | * Clear CS_SCHED_LOAD_BALANCE if parent is isolated |
4065 | */ |
4066 | if (!is_sched_load_balance(cs: parent)) |
4067 | clear_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cs->flags); |
4068 | } |
4069 | |
4070 | /* |
4071 | * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated |
4072 | */ |
4073 | if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
4074 | !is_sched_load_balance(cs: parent)) |
4075 | clear_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cs->flags); |
4076 | |
4077 | spin_unlock_irq(lock: &callback_lock); |
4078 | |
4079 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) |
4080 | goto out_unlock; |
4081 | |
4082 | /* |
4083 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is |
4084 | * set. This flag handling is implemented in cgroup core for |
4085 | * historical reasons - the flag may be specified during mount. |
4086 | * |
4087 | * Currently, if any sibling cpusets have exclusive cpus or mem, we |
	 * refuse to clone the configuration - thereby refusing to let the
	 * task enter, and as a result failing the sys_unshare() or
	 * clone() which initiated it. If this becomes a problem for some
4091 | * users who wish to allow that scenario, then this could be |
4092 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive |
4093 | * (and likewise for mems) to the new cgroup. |
4094 | */ |
4095 | rcu_read_lock(); |
4096 | cpuset_for_each_child(tmp_cs, pos_css, parent) { |
4097 | if (is_mem_exclusive(cs: tmp_cs) || is_cpu_exclusive(cs: tmp_cs)) { |
4098 | rcu_read_unlock(); |
4099 | goto out_unlock; |
4100 | } |
4101 | } |
4102 | rcu_read_unlock(); |
4103 | |
4104 | spin_lock_irq(lock: &callback_lock); |
4105 | cs->mems_allowed = parent->mems_allowed; |
4106 | cs->effective_mems = parent->mems_allowed; |
4107 | cpumask_copy(dstp: cs->cpus_allowed, srcp: parent->cpus_allowed); |
4108 | cpumask_copy(dstp: cs->effective_cpus, srcp: parent->cpus_allowed); |
4109 | spin_unlock_irq(lock: &callback_lock); |
4110 | out_unlock: |
4111 | mutex_unlock(lock: &cpuset_mutex); |
4112 | cpus_read_unlock(); |
4113 | return 0; |
4114 | } |
4115 | |
4116 | /* |
4117 | * If the cpuset being removed has its flag 'sched_load_balance' |
4118 | * enabled, then simulate turning sched_load_balance off, which |
4119 | * will call rebuild_sched_domains_locked(). That is not needed |
4120 | * in the default hierarchy where only changes in partition |
4121 | * will cause repartitioning. |
4122 | * |
4123 | * If the cpuset has the 'sched.partition' flag enabled, simulate |
 * turning 'sched.partition' off.
4125 | */ |
4126 | |
4127 | static void cpuset_css_offline(struct cgroup_subsys_state *css) |
4128 | { |
4129 | struct cpuset *cs = css_cs(css); |
4130 | |
4131 | cpus_read_lock(); |
4132 | mutex_lock(&cpuset_mutex); |
4133 | |
4134 | if (is_partition_valid(cs)) |
4135 | update_prstate(cs, new_prs: 0); |
4136 | |
4137 | if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && |
4138 | is_sched_load_balance(cs)) |
4139 | update_flag(bit: CS_SCHED_LOAD_BALANCE, cs, turning_on: 0); |
4140 | |
4141 | if (cs->use_parent_ecpus) { |
4142 | struct cpuset *parent = parent_cs(cs); |
4143 | |
4144 | cs->use_parent_ecpus = false; |
4145 | parent->child_ecpus_count--; |
4146 | } |
4147 | |
4148 | cpuset_dec(); |
4149 | clear_bit(nr: CS_ONLINE, addr: &cs->flags); |
4150 | |
4151 | mutex_unlock(lock: &cpuset_mutex); |
4152 | cpus_read_unlock(); |
4153 | } |
4154 | |
4155 | static void cpuset_css_free(struct cgroup_subsys_state *css) |
4156 | { |
4157 | struct cpuset *cs = css_cs(css); |
4158 | |
4159 | free_cpuset(cs); |
4160 | } |
4161 | |
4162 | static void cpuset_bind(struct cgroup_subsys_state *root_css) |
4163 | { |
4164 | mutex_lock(&cpuset_mutex); |
4165 | spin_lock_irq(lock: &callback_lock); |
4166 | |
4167 | if (is_in_v2_mode()) { |
4168 | cpumask_copy(dstp: top_cpuset.cpus_allowed, cpu_possible_mask); |
4169 | cpumask_copy(dstp: top_cpuset.effective_xcpus, cpu_possible_mask); |
4170 | top_cpuset.mems_allowed = node_possible_map; |
4171 | } else { |
4172 | cpumask_copy(dstp: top_cpuset.cpus_allowed, |
4173 | srcp: top_cpuset.effective_cpus); |
4174 | top_cpuset.mems_allowed = top_cpuset.effective_mems; |
4175 | } |
4176 | |
4177 | spin_unlock_irq(lock: &callback_lock); |
4178 | mutex_unlock(lock: &cpuset_mutex); |
4179 | } |
4180 | |
4181 | /* |
4182 | * In case the child is cloned into a cpuset different from its parent, |
4183 | * additional checks are done to see if the move is allowed. |
4184 | */ |
4185 | static int cpuset_can_fork(struct task_struct *task, struct css_set *cset) |
4186 | { |
4187 | struct cpuset *cs = css_cs(css: cset->subsys[cpuset_cgrp_id]); |
4188 | bool same_cs; |
4189 | int ret; |
4190 | |
4191 | rcu_read_lock(); |
4192 | same_cs = (cs == task_cs(current)); |
4193 | rcu_read_unlock(); |
4194 | |
4195 | if (same_cs) |
4196 | return 0; |
4197 | |
4198 | lockdep_assert_held(&cgroup_mutex); |
4199 | mutex_lock(&cpuset_mutex); |
4200 | |
4201 | /* Check to see if task is allowed in the cpuset */ |
4202 | ret = cpuset_can_attach_check(cs); |
4203 | if (ret) |
4204 | goto out_unlock; |
4205 | |
4206 | ret = task_can_attach(p: task); |
4207 | if (ret) |
4208 | goto out_unlock; |
4209 | |
4210 | ret = security_task_setscheduler(p: task); |
4211 | if (ret) |
4212 | goto out_unlock; |
4213 | |
4214 | /* |
	 * Mark that an attach is in progress. This makes validate_change()
	 * fail changes which zero cpus/mems_allowed.
4217 | */ |
4218 | cs->attach_in_progress++; |
4219 | out_unlock: |
4220 | mutex_unlock(lock: &cpuset_mutex); |
4221 | return ret; |
4222 | } |
4223 | |
4224 | static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset) |
4225 | { |
4226 | struct cpuset *cs = css_cs(css: cset->subsys[cpuset_cgrp_id]); |
4227 | bool same_cs; |
4228 | |
4229 | rcu_read_lock(); |
4230 | same_cs = (cs == task_cs(current)); |
4231 | rcu_read_unlock(); |
4232 | |
4233 | if (same_cs) |
4234 | return; |
4235 | |
4236 | mutex_lock(&cpuset_mutex); |
4237 | cs->attach_in_progress--; |
4238 | if (!cs->attach_in_progress) |
4239 | wake_up(&cpuset_attach_wq); |
4240 | mutex_unlock(lock: &cpuset_mutex); |
4241 | } |
4242 | |
4243 | /* |
 * Make sure the new task conforms to the current state of its parent,
4245 | * which could have been changed by cpuset just after it inherits the |
4246 | * state from the parent and before it sits on the cgroup's task list. |
4247 | */ |
4248 | static void cpuset_fork(struct task_struct *task) |
4249 | { |
4250 | struct cpuset *cs; |
4251 | bool same_cs; |
4252 | |
4253 | rcu_read_lock(); |
4254 | cs = task_cs(task); |
4255 | same_cs = (cs == task_cs(current)); |
4256 | rcu_read_unlock(); |
4257 | |
4258 | if (same_cs) { |
4259 | if (cs == &top_cpuset) |
4260 | return; |
4261 | |
4262 | set_cpus_allowed_ptr(p: task, current->cpus_ptr); |
4263 | task->mems_allowed = current->mems_allowed; |
4264 | return; |
4265 | } |
4266 | |
4267 | /* CLONE_INTO_CGROUP */ |
4268 | mutex_lock(&cpuset_mutex); |
4269 | guarantee_online_mems(cs, pmask: &cpuset_attach_nodemask_to); |
4270 | cpuset_attach_task(cs, task); |
4271 | |
4272 | cs->attach_in_progress--; |
4273 | if (!cs->attach_in_progress) |
4274 | wake_up(&cpuset_attach_wq); |
4275 | |
4276 | mutex_unlock(lock: &cpuset_mutex); |
4277 | } |
4278 | |
4279 | struct cgroup_subsys cpuset_cgrp_subsys = { |
4280 | .css_alloc = cpuset_css_alloc, |
4281 | .css_online = cpuset_css_online, |
4282 | .css_offline = cpuset_css_offline, |
4283 | .css_free = cpuset_css_free, |
4284 | .can_attach = cpuset_can_attach, |
4285 | .cancel_attach = cpuset_cancel_attach, |
4286 | .attach = cpuset_attach, |
4287 | .post_attach = cpuset_post_attach, |
4288 | .bind = cpuset_bind, |
4289 | .can_fork = cpuset_can_fork, |
4290 | .cancel_fork = cpuset_cancel_fork, |
4291 | .fork = cpuset_fork, |
4292 | .legacy_cftypes = legacy_files, |
4293 | .dfl_cftypes = dfl_files, |
4294 | .early_init = true, |
4295 | .threaded = true, |
4296 | }; |
4297 | |
4298 | /** |
4299 | * cpuset_init - initialize cpusets at system boot |
4300 | * |
4301 | * Description: Initialize top_cpuset |
4302 | **/ |
4303 | |
4304 | int __init cpuset_init(void) |
4305 | { |
4306 | BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); |
4307 | BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); |
4308 | BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL)); |
4309 | BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL)); |
4310 | BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL)); |
4311 | BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL)); |
4312 | |
4313 | cpumask_setall(dstp: top_cpuset.cpus_allowed); |
4314 | nodes_setall(top_cpuset.mems_allowed); |
4315 | cpumask_setall(dstp: top_cpuset.effective_cpus); |
4316 | cpumask_setall(dstp: top_cpuset.effective_xcpus); |
4317 | cpumask_setall(dstp: top_cpuset.exclusive_cpus); |
4318 | nodes_setall(top_cpuset.effective_mems); |
4319 | |
4320 | fmeter_init(fmp: &top_cpuset.fmeter); |
4321 | set_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &top_cpuset.flags); |
4322 | top_cpuset.relax_domain_level = -1; |
4323 | INIT_LIST_HEAD(list: &remote_children); |
4324 | |
4325 | BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); |
4326 | |
4327 | return 0; |
4328 | } |
4329 | |
4330 | /* |
4331 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
4332 | * or memory nodes, we need to walk over the cpuset hierarchy, |
4333 | * removing that CPU or node from all cpusets. If this removes the |
4334 | * last CPU or node from a cpuset, then move the tasks in the empty |
4335 | * cpuset to its next-highest non-empty parent. |
4336 | */ |
4337 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) |
4338 | { |
4339 | struct cpuset *parent; |
4340 | |
4341 | /* |
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so it can't be empty).
4344 | */ |
4345 | parent = parent_cs(cs); |
4346 | while (cpumask_empty(srcp: parent->cpus_allowed) || |
4347 | nodes_empty(parent->mems_allowed)) |
4348 | parent = parent_cs(cs: parent); |
4349 | |
4350 | if (cgroup_transfer_tasks(to: parent->css.cgroup, from: cs->css.cgroup)) { |
4351 | pr_err("cpuset: failed to transfer tasks out of empty cpuset " ); |
4352 | pr_cont_cgroup_name(cgrp: cs->css.cgroup); |
4353 | pr_cont("\n" ); |
4354 | } |
4355 | } |
4356 | |
4357 | static void |
4358 | hotplug_update_tasks_legacy(struct cpuset *cs, |
4359 | struct cpumask *new_cpus, nodemask_t *new_mems, |
4360 | bool cpus_updated, bool mems_updated) |
4361 | { |
4362 | bool is_empty; |
4363 | |
4364 | spin_lock_irq(lock: &callback_lock); |
4365 | cpumask_copy(dstp: cs->cpus_allowed, srcp: new_cpus); |
4366 | cpumask_copy(dstp: cs->effective_cpus, srcp: new_cpus); |
4367 | cs->mems_allowed = *new_mems; |
4368 | cs->effective_mems = *new_mems; |
4369 | spin_unlock_irq(lock: &callback_lock); |
4370 | |
4371 | /* |
4372 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, |
4373 | * as the tasks will be migrated to an ancestor. |
4374 | */ |
4375 | if (cpus_updated && !cpumask_empty(srcp: cs->cpus_allowed)) |
4376 | update_tasks_cpumask(cs, new_cpus); |
4377 | if (mems_updated && !nodes_empty(cs->mems_allowed)) |
4378 | update_tasks_nodemask(cs); |
4379 | |
4380 | is_empty = cpumask_empty(srcp: cs->cpus_allowed) || |
4381 | nodes_empty(cs->mems_allowed); |
4382 | |
4383 | /* |
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation which will also call back into
	 * cpuset, so it should be done outside any lock.
4387 | */ |
4388 | if (is_empty) { |
4389 | mutex_unlock(lock: &cpuset_mutex); |
4390 | remove_tasks_in_empty_cpuset(cs); |
4391 | mutex_lock(&cpuset_mutex); |
4392 | } |
4393 | } |
4394 | |
4395 | static void |
4396 | hotplug_update_tasks(struct cpuset *cs, |
4397 | struct cpumask *new_cpus, nodemask_t *new_mems, |
4398 | bool cpus_updated, bool mems_updated) |
4399 | { |
4400 | /* A partition root is allowed to have empty effective cpus */ |
4401 | if (cpumask_empty(srcp: new_cpus) && !is_partition_valid(cs)) |
4402 | cpumask_copy(dstp: new_cpus, srcp: parent_cs(cs)->effective_cpus); |
4403 | if (nodes_empty(*new_mems)) |
4404 | *new_mems = parent_cs(cs)->effective_mems; |
4405 | |
4406 | spin_lock_irq(lock: &callback_lock); |
4407 | cpumask_copy(dstp: cs->effective_cpus, srcp: new_cpus); |
4408 | cs->effective_mems = *new_mems; |
4409 | spin_unlock_irq(lock: &callback_lock); |
4410 | |
4411 | if (cpus_updated) |
4412 | update_tasks_cpumask(cs, new_cpus); |
4413 | if (mems_updated) |
4414 | update_tasks_nodemask(cs); |
4415 | } |
4416 | |
4417 | static bool force_rebuild; |
4418 | |
4419 | void cpuset_force_rebuild(void) |
4420 | { |
4421 | force_rebuild = true; |
4422 | } |
4423 | |
4424 | /* |
4425 | * Attempt to acquire a cpus_read_lock while a hotplug operation may be in |
4426 | * progress. |
4427 | * Return: true if successful, false otherwise |
4428 | * |
4429 | * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, |
4430 | * cpus_read_trylock() is used here to acquire the lock. |
4431 | */ |
4432 | static bool cpuset_hotplug_cpus_read_trylock(void) |
4433 | { |
4434 | int retries = 0; |
4435 | |
4436 | while (!cpus_read_trylock()) { |
4437 | /* |
4438 | * CPU hotplug still in progress. Retry 5 times |
4439 | * with a 10ms wait before bailing out. |
4440 | */ |
4441 | if (++retries > 5) |
4442 | return false; |
4443 | msleep(msecs: 10); |
4444 | } |
4445 | return true; |
4446 | } |
4447 | |
4448 | /** |
4449 | * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug |
 * @cs: cpuset of interest
4451 | * @tmp: the tmpmasks structure pointer |
4452 | * |
4453 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone |
4454 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, |
4455 | * all its tasks are moved to the nearest ancestor with both resources. |
4456 | */ |
4457 | static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) |
4458 | { |
4459 | static cpumask_t new_cpus; |
4460 | static nodemask_t new_mems; |
4461 | bool cpus_updated; |
4462 | bool mems_updated; |
4463 | bool remote; |
4464 | int partcmd = -1; |
4465 | struct cpuset *parent; |
4466 | retry: |
4467 | wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); |
4468 | |
4469 | mutex_lock(&cpuset_mutex); |
4470 | |
4471 | /* |
4472 | * We have raced with task attaching. We wait until attaching |
4473 | * is finished, so we won't attach a task to an empty cpuset. |
4474 | */ |
4475 | if (cs->attach_in_progress) { |
4476 | mutex_unlock(lock: &cpuset_mutex); |
4477 | goto retry; |
4478 | } |
4479 | |
4480 | parent = parent_cs(cs); |
4481 | compute_effective_cpumask(new_cpus: &new_cpus, cs, parent); |
4482 | nodes_and(new_mems, cs->mems_allowed, parent->effective_mems); |
4483 | |
4484 | if (!tmp || !cs->partition_root_state) |
4485 | goto update_tasks; |
4486 | |
4487 | /* |
4488 | * Compute effective_cpus for valid partition root, may invalidate |
4489 | * child partition roots if necessary. |
4490 | */ |
4491 | remote = is_remote_partition(cs); |
4492 | if (remote || (is_partition_valid(cs) && is_partition_valid(cs: parent))) |
4493 | compute_partition_effective_cpumask(cs, new_ecpus: &new_cpus); |
4494 | |
4495 | if (remote && cpumask_empty(srcp: &new_cpus) && |
4496 | partition_is_populated(cs, NULL) && |
4497 | cpuset_hotplug_cpus_read_trylock()) { |
4498 | remote_partition_disable(cs, tmp); |
4499 | compute_effective_cpumask(new_cpus: &new_cpus, cs, parent); |
4500 | remote = false; |
4501 | cpuset_force_rebuild(); |
4502 | cpus_read_unlock(); |
4503 | } |
4504 | |
4505 | /* |
4506 | * Force the partition to become invalid if either one of |
4507 | * the following conditions hold: |
4508 | * 1) empty effective cpus but not valid empty partition. |
4509 | * 2) parent is invalid or doesn't grant any cpus to child |
4510 | * partitions. |
4511 | */ |
4512 | if (is_local_partition(cs) && (!is_partition_valid(cs: parent) || |
4513 | tasks_nocpu_error(parent, cs, xcpus: &new_cpus))) |
4514 | partcmd = partcmd_invalidate; |
4515 | /* |
4516 | * On the other hand, an invalid partition root may be transitioned |
4517 | * back to a regular one. |
4518 | */ |
4519 | else if (is_partition_valid(cs: parent) && is_partition_invalid(cs)) |
4520 | partcmd = partcmd_update; |
4521 | |
4522 | /* |
4523 | * cpus_read_lock needs to be held before calling |
4524 | * update_parent_effective_cpumask(). To avoid circular lock |
4525 | * dependency between cpuset_mutex and cpus_read_lock, |
4526 | * cpus_read_trylock() is used here to acquire the lock. |
4527 | */ |
4528 | if (partcmd >= 0) { |
4529 | if (!cpuset_hotplug_cpus_read_trylock()) |
4530 | goto update_tasks; |
4531 | |
4532 | update_parent_effective_cpumask(cs, cmd: partcmd, NULL, tmp); |
4533 | cpus_read_unlock(); |
4534 | if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { |
4535 | compute_partition_effective_cpumask(cs, new_ecpus: &new_cpus); |
4536 | cpuset_force_rebuild(); |
4537 | } |
4538 | } |
4539 | |
4540 | update_tasks: |
4541 | cpus_updated = !cpumask_equal(src1p: &new_cpus, src2p: cs->effective_cpus); |
4542 | mems_updated = !nodes_equal(new_mems, cs->effective_mems); |
4543 | if (!cpus_updated && !mems_updated) |
4544 | goto unlock; /* Hotplug doesn't affect this cpuset */ |
4545 | |
4546 | if (mems_updated) |
4547 | check_insane_mems_config(nodes: &new_mems); |
4548 | |
4549 | if (is_in_v2_mode()) |
4550 | hotplug_update_tasks(cs, new_cpus: &new_cpus, new_mems: &new_mems, |
4551 | cpus_updated, mems_updated); |
4552 | else |
4553 | hotplug_update_tasks_legacy(cs, new_cpus: &new_cpus, new_mems: &new_mems, |
4554 | cpus_updated, mems_updated); |
4555 | |
4556 | unlock: |
4557 | mutex_unlock(lock: &cpuset_mutex); |
4558 | } |
4559 | |
4560 | /** |
4561 | * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset |
4562 | * @work: unused |
4563 | * |
4564 | * This function is called after either CPU or memory configuration has |
4565 | * changed and updates cpuset accordingly. The top_cpuset is always |
4566 | * synchronized to cpu_active_mask and N_MEMORY, which is necessary in |
 * order to make cpusets transparent (of no effect) on systems that are
4568 | * actively using CPU hotplug but making no active use of cpusets. |
4569 | * |
4570 | * Non-root cpusets are only affected by offlining. If any CPUs or memory |
4571 | * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on |
4572 | * all descendants. |
4573 | * |
4574 | * Note that CPU offlining during suspend is ignored. We don't modify |
4575 | * cpusets across suspend/resume cycles at all. |
4576 | */ |
4577 | static void cpuset_hotplug_workfn(struct work_struct *work) |
4578 | { |
4579 | static cpumask_t new_cpus; |
4580 | static nodemask_t new_mems; |
4581 | bool cpus_updated, mems_updated; |
4582 | bool on_dfl = is_in_v2_mode(); |
4583 | struct tmpmasks tmp, *ptmp = NULL; |
4584 | |
4585 | if (on_dfl && !alloc_cpumasks(NULL, tmp: &tmp)) |
4586 | ptmp = &tmp; |
4587 | |
4588 | mutex_lock(&cpuset_mutex); |
4589 | |
4590 | /* fetch the available cpus/mems and find out which changed how */ |
4591 | cpumask_copy(dstp: &new_cpus, cpu_active_mask); |
4592 | new_mems = node_states[N_MEMORY]; |
4593 | |
4594 | /* |
4595 | * If subpartitions_cpus is populated, it is likely that the check |
4596 | * below will produce a false positive on cpus_updated when the cpu |
4597 | * list isn't changed. It is extra work, but it is better to be safe. |
4598 | */ |
4599 | cpus_updated = !cpumask_equal(src1p: top_cpuset.effective_cpus, src2p: &new_cpus) || |
4600 | !cpumask_empty(srcp: subpartitions_cpus); |
4601 | mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems); |
4602 | |
4603 | /* |
4604 | * In the rare case that hotplug removes all the cpus in |
	 * subpartitions_cpus, we assume that cpus were updated.
4606 | */ |
4607 | if (!cpus_updated && top_cpuset.nr_subparts) |
4608 | cpus_updated = true; |
4609 | |
4610 | /* For v1, synchronize cpus_allowed to cpu_active_mask */ |
4611 | if (cpus_updated) { |
4612 | spin_lock_irq(lock: &callback_lock); |
4613 | if (!on_dfl) |
4614 | cpumask_copy(dstp: top_cpuset.cpus_allowed, srcp: &new_cpus); |
4615 | /* |
4616 | * Make sure that CPUs allocated to child partitions |
4617 | * do not show up in effective_cpus. If no CPU is left, |
4618 | * we clear the subpartitions_cpus & let the child partitions |
4619 | * fight for the CPUs again. |
4620 | */ |
4621 | if (!cpumask_empty(srcp: subpartitions_cpus)) { |
4622 | if (cpumask_subset(src1p: &new_cpus, src2p: subpartitions_cpus)) { |
4623 | top_cpuset.nr_subparts = 0; |
4624 | cpumask_clear(dstp: subpartitions_cpus); |
4625 | } else { |
4626 | cpumask_andnot(dstp: &new_cpus, src1p: &new_cpus, |
4627 | src2p: subpartitions_cpus); |
4628 | } |
4629 | } |
4630 | cpumask_copy(dstp: top_cpuset.effective_cpus, srcp: &new_cpus); |
4631 | spin_unlock_irq(lock: &callback_lock); |
4632 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
4633 | } |
4634 | |
4635 | /* synchronize mems_allowed to N_MEMORY */ |
4636 | if (mems_updated) { |
4637 | spin_lock_irq(lock: &callback_lock); |
4638 | if (!on_dfl) |
4639 | top_cpuset.mems_allowed = new_mems; |
4640 | top_cpuset.effective_mems = new_mems; |
4641 | spin_unlock_irq(lock: &callback_lock); |
4642 | update_tasks_nodemask(cs: &top_cpuset); |
4643 | } |
4644 | |
4645 | mutex_unlock(lock: &cpuset_mutex); |
4646 | |
4647 | /* if cpus or mems changed, we need to propagate to descendants */ |
4648 | if (cpus_updated || mems_updated) { |
4649 | struct cpuset *cs; |
4650 | struct cgroup_subsys_state *pos_css; |
4651 | |
4652 | rcu_read_lock(); |
4653 | cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { |
4654 | if (cs == &top_cpuset || !css_tryget_online(css: &cs->css)) |
4655 | continue; |
4656 | rcu_read_unlock(); |
4657 | |
4658 | cpuset_hotplug_update_tasks(cs, tmp: ptmp); |
4659 | |
4660 | rcu_read_lock(); |
4661 | css_put(css: &cs->css); |
4662 | } |
4663 | rcu_read_unlock(); |
4664 | } |
4665 | |
4666 | /* rebuild sched domains if cpus_allowed has changed */ |
4667 | if (cpus_updated || force_rebuild) { |
4668 | force_rebuild = false; |
4669 | rebuild_sched_domains(); |
4670 | } |
4671 | |
4672 | free_cpumasks(NULL, tmp: ptmp); |
4673 | } |
4674 | |
4675 | void cpuset_update_active_cpus(void) |
4676 | { |
4677 | /* |
4678 | * We're inside cpu hotplug critical region which usually nests |
4679 | * inside cgroup synchronization. Bounce actual hotplug processing |
4680 | * to a work item to avoid reverse locking order. |
4681 | */ |
4682 | schedule_work(work: &cpuset_hotplug_work); |
4683 | } |
4684 | |
4685 | void cpuset_wait_for_hotplug(void) |
4686 | { |
4687 | flush_work(work: &cpuset_hotplug_work); |
4688 | } |
4689 | |
4690 | /* |
4691 | * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. |
4692 | * Call this routine anytime after node_states[N_MEMORY] changes. |
4693 | * See cpuset_update_active_cpus() for CPU hotplug handling. |
4694 | */ |
4695 | static int cpuset_track_online_nodes(struct notifier_block *self, |
4696 | unsigned long action, void *arg) |
4697 | { |
4698 | schedule_work(work: &cpuset_hotplug_work); |
4699 | return NOTIFY_OK; |
4700 | } |
4701 | |
4702 | /** |
4703 | * cpuset_init_smp - initialize cpus_allowed |
4704 | * |
4705 | * Description: Finish top cpuset after cpu, node maps are initialized |
4706 | */ |
4707 | void __init cpuset_init_smp(void) |
4708 | { |
4709 | /* |
	 * cpus_allowed/mems_allowed set to v2 values in the initial
4711 | * cpuset_bind() call will be reset to v1 values in another |
4712 | * cpuset_bind() call when v1 cpuset is mounted. |
4713 | */ |
4714 | top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; |
4715 | |
4716 | cpumask_copy(dstp: top_cpuset.effective_cpus, cpu_active_mask); |
4717 | top_cpuset.effective_mems = node_states[N_MEMORY]; |
4718 | |
4719 | hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); |
4720 | |
4721 | cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm" , 0); |
4722 | BUG_ON(!cpuset_migrate_mm_wq); |
4723 | } |
4724 | |
4725 | /** |
 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
4727 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. |
4728 | * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. |
4729 | * |
4730 | * Description: Returns the cpumask_var_t cpus_allowed of the cpuset |
4731 | * attached to the specified @tsk. Guaranteed to return some non-empty |
4732 | * subset of cpu_online_mask, even if this means going outside the |
4733 | * tasks cpuset, except when the task is in the top cpuset. |
4734 | **/ |
4735 | |
4736 | void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
4737 | { |
4738 | unsigned long flags; |
4739 | struct cpuset *cs; |
4740 | |
4741 | spin_lock_irqsave(&callback_lock, flags); |
4742 | rcu_read_lock(); |
4743 | |
4744 | cs = task_cs(task: tsk); |
4745 | if (cs != &top_cpuset) |
4746 | guarantee_online_cpus(tsk, pmask); |
	/*
	 * Tasks in the top cpuset won't get their cpumasks updated when a
	 * hotplug online/offline event happens. So we include all offline
	 * cpus in the allowed cpu list.
	 */
	if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
		const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);

		/*
		 * We first exclude cpus allocated to partitions. If there is no
		 * allowable online cpu left, we fall back to all possible cpus.
		 */
		cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
		if (!cpumask_intersects(pmask, cpu_online_mask))
			cpumask_copy(pmask, possible_mask);
	}

	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);
}
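
/*
 * A hedged caller sketch for cpuset_cpus_allowed() (hypothetical, not part
 * of this file): the caller owns the cpumask, and "tsk" and "mask" below
 * are placeholder names.
 *
 *	cpumask_var_t mask;
 *
 *	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 *		return -ENOMEM;
 *	cpuset_cpus_allowed(tsk, mask);
 *	// mask is now a non-empty subset of cpu_online_mask (or the wider
 *	// possible mask when tsk sits in the top cpuset)
 *	free_cpumask_var(mask);
 */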

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 *
 * Returns true if the affinity of @tsk was changed, false otherwise.
 **/

bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	const struct cpumask *cs_mask;
	bool changed = false;

	rcu_read_lock();
	cs_mask = task_cs(tsk)->cpus_allowed;
	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
		do_set_cpus_allowed(tsk, cs_mask);
		changed = true;
	}
	rcu_read_unlock();
	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed. However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
	return changed;
}

void __init cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &mask);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);

	return mask;
}
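
/*
 * A hypothetical usage sketch (not part of this file): the returned mask
 * can be used to constrain a node search, e.g.
 *
 *	nodemask_t allowed = cpuset_mems_allowed(tsk);
 *	int nid = first_node(allowed);	// lowest node the task may use
 */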

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset. Call holding
 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}

/*
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate. If @node is set in
 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes. If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock. The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags. That logic and the checks below have the combined
 * effect that:
 * in_interrupt - any node ok (current task context irrelevant)
 * GFP_ATOMIC - any node ok
 * tsk_is_oom_victim - any node ok
 * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
 * GFP_USER - only nodes in current tasks mems allowed ok.
 */
bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;		/* current cpuset ancestors */
	bool allowed;			/* is allocation in zone z allowed? */
	unsigned long flags;

	if (in_interrupt())
		return true;
	if (node_isset(node, current->mems_allowed))
		return true;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(tsk_is_oom_victim(current)))
		return true;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return false;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return true;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	spin_lock_irqsave(&callback_lock, flags);

	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, flags);
	return allowed;
}
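
/*
 * A hedged illustration of the rules above (hypothetical caller, not part
 * of this file; "nid" is a placeholder): a hardwalled (GFP_USER-style)
 * request is confined to current->mems_allowed, while a GFP_KERNEL request
 * may also use nodes of the nearest hardwalled ancestor cpuset.
 *
 *	if (cpuset_node_allowed(nid, GFP_USER))
 *		// nid is in current->mems_allowed (or we are in interrupt,
 *		// an OOM victim, or exiting)
 *	if (cpuset_node_allowed(nid, GFP_KERNEL))
 *		// nid may additionally be any node allowed by the nearest
 *		// hardwalled ancestor cpuset
 */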

/**
 * cpuset_spread_node() - On which node to begin search for a page
 * @rotor: round robin rotor
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the tasks mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online. So it
 * should not be possible for the following code to return an
 * offline node. But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start. The zonelist passed to
 * __alloc_pages() will include all nodes. If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */
static int cpuset_spread_node(int *rotor)
{
	return *rotor = next_node_in(*rotor, current->mems_allowed);
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 */
int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

/**
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 */
int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
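
/*
 * A hypothetical sketch of how a page-cache allocation might use the
 * spread rotor (not part of this file; real callers typically check
 * cpuset_do_page_mem_spread() first; "nid" and "page" are placeholders):
 *
 *	int nid = cpuset_mem_spread_node();
 *	struct page *page = __alloc_pages_node(nid, GFP_HIGHUSER, 0);
 */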

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2. Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
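
/*
 * A hypothetical OOM-killer style check (not part of this file;
 * "candidate" is a placeholder task): a candidate whose mems_allowed does
 * not intersect ours cannot free memory on nodes we may use, so it can be
 * skipped:
 *
 *	if (!cpuset_mems_allowed_intersects(current, candidate))
 *		continue;	// killing candidate would not help us
 */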

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_current_mems_allowed(void)
{
	struct cgroup *cgrp;

	rcu_read_lock();

	cgrp = task_cs(current)->css.cgroup;
	pr_cont(",cpuset=");
	pr_cont_cgroup_name(cgrp);
	pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));

	rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure". Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}
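
/*
 * Callers normally go through the cpuset_memory_pressure_bump() wrapper
 * declared in <linux/cpuset.h>, which skips the bump while the root
 * cpuset's "memory_pressure_enabled" file is 0. A sketch of a hypothetical
 * direct-reclaim call site (not part of this file; variable names are
 * placeholders):
 *
 *	cpuset_memory_pressure_bump();	// no-op unless collection is enabled
 *	nr_reclaimed = try_to_free_pages(zonelist, order, gfp_mask, NULL);
 */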

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	css = task_get_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
				current->nsproxy->cgroup_ns);
	css_put(css);
	if (retval == -E2BIG)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;
	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
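
/*
 * Illustrative output (hypothetical cgroup names): a task attached to the
 * cpuset cgroup "rt" nested under "system" would read back as
 *
 *	$ cat /proc/self/cpuset
 *	/system/rt
 */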
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}

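/*
 * Illustrative /proc/<pid>/status lines produced by the helper above,
 * assuming a hypothetical system with both nodes 0 and 1 allowed (the
 * printed bitmap width depends on MAX_NUMNODES):
 *
 *	Mems_allowed:	00000000,00000003
 *	Mems_allowed_list:	0-1
 */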