/*
 * kernel/cpuset.c
 *
 * Processor and Memory placement constraints for sets of tasks.
 *
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
 * Copyright (C) 2006 Google, Inc
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * 2006 Rework by Paul Menage to use generic cgroups
 * 2008 Rework of the scheduler domains and CPU hotplug handling
 *      by Max Krasnyansky
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * There could be abnormal cpuset configurations for cpu or memory
 * node binding; this key provides a quick, low-cost check for such
 * situations.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time64_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

/*
 * Invalid partition error codes
 */
enum prs_errcode {
	PERR_NONE = 0,
	PERR_INVCPUS,
	PERR_INVPARENT,
	PERR_NOTPART,
	PERR_NOTEXCL,
	PERR_NOCPUS,
	PERR_HOTPLUG,
	PERR_CPUSEMPTY,
	PERR_HKEEPING,
};

static const char * const perr_strings[] = {
	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus.exclusive",
	[PERR_INVPARENT] = "Parent is an invalid partition root",
	[PERR_NOTPART]   = "Parent is not a partition root",
	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
};
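
/*
 * When a partition turns invalid, the matching string above is reported in
 * parentheses on a read of cpuset.cpus.partition. A sketch of what such a
 * read might return (the cgroup path is illustrative only):
 *
 *	# cat /sys/fs/cgroup/A/cpuset.cpus.partition
 *	root invalid (Parent is not a partition root)
 */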

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset. They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
	 *
	 * These exclusive CPUs must be a subset of cpus_allowed. A parent
	 * cgroup can only grant exclusive CPUs to one of its children.
	 *
	 * When the cgroup becomes a valid partition root, effective_xcpus
	 * defaults to cpus_allowed if not set. The effective_cpus of a valid
	 * partition root comes solely from its effective_xcpus, and some of
	 * the effective_xcpus may be distributed to sub-partitions below and
	 * hence excluded from its effective_cpus.
	 */
	cpumask_var_t effective_xcpus;

	/*
	 * Exclusive CPUs as requested by the user (default hierarchy only)
	 */
	cpumask_var_t exclusive_cpus;

	/*
	 * These are the old Memory Nodes that tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemasks updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset. Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* number of valid sub-partitions */
	int nr_subparts;

	/* partition root state */
	int partition_root_state;

	/*
	 * Default hierarchy only:
	 * use_parent_ecpus  - set if using parent's effective_cpus
	 * child_ecpus_count - # of children with use_parent_ecpus set
	 */
	int use_parent_ecpus;
	int child_ecpus_count;

	/*
	 * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
	 * know when to rebuild associated root domain bandwidth information.
	 */
	int nr_deadline_tasks;
	int nr_migrate_dl_tasks;
	u64 sum_migrate_dl_bw;

	/* Invalid partition error code, not lock protected */
	enum prs_errcode prs_err;

	/* Handle for cpuset.cpus.partition */
	struct cgroup_file partition_file;

	/* Remote partition sibling list anchored at remote_children */
	struct list_head remote_sibling;
};

/*
 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
 */
static cpumask_var_t	subpartitions_cpus;

/*
 * Exclusive CPUs in isolated partitions
 */
static cpumask_var_t	isolated_cpus;

/* List of remote partition root children */
static struct list_head remote_children;

/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 */
#define PRS_MEMBER		0
#define PRS_ROOT		1
#define PRS_ISOLATED		2
#define PRS_INVALID_ROOT	-1
#define PRS_INVALID_ISOLATED	-2

static inline bool is_prs_invalid(int prs_state)
{
	return prs_state < 0;
}
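
/*
 * An invalid state is encoded as the negation of the corresponding valid
 * state (see make_partition_invalid() below). For example, an isolated
 * partition that loses all its CPUs goes PRS_ISOLATED (2) ->
 * PRS_INVALID_ISOLATED (-2), so recovering the original valid state later
 * is a simple negation as well.
 */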

/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid memory allocation in inner functions.
 */
struct tmpmasks {
	cpumask_var_t addmask, delmask;	/* For partition root */
	cpumask_var_t new_cpus;		/* For update_cpumasks_hier() */
};

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct cpuset, css) : NULL;
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return css_cs(task_css(task, cpuset_cgrp_id));
}

static inline struct cpuset *parent_cs(struct cpuset *cs)
{
	return css_cs(cs->css.parent);
}

void inc_dl_tasks_cs(struct task_struct *p)
{
	struct cpuset *cs = task_cs(p);

	cs->nr_deadline_tasks++;
}

void dec_dl_tasks_cs(struct task_struct *p)
{
	struct cpuset *cs = task_cs(p);

	cs->nr_deadline_tasks--;
}

/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

static inline int is_partition_valid(const struct cpuset *cs)
{
	return cs->partition_root_state > 0;
}

static inline int is_partition_invalid(const struct cpuset *cs)
{
	return cs->partition_root_state < 0;
}

/*
 * Callers should hold callback_lock to modify partition_root_state.
 */
static inline void make_partition_invalid(struct cpuset *cs)
{
	if (cs->partition_root_state > 0)
		cs->partition_root_state = -cs->partition_root_state;
}

/*
 * Send a notification event whenever partition_root_state changes.
 */
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
	if (old_prs == cs->partition_root_state)
		return;
	cgroup_file_notify(&cs->partition_file);

	/* Reset prs_err if not invalid */
	if (is_partition_valid(cs))
		WRITE_ONCE(cs->prs_err, PERR_NONE);
}

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
	.partition_root_state = PRS_ROOT,
	.remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling),
};

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree.  @root_cs is included in
 * the iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)	\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
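
/*
 * A minimal usage sketch of the iterator above, mirroring the pattern used
 * by functions such as update_domain_attr_tree() later in this file
 * (some_condition() is a placeholder, not a real helper):
 *
 *	struct cpuset *cp;
 *	struct cgroup_subsys_state *pos_css;
 *
 *	rcu_read_lock();
 *	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 *		if (!some_condition(cp)) {
 *			pos_css = css_rightmost_descendant(pos_css);
 *			continue;	(skips @cp's whole subtree)
 *		}
 *		... inspect cp ...
 *	}
 *	rcu_read_unlock();
 */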

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock.  We also require taking task_lock() when dereferencing a
 * task's cpuset pointer.  See "The task_lock() exception", at the end of this
 * comment.  The cpuset code uses only cpuset_mutex.  Other kernel subsystems
 * can use cpuset_lock()/cpuset_unlock() to prevent changes to cpuset
 * structures.  Note that cpuset_mutex needs to be a mutex as it is used in
 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
 * correctness.
 *
 * A task must hold both locks to modify cpusets.  If a task holds
 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
 * also acquire callback_lock and be able to modify cpusets.  It can perform
 * various checks on the cpuset structure first, knowing nothing will change.
 * It can also allocate memory while just holding cpuset_mutex.  While it is
 * performing these checks, various callback routines can briefly acquire
 * callback_lock to query cpusets.  Once it is ready to make the changes, it
 * takes callback_lock, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task, so we use alloc_lock in the task_struct to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */
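
/*
 * A sketch of the resulting nesting for a writer (illustrative only; see
 * e.g. remote_partition_enable() below for the real pattern):
 *
 *	mutex_lock(&cpuset_mutex);
 *	... validate the change, allocate memory ...
 *	spin_lock_irq(&callback_lock);
 *	... publish the new masks/state ...
 *	spin_unlock_irq(&callback_lock);
 *	mutex_unlock(&cpuset_mutex);
 */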

static DEFINE_MUTEX(cpuset_mutex);

void cpuset_lock(void)
{
	mutex_lock(&cpuset_mutex);
}

void cpuset_unlock(void)
{
	mutex_unlock(&cpuset_mutex);
}

static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

/*
 * CPU / memory hotplug is handled asynchronously.
 */
static void cpuset_hotplug_workfn(struct work_struct *work);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

static inline void check_insane_mems_config(nodemask_t *nodes)
{
	if (!cpusets_insane_config() &&
	    movable_only_nodes(nodes)) {
		static_branch_enable(&cpusets_insane_config_key);
		pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
			"Cpuset allocations might fail even with a lot of memory available.\n",
			nodemask_pr_args(nodes));
	}
}

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events. Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{
	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
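
/*
 * For reference, a v1 mount that turns this mode on would look roughly
 * like the following (mount point is illustrative):
 *
 *	mount -t cgroup -o cpuset,cpuset_v2_mode cpuset /sys/fs/cgroup/cpuset
 */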

/**
 * partition_is_populated - check if partition has tasks
 * @cs: partition root to be checked
 * @excluded_child: a child cpuset to be excluded in task checking
 * Return: true if there are tasks, false otherwise
 *
 * It is assumed that @cs is a valid partition root. @excluded_child should
 * be non-NULL when this cpuset is going to become a partition itself.
 */
static inline bool partition_is_populated(struct cpuset *cs,
					  struct cpuset *excluded_child)
{
	struct cgroup_subsys_state *css;
	struct cpuset *child;

	if (cs->css.cgroup->nr_populated_csets)
		return true;
	if (!excluded_child && !cs->nr_subparts)
		return cgroup_is_populated(cs->css.cgroup);

	rcu_read_lock();
	cpuset_for_each_child(child, css, cs) {
		if (child == excluded_child)
			continue;
		if (is_partition_valid(child))
			continue;
		if (cgroup_is_populated(child->css.cgroup)) {
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
	return false;
}

/*
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
 * are online and are capable of running the task.  If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	struct cpuset *cs;

	if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
		cpumask_copy(pmask, cpu_online_mask);

	rcu_read_lock();
	cs = task_cs(tsk);

	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
		cs = parent_cs(cs);
		if (unlikely(!cs)) {
			/*
			 * The top cpuset doesn't have any online cpu as a
			 * consequence of a race between cpuset_hotplug_work
			 * and cpu hotplug notifier.  But we know the top
			 * cpuset's effective_cpus is on its way to be
			 * identical to cpu_online_mask.
			 */
			goto out_unlock;
		}
	}
	cpumask_and(pmask, pmask, cs->effective_cpus);

out_unlock:
	rcu_read_unlock();
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}

/*
 * update task's spread flags if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
static void cpuset_update_task_spread_flags(struct cpuset *cs,
					    struct task_struct *tsk)
{
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		return;

	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);

	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/**
 * alloc_cpumasks - allocate up to four cpumasks for a cpuset
 * @cs: the cpuset whose cpumasks are to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4;

	if (cs) {
		pmask1 = &cs->cpus_allowed;
		pmask2 = &cs->effective_cpus;
		pmask3 = &cs->effective_xcpus;
		pmask4 = &cs->exclusive_cpus;
	} else {
		pmask1 = &tmp->new_cpus;
		pmask2 = &tmp->addmask;
		pmask3 = &tmp->delmask;
		pmask4 = NULL;
	}

	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
		return -ENOMEM;

	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
		goto free_one;

	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
		goto free_two;

	if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL))
		goto free_three;

	return 0;

free_three:
	free_cpumask_var(*pmask3);
free_two:
	free_cpumask_var(*pmask2);
free_one:
	free_cpumask_var(*pmask1);
	return -ENOMEM;
}

/**
 * free_cpumasks - free cpumasks in a cpuset or tmpmasks structure
 * @cs: the cpuset whose cpumasks are to be freed.
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	if (cs) {
		free_cpumask_var(cs->cpus_allowed);
		free_cpumask_var(cs->effective_cpus);
		free_cpumask_var(cs->effective_xcpus);
		free_cpumask_var(cs->exclusive_cpus);
	}
	if (tmp) {
		free_cpumask_var(tmp->new_cpus);
		free_cpumask_var(tmp->addmask);
		free_cpumask_var(tmp->delmask);
	}
}

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (alloc_cpumasks(trial, NULL)) {
		kfree(trial);
		return NULL;
	}

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
	cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
	return trial;
}

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{
	free_cpumasks(cs, NULL);
	kfree(cs);
}
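
/*
 * A sketch of the intended trial pattern, as used by a hypothetical caller
 * (the real users are the update_* functions later in this file):
 *
 *	struct cpuset *trial = alloc_trial_cpuset(cs);
 *
 *	if (!trial)
 *		return -ENOMEM;
 *	... mutate trial->cpus_allowed or trial->mems_allowed ...
 *	err = validate_change(cs, trial);
 *	if (!err)
 *		... commit trial's values back to cs under callback_lock ...
 *	free_cpuset(trial);
 */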

static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
{
	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
	       cpumask_empty(cs->effective_xcpus) ? cs->cpus_allowed
						  : cs->effective_xcpus;
}

/*
 * cpusets_are_exclusive() - check if two cpusets are exclusive
 *
 * Return true if exclusive, false if not
 */
static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
{
	struct cpumask *xcpus1 = fetch_xcpus(cs1);
	struct cpumask *xcpus2 = fetch_xcpus(cs2);

	if (cpumask_intersects(xcpus1, xcpus2))
		return false;
	return true;
}

/*
 * validate_change_legacy() - Validate conditions specific to legacy (v1)
 *			      behavior.
 */
static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	par = parent_cs(cur);
	if (par && !is_cpuset_subset(trial, par))
		goto out;

	ret = 0;
out:
	return ret;
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret = 0;

	rcu_read_lock();

	if (!is_in_v2_mode())
		ret = validate_change_legacy(cur, trial);
	if (ret)
		goto out;

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap.
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur) {
			if (!cpusets_are_exclusive(trial, c))
				goto out;
		}
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}

/* Must be called with cpuset_mutex held.  */
static inline int nr_cpusets(void)
{
	/* jump label reference count + the top-level cpuset */
	return static_key_count(&cpusets_enabled_key.key) + 1;
}

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to the
 * kernel/sched/core.c routine partition_sched_domains(), which will
 * rebuild the scheduler's load balancing domains (sched domains) as
 * specified by that partial partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *	cp - cpuset pointer, used (together with pos_css) to perform a
 *	     top-down scan of all cpusets.  For our purposes, rebuilding
 *	     the schedulers sched domains, we can ignore !is_sched_load_
 *	     balance cpusets.
 *	csa - (for CpuSet Array) Array of pointers to all the cpusets
 *	      that need to be load balanced, for convenient iterative
 *	      access by the subsequent code that finds the best partition,
 *	      i.e the set of domains (subsets) of CPUs such that the
 *	      cpus_allowed of every cpuset marked is_sched_load_balance
 *	      is a subset of one of these domains, while there are as
 *	      many such domains as possible, each as small as possible.
 *	doms - Conversion of 'csa' to an array of cpumasks, for passing to
 *	       the kernel/sched/core.c routine partition_sched_domains() in a
 *	       convenient format, that can be easily compared to the prior
 *	       value to determine what partition elements (sched domains)
 *	       were changed (added or removed.)
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed but don't have the same 'pn' partition number,
 *	and merges them into the same partition.  It keeps looping
 *	on the 'restart' label until it can no longer find any
 *	such pairs.
 *
 *	The union of the cpus_allowed masks from the set of all
 *	cpusets having the same 'pn' value then forms one element
 *	of the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
			struct sched_domain_attr **attributes)
{
	struct cpuset *cp;	/* top-down scan of cpusets */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms = 0;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] struct cpumask slot */
	struct cgroup_subsys_state *pos_css;
	bool root_load_balance = is_sched_load_balance(&top_cpuset);

	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (root_load_balance && !top_cpuset.nr_subparts) {
		ndoms = 1;
		doms = alloc_sched_domains(ndoms);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		cpumask_and(doms[0], top_cpuset.effective_cpus,
			    housekeeping_cpumask(HK_TYPE_DOMAIN));

		goto done;
	}

	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	rcu_read_lock();
	if (root_load_balance)
		csa[csn++] = &top_cpuset;
	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
		if (cp == &top_cpuset)
			continue;
		/*
		 * Continue traversing beyond @cp iff @cp has some CPUs and
		 * isn't load balancing.  The former is obvious.  The
		 * latter: All child cpusets contain a subset of the
		 * parent's cpus, so just skip them, and then we call
		 * update_domain_attr_tree() to calc relax_domain_level of
		 * the corresponding sched domain.
		 *
		 * If root is load-balancing, we can skip @cp if it
		 * is a subset of the root's effective_cpus.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    !(is_sched_load_balance(cp) &&
		      cpumask_intersects(cp->cpus_allowed,
					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
			continue;

		if (root_load_balance &&
		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
			continue;

		if (is_sched_load_balance(cp) &&
		    !cpumask_empty(cp->effective_cpus))
			csa[csn++] = cp;

		/* skip @cp's subtree if not a partition root */
		if (!is_partition_valid(cp))
			pos_css = css_rightmost_descendant(pos_css);
	}
	rcu_read_unlock();

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = alloc_sched_domains(ndoms);
	if (!doms)
		goto done;

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * dattr==NULL case. No need to abort if alloc fails.
	 */
	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
			      GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		struct cpumask *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms[nslot];

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpumask_clear(dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpumask_or(dp, dp, b->effective_cpus);
				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	/*
	 * Fallback to the default domain if kmalloc() failed.
	 * See comments in partition_sched_domains().
	 */
	if (doms == NULL)
		ndoms = 1;

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
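
/*
 * A small worked example of the merge loop above (CPU numbers are
 * illustrative): with csa[0..2] having effective_cpus {0-1}, {1-2} and
 * {4-5}, the initial pn values 0, 1 and 2 start out as three candidate
 * domains.  csa[0] and csa[1] overlap on CPU 1, so pn 1 is rewritten to 0
 * and ndoms drops to 2, yielding the sched domains {0-2} and {4-5}.
 */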

static void dl_update_tasks_root_domain(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	if (cs->nr_deadline_tasks == 0)
		return;

	css_task_iter_start(&cs->css, 0, &it);

	while ((task = css_task_iter_next(&it)))
		dl_add_task_root_domain(task);

	css_task_iter_end(&it);
}

static void dl_rebuild_rd_accounting(void)
{
	struct cpuset *cs = NULL;
	struct cgroup_subsys_state *pos_css;

	lockdep_assert_held(&cpuset_mutex);
	lockdep_assert_cpus_held();
	lockdep_assert_held(&sched_domains_mutex);

	rcu_read_lock();

	/*
	 * Clear default root domain DL accounting, it will be computed again
	 * if a task belongs to it.
	 */
	dl_clear_root_domain(&def_root_domain);

	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {

		if (cpumask_empty(cs->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		css_get(&cs->css);

		rcu_read_unlock();

		dl_update_tasks_root_domain(cs);

		rcu_read_lock();
		css_put(&cs->css);
	}
	rcu_read_unlock();
}

static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{
	mutex_lock(&sched_domains_mutex);
	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
	dl_rebuild_rd_accounting();
	mutex_unlock(&sched_domains_mutex);
}

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes cpus_read_lock().
 */
static void rebuild_sched_domains_locked(void)
{
	struct cgroup_subsys_state *pos_css;
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	struct cpuset *cs;
	int ndoms;

	lockdep_assert_cpus_held();
	lockdep_assert_held(&cpuset_mutex);

	/*
	 * If we have raced with CPU hotplug, return early to avoid
	 * passing doms with offlined cpu to partition_sched_domains().
	 * Anyway, cpuset_hotplug_workfn() will rebuild sched domains.
	 *
	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
	 * should be the same as the active CPUs, so checking only top_cpuset
	 * is enough to detect racing CPU offlines.
	 */
	if (cpumask_empty(subpartitions_cpus) &&
	    !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
		return;

	/*
	 * With subpartition CPUs, however, the effective CPUs of a partition
	 * root should be only a subset of the active CPUs.  Since a CPU in
	 * any partition root could be offlined, all must be checked.
	 */
	if (top_cpuset.nr_subparts) {
		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (!is_partition_valid(cs)) {
				pos_css = css_rightmost_descendant(pos_css);
				continue;
			}
			if (!cpumask_subset(cs->effective_cpus,
					    cpu_active_mask)) {
				rcu_read_unlock();
				return;
			}
		}
		rcu_read_unlock();
	}

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_and_rebuild_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

void rebuild_sched_domains(void)
{
	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @new_cpus: the temp variable for the new effective_cpus mask
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.  For top_cpuset, task_cpu_possible_mask()
 * is used instead of effective_cpus to make sure all offline CPUs are also
 * included as hotplug code won't update cpumasks for tasks in top_cpuset.
 */
static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct css_task_iter it;
	struct task_struct *task;
	bool top_cs = cs == &top_cpuset;

	css_task_iter_start(&cs->css, 0, &it);
	while ((task = css_task_iter_next(&it))) {
		const struct cpumask *possible_mask = task_cpu_possible_mask(task);

		if (top_cs) {
			/*
			 * Percpu kthreads in top_cpuset are ignored
			 */
			if (kthread_is_per_cpu(task))
				continue;
			cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
		} else {
			cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
		}
		set_cpus_allowed_ptr(task, new_cpus);
	}
	css_task_iter_end(&it);
}

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset that needs its effective_cpus mask recomputed
 * @parent: the parent cpuset
 *
 * The result is valid only if the given cpuset isn't a partition root.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
				      struct cpuset *cs, struct cpuset *parent)
{
	cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
}

/*
 * Commands for update_parent_effective_cpumask
 */
enum partition_cmd {
	partcmd_enable,		/* Enable partition root	  */
	partcmd_enablei,	/* Enable isolated partition root */
	partcmd_disable,	/* Disable partition root	  */
	partcmd_update,		/* Update parent's effective_cpus */
	partcmd_invalidate,	/* Make partition invalid	  */
};
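
/*
 * For orientation (illustrative, based on the cgroup v2 interface): writing
 * "root" to cpuset.cpus.partition maps to partcmd_enable, "isolated" to
 * partcmd_enablei and "member" to partcmd_disable, while cpumask changes
 * and hotplug events use partcmd_update or partcmd_invalidate.
 */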
1332
1333static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1334 int turning_on);
1335static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1336 struct tmpmasks *tmp);
1337
1338/*
1339 * Update partition exclusive flag
1340 *
1341 * Return: 0 if successful, an error code otherwise
1342 */
1343static int update_partition_exclusive(struct cpuset *cs, int new_prs)
1344{
1345 bool exclusive = (new_prs > 0);
1346
1347 if (exclusive && !is_cpu_exclusive(cs)) {
1348 if (update_flag(bit: CS_CPU_EXCLUSIVE, cs, turning_on: 1))
1349 return PERR_NOTEXCL;
1350 } else if (!exclusive && is_cpu_exclusive(cs)) {
1351 /* Turning off CS_CPU_EXCLUSIVE will not return error */
1352 update_flag(bit: CS_CPU_EXCLUSIVE, cs, turning_on: 0);
1353 }
1354 return 0;
1355}
1356
1357/*
1358 * Update partition load balance flag and/or rebuild sched domain
1359 *
1360 * Changing load balance flag will automatically call
1361 * rebuild_sched_domains_locked().
1362 * This function is for cgroup v2 only.
1363 */
1364static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
1365{
1366 int new_prs = cs->partition_root_state;
1367 bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
1368 bool new_lb;
1369
1370 /*
1371 * If cs is not a valid partition root, the load balance state
1372 * will follow its parent.
1373 */
1374 if (new_prs > 0) {
1375 new_lb = (new_prs != PRS_ISOLATED);
1376 } else {
1377 new_lb = is_sched_load_balance(cs: parent_cs(cs));
1378 }
1379 if (new_lb != !!is_sched_load_balance(cs)) {
1380 rebuild_domains = true;
1381 if (new_lb)
1382 set_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cs->flags);
1383 else
1384 clear_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cs->flags);
1385 }
1386
1387 if (rebuild_domains)
1388 rebuild_sched_domains_locked();
1389}
1390
1391/*
1392 * tasks_nocpu_error - Return true if tasks will have no effective_cpus
1393 */
1394static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
1395 struct cpumask *xcpus)
1396{
1397 /*
1398 * A populated partition (cs or parent) can't have empty effective_cpus
1399 */
1400 return (cpumask_subset(src1p: parent->effective_cpus, src2p: xcpus) &&
1401 partition_is_populated(cs: parent, excluded_child: cs)) ||
1402 (!cpumask_intersects(src1p: xcpus, cpu_active_mask) &&
1403 partition_is_populated(cs, NULL));
1404}
1405
1406static void reset_partition_data(struct cpuset *cs)
1407{
1408 struct cpuset *parent = parent_cs(cs);
1409
1410 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
1411 return;
1412
1413 lockdep_assert_held(&callback_lock);
1414
1415 cs->nr_subparts = 0;
1416 if (cpumask_empty(srcp: cs->exclusive_cpus)) {
1417 cpumask_clear(dstp: cs->effective_xcpus);
1418 if (is_cpu_exclusive(cs))
1419 clear_bit(nr: CS_CPU_EXCLUSIVE, addr: &cs->flags);
1420 }
1421 if (!cpumask_and(dstp: cs->effective_cpus,
1422 src1p: parent->effective_cpus, src2p: cs->cpus_allowed)) {
1423 cs->use_parent_ecpus = true;
1424 parent->child_ecpus_count++;
1425 cpumask_copy(dstp: cs->effective_cpus, srcp: parent->effective_cpus);
1426 }
1427}
1428
1429/*
1430 * partition_xcpus_newstate - Exclusive CPUs state change
1431 * @old_prs: old partition_root_state
1432 * @new_prs: new partition_root_state
1433 * @xcpus: exclusive CPUs with state change
1434 */
1435static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
1436{
1437 WARN_ON_ONCE(old_prs == new_prs);
1438 if (new_prs == PRS_ISOLATED)
1439 cpumask_or(dstp: isolated_cpus, src1p: isolated_cpus, src2p: xcpus);
1440 else
1441 cpumask_andnot(dstp: isolated_cpus, src1p: isolated_cpus, src2p: xcpus);
1442}
1443
1444/*
1445 * partition_xcpus_add - Add new exclusive CPUs to partition
1446 * @new_prs: new partition_root_state
1447 * @parent: parent cpuset
1448 * @xcpus: exclusive CPUs to be added
1449 * Return: true if isolated_cpus modified, false otherwise
1450 *
1451 * Remote partition if parent == NULL
1452 */
1453static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
1454 struct cpumask *xcpus)
1455{
1456 bool isolcpus_updated;
1457
1458 WARN_ON_ONCE(new_prs < 0);
1459 lockdep_assert_held(&callback_lock);
1460 if (!parent)
1461 parent = &top_cpuset;
1462
1463
1464 if (parent == &top_cpuset)
1465 cpumask_or(dstp: subpartitions_cpus, src1p: subpartitions_cpus, src2p: xcpus);
1466
1467 isolcpus_updated = (new_prs != parent->partition_root_state);
1468 if (isolcpus_updated)
1469 partition_xcpus_newstate(old_prs: parent->partition_root_state, new_prs,
1470 xcpus);
1471
1472 cpumask_andnot(dstp: parent->effective_cpus, src1p: parent->effective_cpus, src2p: xcpus);
1473 return isolcpus_updated;
1474}
1475
1476/*
1477 * partition_xcpus_del - Remove exclusive CPUs from partition
1478 * @old_prs: old partition_root_state
1479 * @parent: parent cpuset
1480 * @xcpus: exclusive CPUs to be removed
1481 * Return: true if isolated_cpus modified, false otherwise
1482 *
1483 * Remote partition if parent == NULL
1484 */
1485static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
1486 struct cpumask *xcpus)
1487{
1488 bool isolcpus_updated;
1489
1490 WARN_ON_ONCE(old_prs < 0);
1491 lockdep_assert_held(&callback_lock);
1492 if (!parent)
1493 parent = &top_cpuset;
1494
1495 if (parent == &top_cpuset)
1496 cpumask_andnot(dstp: subpartitions_cpus, src1p: subpartitions_cpus, src2p: xcpus);
1497
1498 isolcpus_updated = (old_prs != parent->partition_root_state);
1499 if (isolcpus_updated)
1500 partition_xcpus_newstate(old_prs, new_prs: parent->partition_root_state,
1501 xcpus);
1502
1503 cpumask_and(dstp: xcpus, src1p: xcpus, cpu_active_mask);
1504 cpumask_or(dstp: parent->effective_cpus, src1p: parent->effective_cpus, src2p: xcpus);
1505 return isolcpus_updated;
1506}
1507
1508static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
1509{
1510 int ret;
1511
1512 lockdep_assert_cpus_held();
1513
1514 if (!isolcpus_updated)
1515 return;
1516
1517 ret = workqueue_unbound_exclude_cpumask(cpumask: isolated_cpus);
1518 WARN_ON_ONCE(ret < 0);
1519}
1520
1521/**
1522 * cpuset_cpu_is_isolated - Check if the given CPU is isolated
1523 * @cpu: the CPU number to be checked
1524 * Return: true if CPU is used in an isolated partition, false otherwise
1525 */
1526bool cpuset_cpu_is_isolated(int cpu)
1527{
1528 return cpumask_test_cpu(cpu, cpumask: isolated_cpus);
1529}
1530EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
1531
1532/*
1533 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
1534 * @cs: cpuset
1535 * @xcpus: effective exclusive CPUs value to be set
1536 * Return: true if xcpus is not empty, false otherwise.
1537 *
1538 * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
1539 * it must be a subset of cpus_allowed and parent's effective_xcpus.
1540 */
1541static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
1542 struct cpumask *xcpus)
1543{
1544 struct cpuset *parent = parent_cs(cs);
1545
1546 if (!xcpus)
1547 xcpus = cs->effective_xcpus;
1548
1549 if (!cpumask_empty(srcp: cs->exclusive_cpus))
1550 cpumask_and(dstp: xcpus, src1p: cs->exclusive_cpus, src2p: cs->cpus_allowed);
1551 else
1552 cpumask_copy(dstp: xcpus, srcp: cs->cpus_allowed);
1553
1554 return cpumask_and(dstp: xcpus, src1p: xcpus, src2p: parent->effective_xcpus);
1555}
1556
1557static inline bool is_remote_partition(struct cpuset *cs)
1558{
1559 return !list_empty(head: &cs->remote_sibling);
1560}
1561
1562static inline bool is_local_partition(struct cpuset *cs)
1563{
1564 return is_partition_valid(cs) && !is_remote_partition(cs);
1565}
1566
1567/*
1568 * remote_partition_enable - Enable current cpuset as a remote partition root
1569 * @cs: the cpuset to update
1570 * @new_prs: new partition_root_state
1571 * @tmp: temparary masks
1572 * Return: 1 if successful, 0 if error
1573 *
1574 * Enable the current cpuset to become a remote partition root taking CPUs
1575 * directly from the top cpuset. cpuset_mutex must be held by the caller.
1576 */
1577static int remote_partition_enable(struct cpuset *cs, int new_prs,
1578 struct tmpmasks *tmp)
1579{
1580 bool isolcpus_updated;
1581
1582 /*
1583 * The user must have sysadmin privilege.
1584 */
1585 if (!capable(CAP_SYS_ADMIN))
1586 return 0;
1587
1588 /*
1589 * The requested exclusive_cpus must not be allocated to other
1590 * partitions and it can't use up all the root's effective_cpus.
1591 *
1592 * Note that if there is any local partition root above it or
1593 * remote partition root underneath it, its exclusive_cpus must
1594 * have overlapped with subpartitions_cpus.
1595 */
1596 compute_effective_exclusive_cpumask(cs, xcpus: tmp->new_cpus);
1597 if (cpumask_empty(srcp: tmp->new_cpus) ||
1598 cpumask_intersects(src1p: tmp->new_cpus, src2p: subpartitions_cpus) ||
1599 cpumask_subset(src1p: top_cpuset.effective_cpus, src2p: tmp->new_cpus))
1600 return 0;
1601
1602 spin_lock_irq(lock: &callback_lock);
1603 isolcpus_updated = partition_xcpus_add(new_prs, NULL, xcpus: tmp->new_cpus);
1604 list_add(new: &cs->remote_sibling, head: &remote_children);
1605 if (cs->use_parent_ecpus) {
1606 struct cpuset *parent = parent_cs(cs);
1607
1608 cs->use_parent_ecpus = false;
1609 parent->child_ecpus_count--;
1610 }
1611 spin_unlock_irq(lock: &callback_lock);
1612 update_unbound_workqueue_cpumask(isolcpus_updated);
1613
1614 /*
1615 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
1616 */
1617 update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus);
1618 update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp);
1619 return 1;
1620}
1621
1622/*
1623 * remote_partition_disable - Remove current cpuset from remote partition list
1624 * @cs: the cpuset to update
1625 * @tmp: temparary masks
1626 *
1627 * The effective_cpus is also updated.
1628 *
1629 * cpuset_mutex must be held by the caller.
1630 */
1631static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
1632{
1633 bool isolcpus_updated;
1634
1635 compute_effective_exclusive_cpumask(cs, xcpus: tmp->new_cpus);
1636 WARN_ON_ONCE(!is_remote_partition(cs));
1637 WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
1638
1639 spin_lock_irq(lock: &callback_lock);
1640 list_del_init(entry: &cs->remote_sibling);
1641 isolcpus_updated = partition_xcpus_del(old_prs: cs->partition_root_state,
1642 NULL, xcpus: tmp->new_cpus);
1643 cs->partition_root_state = -cs->partition_root_state;
1644 if (!cs->prs_err)
1645 cs->prs_err = PERR_INVCPUS;
1646 reset_partition_data(cs);
1647 spin_unlock_irq(lock: &callback_lock);
1648 update_unbound_workqueue_cpumask(isolcpus_updated);
1649
1650 /*
1651 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
1652 */
1653 update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus);
1654 update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp);
1655}
1656
1657/*
1658 * remote_cpus_update - cpus_exclusive change of remote partition
1659 * @cs: the cpuset to be updated
1660 * @newmask: the new effective_xcpus mask
1661 * @tmp: temparary masks
1662 *
1663 * top_cpuset and subpartitions_cpus will be updated or partition can be
1664 * invalidated.
1665 */
1666static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
1667 struct tmpmasks *tmp)
1668{
1669 bool adding, deleting;
1670 int prs = cs->partition_root_state;
1671 int isolcpus_updated = 0;
1672
1673 if (WARN_ON_ONCE(!is_remote_partition(cs)))
1674 return;
1675
1676 WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
1677
1678 if (cpumask_empty(srcp: newmask))
1679 goto invalidate;
1680
1681 adding = cpumask_andnot(dstp: tmp->addmask, src1p: newmask, src2p: cs->effective_xcpus);
1682 deleting = cpumask_andnot(dstp: tmp->delmask, src1p: cs->effective_xcpus, src2p: newmask);
1683
1684 /*
1685 * Additions of remote CPUs is only allowed if those CPUs are
1686 * not allocated to other partitions and there are effective_cpus
1687 * left in the top cpuset.
1688 */
1689 if (adding && (!capable(CAP_SYS_ADMIN) ||
1690 cpumask_intersects(src1p: tmp->addmask, src2p: subpartitions_cpus) ||
1691 cpumask_subset(src1p: top_cpuset.effective_cpus, src2p: tmp->addmask)))
1692 goto invalidate;
1693
1694 spin_lock_irq(lock: &callback_lock);
1695 if (adding)
1696 isolcpus_updated += partition_xcpus_add(new_prs: prs, NULL, xcpus: tmp->addmask);
1697 if (deleting)
1698 isolcpus_updated += partition_xcpus_del(old_prs: prs, NULL, xcpus: tmp->delmask);
1699 spin_unlock_irq(lock: &callback_lock);
1700 update_unbound_workqueue_cpumask(isolcpus_updated);
1701
1702 /*
1703 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
1704 */
1705 update_tasks_cpumask(cs: &top_cpuset, new_cpus: tmp->new_cpus);
1706 update_sibling_cpumasks(parent: &top_cpuset, NULL, tmp);
1707 return;
1708
1709invalidate:
1710 remote_partition_disable(cs, tmp);
1711}
1712
1713/*
1714 * remote_partition_check - check if a child remote partition needs update
1715 * @cs: the cpuset to be updated
1716 * @newmask: the new effective_xcpus mask
1717 * @delmask: temporary mask for deletion (not in tmp)
1718 * @tmp: temparary masks
1719 *
1720 * This should be called before the given cs has updated its cpus_allowed
1721 * and/or effective_xcpus.
1722 */
1723static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
1724 struct cpumask *delmask, struct tmpmasks *tmp)
1725{
1726 struct cpuset *child, *next;
1727 int disable_cnt = 0;
1728
1729 /*
1730 * Compute the effective exclusive CPUs that will be deleted.
1731 */
1732 if (!cpumask_andnot(dstp: delmask, src1p: cs->effective_xcpus, src2p: newmask) ||
1733 !cpumask_intersects(src1p: delmask, src2p: subpartitions_cpus))
1734 return; /* No deletion of exclusive CPUs in partitions */
1735
1736 /*
1737 * Searching the remote children list to look for those that will
1738 * be impacted by the deletion of exclusive CPUs.
1739 *
1740 * Since a cpuset must be removed from the remote children list
1741 * before it can go offline and holding cpuset_mutex will prevent
1742 * any change in cpuset status. RCU read lock isn't needed.
1743 */
1744 lockdep_assert_held(&cpuset_mutex);
1745 list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
1746 if (cpumask_intersects(src1p: child->effective_cpus, src2p: delmask)) {
1747 remote_partition_disable(cs: child, tmp);
1748 disable_cnt++;
1749 }
1750 if (disable_cnt)
1751 rebuild_sched_domains_locked();
1752}
1753
/*
 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
 * @prstate: partition root state to be checked
 * @new_cpus: cpu mask
 * Return: true if there is conflict, false otherwise
 *
 * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
 * an isolated partition.
 */
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
	const struct cpumask *hk_domain = housekeeping_cpumask(HK_TYPE_DOMAIN);
	bool all_in_hk = cpumask_subset(new_cpus, hk_domain);

	if (!all_in_hk && (prstate != PRS_ISOLATED))
		return true;

	return false;
}
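
/*
 * Example (hypothetical boot setup): booting with "isolcpus=domain,2-3"
 * removes CPUs 2-3 from housekeeping_cpumask(HK_TYPE_DOMAIN). Turning a
 * cpuset containing CPU 2 or 3 into a "root" partition then fails with
 * PERR_HKEEPING, while making it an "isolated" partition (PRS_ISOLATED)
 * over the same CPUs does not.
 */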
1773
/**
 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
 * @cs: The cpuset that requests change in partition root state
 * @cmd: Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp: Temporary addmask and delmask
 * Return: 0 or a partition root state error code
 *
 * For partcmd_enable*, the cpuset is being transformed from a non-partition
 * root to a partition root. The effective_xcpus (cpus_allowed if
 * effective_xcpus not set) mask of the given cpuset will be taken away from
 * parent's effective_cpus. The function will return 0 if all the CPUs listed
 * in effective_xcpus can be granted; otherwise an error code will be
 * returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in effective_xcpus will be
 * given back to parent's effective_cpus. 0 will always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list is
 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
 * assumed to remain the same. The cpuset should either be a valid or invalid
 * partition root. The partition root state may change from valid to invalid
 * or vice versa. An error code will be returned if transitioning from
 * invalid to valid violates the exclusivity rule.
 *
 * For partcmd_invalidate, the current partition will be made invalid.
 *
 * The partcmd_enable* and partcmd_disable commands are used by
 * update_prstate(). An error code may be returned and the caller will check
 * for error.
 *
 * The partcmd_update command is used by update_cpumasks_hier() with newmask
 * NULL and by update_cpumask() with newmask set. The partcmd_invalidate
 * command is used by update_cpumask() with a NULL newmask. In both cases,
 * the callers won't check for error and so partition_root_state and prs_err
 * will be updated directly.
 */
1811static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
1812 struct cpumask *newmask,
1813 struct tmpmasks *tmp)
1814{
1815 struct cpuset *parent = parent_cs(cs);
1816 int adding; /* Adding cpus to parent's effective_cpus */
1817 int deleting; /* Deleting cpus from parent's effective_cpus */
1818 int old_prs, new_prs;
1819 int part_error = PERR_NONE; /* Partition error? */
1820 int subparts_delta = 0;
1821 struct cpumask *xcpus; /* cs effective_xcpus */
1822 int isolcpus_updated = 0;
1823 bool nocpu;
1824
1825 lockdep_assert_held(&cpuset_mutex);
1826
1827 /*
1828 * new_prs will only be changed for the partcmd_update and
1829 * partcmd_invalidate commands.
1830 */
1831 adding = deleting = false;
1832 old_prs = new_prs = cs->partition_root_state;
1833 xcpus = !cpumask_empty(srcp: cs->exclusive_cpus)
1834 ? cs->effective_xcpus : cs->cpus_allowed;
1835
1836 if (cmd == partcmd_invalidate) {
1837 if (is_prs_invalid(prs_state: old_prs))
1838 return 0;
1839
1840 /*
1841 * Make the current partition invalid.
1842 */
1843 if (is_partition_valid(cs: parent))
1844 adding = cpumask_and(dstp: tmp->addmask,
1845 src1p: xcpus, src2p: parent->effective_xcpus);
1846 if (old_prs > 0) {
1847 new_prs = -old_prs;
1848 subparts_delta--;
1849 }
1850 goto write_error;
1851 }
1852
1853 /*
1854 * The parent must be a partition root.
1855 * The new cpumask, if present, or the current cpus_allowed must
1856 * not be empty.
1857 */
1858 if (!is_partition_valid(cs: parent)) {
1859 return is_partition_invalid(cs: parent)
1860 ? PERR_INVPARENT : PERR_NOTPART;
1861 }
1862 if (!newmask && cpumask_empty(srcp: cs->cpus_allowed))
1863 return PERR_CPUSEMPTY;
1864
1865 nocpu = tasks_nocpu_error(parent, cs, xcpus);
1866
1867 if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
1868 /*
1869 * Enabling partition root is not allowed if its
1870 * effective_xcpus is empty or doesn't overlap with
1871 * parent's effective_xcpus.
1872 */
1873 if (cpumask_empty(srcp: xcpus) ||
1874 !cpumask_intersects(src1p: xcpus, src2p: parent->effective_xcpus))
1875 return PERR_INVCPUS;
1876
1877 if (prstate_housekeeping_conflict(prstate: new_prs, new_cpus: xcpus))
1878 return PERR_HKEEPING;
1879
1880 /*
1881 * A parent can be left with no CPU as long as there is no
1882 * task directly associated with the parent partition.
1883 */
1884 if (nocpu)
1885 return PERR_NOCPUS;
1886
1887 cpumask_copy(dstp: tmp->delmask, srcp: xcpus);
1888 deleting = true;
1889 subparts_delta++;
1890 new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
1891 } else if (cmd == partcmd_disable) {
1892 /*
1893 * May need to add cpus to parent's effective_cpus for
1894 * valid partition root.
1895 */
1896 adding = !is_prs_invalid(prs_state: old_prs) &&
1897 cpumask_and(dstp: tmp->addmask, src1p: xcpus, src2p: parent->effective_xcpus);
1898 if (adding)
1899 subparts_delta--;
1900 new_prs = PRS_MEMBER;
1901 } else if (newmask) {
1902 /*
1903 * Empty cpumask is not allowed
1904 */
1905 if (cpumask_empty(srcp: newmask)) {
1906 part_error = PERR_CPUSEMPTY;
1907 goto write_error;
1908 }
1909
1910 /*
1911 * partcmd_update with newmask:
1912 *
1913 * Compute add/delete mask to/from effective_cpus
1914 *
1915 * For valid partition:
1916 * addmask = exclusive_cpus & ~newmask
1917 * & parent->effective_xcpus
1918 * delmask = newmask & ~exclusive_cpus
1919 * & parent->effective_xcpus
1920 *
1921 * For invalid partition:
1922 * delmask = newmask & parent->effective_xcpus
1923 */
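		/*
		 * Worked example (hypothetical cpu numbers, valid partition
		 * case): with xcpus = 0-3, newmask = 2-5 and
		 * parent->effective_xcpus = 0-7:
		 *
		 *	addmask = (0-3) & ~(2-5) & (0-7) = 0-1
		 *	delmask = (2-5) & ~(0-3) & (0-7) = 4-5
		 *
		 * i.e. CPUs 0-1 go back to the parent while CPUs 4-5 are
		 * taken from it.
		 */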
1924 if (is_prs_invalid(prs_state: old_prs)) {
1925 adding = false;
1926 deleting = cpumask_and(dstp: tmp->delmask,
1927 src1p: newmask, src2p: parent->effective_xcpus);
1928 } else {
1929 cpumask_andnot(dstp: tmp->addmask, src1p: xcpus, src2p: newmask);
1930 adding = cpumask_and(dstp: tmp->addmask, src1p: tmp->addmask,
1931 src2p: parent->effective_xcpus);
1932
1933 cpumask_andnot(dstp: tmp->delmask, src1p: newmask, src2p: xcpus);
1934 deleting = cpumask_and(dstp: tmp->delmask, src1p: tmp->delmask,
1935 src2p: parent->effective_xcpus);
1936 }
1937 /*
1938 * Make partition invalid if parent's effective_cpus could
1939 * become empty and there are tasks in the parent.
1940 */
1941 if (nocpu && (!adding ||
1942 !cpumask_intersects(src1p: tmp->addmask, cpu_active_mask))) {
1943 part_error = PERR_NOCPUS;
1944 deleting = false;
1945 adding = cpumask_and(dstp: tmp->addmask,
1946 src1p: xcpus, src2p: parent->effective_xcpus);
1947 }
1948 } else {
1949 /*
1950 * partcmd_update w/o newmask
1951 *
1952 * delmask = effective_xcpus & parent->effective_cpus
1953 *
1954 * This can be called from:
1955 * 1) update_cpumasks_hier()
1956 * 2) cpuset_hotplug_update_tasks()
1957 *
1958 * Check to see if it can be transitioned from valid to
1959 * invalid partition or vice versa.
1960 *
1961 * A partition error happens when parent has tasks and all
1962 * its effective CPUs will have to be distributed out.
1963 */
1964 WARN_ON_ONCE(!is_partition_valid(parent));
1965 if (nocpu) {
1966 part_error = PERR_NOCPUS;
1967 if (is_partition_valid(cs))
1968 adding = cpumask_and(dstp: tmp->addmask,
1969 src1p: xcpus, src2p: parent->effective_xcpus);
1970 } else if (is_partition_invalid(cs) &&
1971 cpumask_subset(src1p: xcpus, src2p: parent->effective_xcpus)) {
1972 struct cgroup_subsys_state *css;
1973 struct cpuset *child;
1974 bool exclusive = true;
1975
			/*
			 * Converting an invalid partition to a valid one has
			 * to pass the cpu exclusivity test.
			 */
			rcu_read_lock();
			cpuset_for_each_child(child, css, parent) {
				if (child == cs)
					continue;
				if (!cpusets_are_exclusive(cs, child)) {
					exclusive = false;
					break;
				}
			}
			rcu_read_unlock();
			if (exclusive)
				deleting = cpumask_and(tmp->delmask,
						       xcpus, parent->effective_cpus);
			else
				part_error = PERR_NOTEXCL;
1995 }
1996 }
1997
1998write_error:
1999 if (part_error)
2000 WRITE_ONCE(cs->prs_err, part_error);
2001
2002 if (cmd == partcmd_update) {
2003 /*
2004 * Check for possible transition between valid and invalid
2005 * partition root.
2006 */
2007 switch (cs->partition_root_state) {
2008 case PRS_ROOT:
2009 case PRS_ISOLATED:
2010 if (part_error) {
2011 new_prs = -old_prs;
2012 subparts_delta--;
2013 }
2014 break;
2015 case PRS_INVALID_ROOT:
2016 case PRS_INVALID_ISOLATED:
2017 if (!part_error) {
2018 new_prs = -old_prs;
2019 subparts_delta++;
2020 }
2021 break;
2022 }
2023 }
2024
2025 if (!adding && !deleting && (new_prs == old_prs))
2026 return 0;
2027
2028 /*
2029 * Transitioning between invalid to valid or vice versa may require
2030 * changing CS_CPU_EXCLUSIVE. In the case of partcmd_update,
2031 * validate_change() has already been successfully called and
2032 * CPU lists in cs haven't been updated yet. So defer it to later.
2033 */
2034 if ((old_prs != new_prs) && (cmd != partcmd_update)) {
2035 int err = update_partition_exclusive(cs, new_prs);
2036
2037 if (err)
2038 return err;
2039 }
2040
2041 /*
2042 * Change the parent's effective_cpus & effective_xcpus (top cpuset
2043 * only).
2044 *
2045 * Newly added CPUs will be removed from effective_cpus and
2046 * newly deleted ones will be added back to effective_cpus.
2047 */
2048 spin_lock_irq(lock: &callback_lock);
2049 if (old_prs != new_prs) {
2050 cs->partition_root_state = new_prs;
2051 if (new_prs <= 0)
2052 cs->nr_subparts = 0;
2053 }
	/*
	 * Adding CPUs to the parent's effective_cpus means deleting them
	 * from cs and vice versa.
	 */
	if (adding)
		isolcpus_updated += partition_xcpus_del(old_prs, parent,
							tmp->addmask);
	if (deleting)
		isolcpus_updated += partition_xcpus_add(new_prs, parent,
							tmp->delmask);
2064
2065 if (is_partition_valid(cs: parent)) {
2066 parent->nr_subparts += subparts_delta;
2067 WARN_ON_ONCE(parent->nr_subparts < 0);
2068 }
2069 spin_unlock_irq(lock: &callback_lock);
2070 update_unbound_workqueue_cpumask(isolcpus_updated);
2071
2072 if ((old_prs != new_prs) && (cmd == partcmd_update))
2073 update_partition_exclusive(cs, new_prs);
2074
2075 if (adding || deleting) {
2076 update_tasks_cpumask(cs: parent, new_cpus: tmp->addmask);
2077 update_sibling_cpumasks(parent, cs, tmp);
2078 }
2079
2080 /*
2081 * For partcmd_update without newmask, it is being called from
2082 * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
2083 * Update the load balance flag and scheduling domain if
2084 * cpus_read_trylock() is successful.
2085 */
2086 if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
2087 update_partition_sd_lb(cs, old_prs);
2088 cpus_read_unlock();
2089 }
2090
2091 notify_partition_change(cs, old_prs);
2092 return 0;
2093}
2094
2095/**
2096 * compute_partition_effective_cpumask - compute effective_cpus for partition
2097 * @cs: partition root cpuset
2098 * @new_ecpus: previously computed effective_cpus to be updated
2099 *
2100 * Compute the effective_cpus of a partition root by scanning effective_xcpus
2101 * of child partition roots and excluding their effective_xcpus.
2102 *
2103 * This has the side effect of invalidating valid child partition roots,
2104 * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
2105 * or update_cpumasks_hier() where parent and children are modified
2106 * successively, we don't need to call update_parent_effective_cpumask()
2107 * and the child's effective_cpus will be updated in later iterations.
2108 *
2109 * Note that rcu_read_lock() is assumed to be held.
2110 */
2111static void compute_partition_effective_cpumask(struct cpuset *cs,
2112 struct cpumask *new_ecpus)
2113{
2114 struct cgroup_subsys_state *css;
2115 struct cpuset *child;
2116 bool populated = partition_is_populated(cs, NULL);
2117
	/*
	 * Check child partition roots to see if they should be
	 * invalidated when:
	 *  1) the child's effective_xcpus is not a subset of the new
	 *     exclusive_cpus
	 *  2) all the effective_cpus will be used up and cp has tasks
	 */
	compute_effective_exclusive_cpumask(cs, new_ecpus);
	cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
2128
2129 rcu_read_lock();
2130 cpuset_for_each_child(child, css, cs) {
2131 if (!is_partition_valid(cs: child))
2132 continue;
2133
2134 child->prs_err = 0;
2135 if (!cpumask_subset(src1p: child->effective_xcpus,
2136 src2p: cs->effective_xcpus))
2137 child->prs_err = PERR_INVCPUS;
2138 else if (populated &&
2139 cpumask_subset(src1p: new_ecpus, src2p: child->effective_xcpus))
2140 child->prs_err = PERR_NOCPUS;
2141
2142 if (child->prs_err) {
2143 int old_prs = child->partition_root_state;
2144
2145 /*
2146 * Invalidate child partition
2147 */
2148 spin_lock_irq(lock: &callback_lock);
2149 make_partition_invalid(cs: child);
2150 cs->nr_subparts--;
2151 child->nr_subparts = 0;
2152 spin_unlock_irq(lock: &callback_lock);
2153 notify_partition_change(cs: child, old_prs);
2154 continue;
2155 }
2156 cpumask_andnot(dstp: new_ecpus, src1p: new_ecpus,
2157 src2p: child->effective_xcpus);
2158 }
2159 rcu_read_unlock();
2160}
2161
/*
 * update_cpumasks_hier() flags
 */
#define HIER_CHECKALL		0x01	/* Check all cpusets with no skipping */
#define HIER_NO_SD_REBUILD	0x02	/* Don't rebuild sched domains */

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 * @flags: HIER_* flags; HIER_CHECKALL means don't skip any descendant cpusets
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
				 int flags)
2183{
2184 struct cpuset *cp;
2185 struct cgroup_subsys_state *pos_css;
2186 bool need_rebuild_sched_domains = false;
2187 int old_prs, new_prs;
2188
2189 rcu_read_lock();
2190 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2191 struct cpuset *parent = parent_cs(cs: cp);
2192 bool remote = is_remote_partition(cs: cp);
2193 bool update_parent = false;
2194
		/*
		 * Skip a descendant remote partition that acquires CPUs
		 * directly from the top cpuset unless it is cs.
		 */
		if (remote && (cp != cs)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}
2203
2204 /*
2205 * Update effective_xcpus if exclusive_cpus set.
2206 * The case when exclusive_cpus isn't set is handled later.
2207 */
2208 if (!cpumask_empty(srcp: cp->exclusive_cpus) && (cp != cs)) {
2209 spin_lock_irq(lock: &callback_lock);
2210 compute_effective_exclusive_cpumask(cs: cp, NULL);
2211 spin_unlock_irq(lock: &callback_lock);
2212 }
2213
2214 old_prs = new_prs = cp->partition_root_state;
2215 if (remote || (is_partition_valid(cs: parent) &&
2216 is_partition_valid(cs: cp)))
2217 compute_partition_effective_cpumask(cs: cp, new_ecpus: tmp->new_cpus);
2218 else
2219 compute_effective_cpumask(new_cpus: tmp->new_cpus, cs: cp, parent);
2220
2221 /*
2222 * A partition with no effective_cpus is allowed as long as
2223 * there is no task associated with it. Call
2224 * update_parent_effective_cpumask() to check it.
2225 */
2226 if (is_partition_valid(cs: cp) && cpumask_empty(srcp: tmp->new_cpus)) {
2227 update_parent = true;
2228 goto update_parent_effective;
2229 }
2230
2231 /*
2232 * If it becomes empty, inherit the effective mask of the
2233 * parent, which is guaranteed to have some CPUs unless
2234 * it is a partition root that has explicitly distributed
2235 * out all its CPUs.
2236 */
2237 if (is_in_v2_mode() && !remote && cpumask_empty(srcp: tmp->new_cpus)) {
2238 cpumask_copy(dstp: tmp->new_cpus, srcp: parent->effective_cpus);
2239 if (!cp->use_parent_ecpus) {
2240 cp->use_parent_ecpus = true;
2241 parent->child_ecpus_count++;
2242 }
2243 } else if (cp->use_parent_ecpus) {
2244 cp->use_parent_ecpus = false;
2245 WARN_ON_ONCE(!parent->child_ecpus_count);
2246 parent->child_ecpus_count--;
2247 }
2248
2249 if (remote)
2250 goto get_css;
2251
2252 /*
2253 * Skip the whole subtree if
2254 * 1) the cpumask remains the same,
2255 * 2) has no partition root state,
2256 * 3) HIER_CHECKALL flag not set, and
2257 * 4) for v2 load balance state same as its parent.
2258 */
2259 if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
2260 cpumask_equal(src1p: tmp->new_cpus, src2p: cp->effective_cpus) &&
2261 (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
2262 (is_sched_load_balance(cs: parent) == is_sched_load_balance(cs: cp)))) {
2263 pos_css = css_rightmost_descendant(pos: pos_css);
2264 continue;
2265 }
2266
2267update_parent_effective:
2268 /*
2269 * update_parent_effective_cpumask() should have been called
2270 * for cs already in update_cpumask(). We should also call
2271 * update_tasks_cpumask() again for tasks in the parent
2272 * cpuset if the parent's effective_cpus changes.
2273 */
2274 if ((cp != cs) && old_prs) {
2275 switch (parent->partition_root_state) {
2276 case PRS_ROOT:
2277 case PRS_ISOLATED:
2278 update_parent = true;
2279 break;
2280
2281 default:
2282 /*
2283 * When parent is not a partition root or is
2284 * invalid, child partition roots become
2285 * invalid too.
2286 */
2287 if (is_partition_valid(cs: cp))
2288 new_prs = -cp->partition_root_state;
2289 WRITE_ONCE(cp->prs_err,
2290 is_partition_invalid(parent)
2291 ? PERR_INVPARENT : PERR_NOTPART);
2292 break;
2293 }
2294 }
2295get_css:
2296 if (!css_tryget_online(css: &cp->css))
2297 continue;
2298 rcu_read_unlock();
2299
2300 if (update_parent) {
2301 update_parent_effective_cpumask(cs: cp, cmd: partcmd_update, NULL, tmp);
2302 /*
2303 * The cpuset partition_root_state may become
2304 * invalid. Capture it.
2305 */
2306 new_prs = cp->partition_root_state;
2307 }
2308
2309 spin_lock_irq(lock: &callback_lock);
2310 cpumask_copy(dstp: cp->effective_cpus, srcp: tmp->new_cpus);
2311 cp->partition_root_state = new_prs;
2312 /*
2313 * Make sure effective_xcpus is properly set for a valid
2314 * partition root.
2315 */
2316 if ((new_prs > 0) && cpumask_empty(srcp: cp->exclusive_cpus))
2317 cpumask_and(dstp: cp->effective_xcpus,
2318 src1p: cp->cpus_allowed, src2p: parent->effective_xcpus);
2319 else if (new_prs < 0)
2320 reset_partition_data(cs: cp);
2321 spin_unlock_irq(lock: &callback_lock);
2322
2323 notify_partition_change(cs: cp, old_prs);
2324
2325 WARN_ON(!is_in_v2_mode() &&
2326 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
2327
2328 update_tasks_cpumask(cs: cp, new_cpus: cp->effective_cpus);
2329
2330 /*
2331 * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE
2332 * from parent if current cpuset isn't a valid partition root
2333 * and their load balance states differ.
2334 */
2335 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2336 !is_partition_valid(cs: cp) &&
2337 (is_sched_load_balance(cs: parent) != is_sched_load_balance(cs: cp))) {
2338 if (is_sched_load_balance(cs: parent))
2339 set_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cp->flags);
2340 else
2341 clear_bit(nr: CS_SCHED_LOAD_BALANCE, addr: &cp->flags);
2342 }
2343
2344 /*
2345 * On legacy hierarchy, if the effective cpumask of any non-
2346 * empty cpuset is changed, we need to rebuild sched domains.
2347 * On default hierarchy, the cpuset needs to be a partition
2348 * root as well.
2349 */
2350 if (!cpumask_empty(srcp: cp->cpus_allowed) &&
2351 is_sched_load_balance(cs: cp) &&
2352 (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
2353 is_partition_valid(cs: cp)))
2354 need_rebuild_sched_domains = true;
2355
2356 rcu_read_lock();
2357 css_put(css: &cp->css);
2358 }
2359 rcu_read_unlock();
2360
2361 if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD))
2362 rebuild_sched_domains_locked();
2363}
2364
2365/**
2366 * update_sibling_cpumasks - Update siblings cpumasks
2367 * @parent: Parent cpuset
2368 * @cs: Current cpuset
2369 * @tmp: Temp variables
2370 */
2371static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
2372 struct tmpmasks *tmp)
2373{
2374 struct cpuset *sibling;
2375 struct cgroup_subsys_state *pos_css;
2376
2377 lockdep_assert_held(&cpuset_mutex);
2378
	/*
	 * Check all its siblings and call update_cpumasks_hier()
	 * if their effective_cpus will need to be changed.
	 *
	 * With the addition of effective_xcpus, which is a subset of
	 * cpus_allowed, it is possible that a change in the parent's
	 * effective_cpus due to a change in a child partition's
	 * effective_xcpus will impact its siblings even if they do not
	 * inherit the parent's effective_cpus directly.
	 *
	 * The update_cpumasks_hier() function may sleep. So we have to
	 * release the RCU read lock before calling it. The
	 * HIER_NO_SD_REBUILD flag is used to suppress rebuilds of sched
	 * domains as the callers will take care of that.
	 */
2394 rcu_read_lock();
2395 cpuset_for_each_child(sibling, pos_css, parent) {
2396 if (sibling == cs)
2397 continue;
2398 if (!sibling->use_parent_ecpus &&
2399 !is_partition_valid(cs: sibling)) {
2400 compute_effective_cpumask(new_cpus: tmp->new_cpus, cs: sibling,
2401 parent);
2402 if (cpumask_equal(src1p: tmp->new_cpus, src2p: sibling->effective_cpus))
2403 continue;
2404 }
2405 if (!css_tryget_online(css: &sibling->css))
2406 continue;
2407
2408 rcu_read_unlock();
2409 update_cpumasks_hier(cs: sibling, tmp, HIER_NO_SD_REBUILD);
2410 rcu_read_lock();
2411 css_put(css: &sibling->css);
2412 }
2413 rcu_read_unlock();
2414}
2415
2416/**
2417 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
2418 * @cs: the cpuset to consider
2419 * @trialcs: trial cpuset
2420 * @buf: buffer of cpu numbers written to this cpuset
2421 */
2422static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2423 const char *buf)
2424{
2425 int retval;
2426 struct tmpmasks tmp;
2427 struct cpuset *parent = parent_cs(cs);
2428 bool invalidate = false;
2429 int hier_flags = 0;
2430 int old_prs = cs->partition_root_state;
2431
2432 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
2433 if (cs == &top_cpuset)
2434 return -EACCES;
2435
2436 /*
2437 * An empty cpus_allowed is ok only if the cpuset has no tasks.
2438 * Since cpulist_parse() fails on an empty mask, we special case
2439 * that parsing. The validate_change() call ensures that cpusets
2440 * with tasks have cpus.
2441 */
2442 if (!*buf) {
2443 cpumask_clear(dstp: trialcs->cpus_allowed);
2444 cpumask_clear(dstp: trialcs->effective_xcpus);
2445 } else {
2446 retval = cpulist_parse(buf, dstp: trialcs->cpus_allowed);
2447 if (retval < 0)
2448 return retval;
2449
2450 if (!cpumask_subset(src1p: trialcs->cpus_allowed,
2451 src2p: top_cpuset.cpus_allowed))
2452 return -EINVAL;
2453
		/*
		 * When exclusive_cpus isn't explicitly set, it is constrained
		 * by cpus_allowed and parent's effective_xcpus. Otherwise,
		 * trialcs->effective_xcpus is used as a temporary cpumask
		 * for checking validity of the partition root.
		 */
2460 if (!cpumask_empty(srcp: trialcs->exclusive_cpus) || is_partition_valid(cs))
2461 compute_effective_exclusive_cpumask(cs: trialcs, NULL);
2462 }
2463
2464 /* Nothing to do if the cpus didn't change */
2465 if (cpumask_equal(src1p: cs->cpus_allowed, src2p: trialcs->cpus_allowed))
2466 return 0;
2467
2468 if (alloc_cpumasks(NULL, tmp: &tmp))
2469 return -ENOMEM;
2470
2471 if (old_prs) {
2472 if (is_partition_valid(cs) &&
2473 cpumask_empty(srcp: trialcs->effective_xcpus)) {
2474 invalidate = true;
2475 cs->prs_err = PERR_INVCPUS;
2476 } else if (prstate_housekeeping_conflict(prstate: old_prs, new_cpus: trialcs->effective_xcpus)) {
2477 invalidate = true;
2478 cs->prs_err = PERR_HKEEPING;
2479 } else if (tasks_nocpu_error(parent, cs, xcpus: trialcs->effective_xcpus)) {
2480 invalidate = true;
2481 cs->prs_err = PERR_NOCPUS;
2482 }
2483 }
2484
2485 /*
2486 * Check all the descendants in update_cpumasks_hier() if
2487 * effective_xcpus is to be changed.
2488 */
2489 if (!cpumask_equal(src1p: cs->effective_xcpus, src2p: trialcs->effective_xcpus))
2490 hier_flags = HIER_CHECKALL;
2491
2492 retval = validate_change(cur: cs, trial: trialcs);
2493
2494 if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
2495 struct cgroup_subsys_state *css;
2496 struct cpuset *cp;
2497
		/*
		 * The -EINVAL error code indicates that the partition
		 * sibling CPU exclusivity rule has been violated. We still
		 * allow the cpumask change to proceed while invalidating
		 * the partition. However, any conflicting sibling partitions
		 * have to be marked as invalid too.
		 */
2505 invalidate = true;
2506 rcu_read_lock();
2507 cpuset_for_each_child(cp, css, parent) {
2508 struct cpumask *xcpus = fetch_xcpus(cs: trialcs);
2509
2510 if (is_partition_valid(cs: cp) &&
2511 cpumask_intersects(src1p: xcpus, src2p: cp->effective_xcpus)) {
2512 rcu_read_unlock();
2513 update_parent_effective_cpumask(cs: cp, cmd: partcmd_invalidate, NULL, tmp: &tmp);
2514 rcu_read_lock();
2515 }
2516 }
2517 rcu_read_unlock();
2518 retval = 0;
2519 }
2520
2521 if (retval < 0)
2522 goto out_free;
2523
2524 if (is_partition_valid(cs) ||
2525 (is_partition_invalid(cs) && !invalidate)) {
2526 struct cpumask *xcpus = trialcs->effective_xcpus;
2527
2528 if (cpumask_empty(srcp: xcpus) && is_partition_invalid(cs))
2529 xcpus = trialcs->cpus_allowed;
2530
2531 /*
2532 * Call remote_cpus_update() to handle valid remote partition
2533 */
2534 if (is_remote_partition(cs))
2535 remote_cpus_update(cs, newmask: xcpus, tmp: &tmp);
2536 else if (invalidate)
2537 update_parent_effective_cpumask(cs, cmd: partcmd_invalidate,
2538 NULL, tmp: &tmp);
2539 else
2540 update_parent_effective_cpumask(cs, cmd: partcmd_update,
2541 newmask: xcpus, tmp: &tmp);
2542 } else if (!cpumask_empty(srcp: cs->exclusive_cpus)) {
2543 /*
2544 * Use trialcs->effective_cpus as a temp cpumask
2545 */
2546 remote_partition_check(cs, newmask: trialcs->effective_xcpus,
2547 delmask: trialcs->effective_cpus, tmp: &tmp);
2548 }
2549
2550 spin_lock_irq(lock: &callback_lock);
2551 cpumask_copy(dstp: cs->cpus_allowed, srcp: trialcs->cpus_allowed);
2552 cpumask_copy(dstp: cs->effective_xcpus, srcp: trialcs->effective_xcpus);
2553 if ((old_prs > 0) && !is_partition_valid(cs))
2554 reset_partition_data(cs);
2555 spin_unlock_irq(lock: &callback_lock);
2556
2557 /* effective_cpus/effective_xcpus will be updated here */
2558 update_cpumasks_hier(cs, tmp: &tmp, flags: hier_flags);
2559
2560 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2561 if (cs->partition_root_state)
2562 update_partition_sd_lb(cs, old_prs);
2563out_free:
2564 free_cpumasks(NULL, tmp: &tmp);
2565 return retval;
2566}
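
/*
 * A minimal userspace view of this path (cgroup v2 mount point and
 * group name are illustrative):
 *
 *	echo 2-5 > /sys/fs/cgroup/grp/cpuset.cpus
 *
 * An empty write is only accepted when the cpuset has no tasks, per the
 * validate_change() rules noted above.
 */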
2567
2568/**
2569 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
2570 * @cs: the cpuset to consider
2571 * @trialcs: trial cpuset
2572 * @buf: buffer of cpu numbers written to this cpuset
2573 *
2574 * The tasks' cpumask will be updated if cs is a valid partition root.
2575 */
2576static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
2577 const char *buf)
2578{
2579 int retval;
2580 struct tmpmasks tmp;
2581 struct cpuset *parent = parent_cs(cs);
2582 bool invalidate = false;
2583 int hier_flags = 0;
2584 int old_prs = cs->partition_root_state;
2585
2586 if (!*buf) {
2587 cpumask_clear(dstp: trialcs->exclusive_cpus);
2588 cpumask_clear(dstp: trialcs->effective_xcpus);
2589 } else {
2590 retval = cpulist_parse(buf, dstp: trialcs->exclusive_cpus);
2591 if (retval < 0)
2592 return retval;
2593 if (!is_cpu_exclusive(cs))
2594 set_bit(nr: CS_CPU_EXCLUSIVE, addr: &trialcs->flags);
2595 }
2596
2597 /* Nothing to do if the CPUs didn't change */
2598 if (cpumask_equal(src1p: cs->exclusive_cpus, src2p: trialcs->exclusive_cpus))
2599 return 0;
2600
2601 if (*buf)
2602 compute_effective_exclusive_cpumask(cs: trialcs, NULL);
2603
2604 /*
2605 * Check all the descendants in update_cpumasks_hier() if
2606 * effective_xcpus is to be changed.
2607 */
2608 if (!cpumask_equal(src1p: cs->effective_xcpus, src2p: trialcs->effective_xcpus))
2609 hier_flags = HIER_CHECKALL;
2610
2611 retval = validate_change(cur: cs, trial: trialcs);
2612 if (retval)
2613 return retval;
2614
2615 if (alloc_cpumasks(NULL, tmp: &tmp))
2616 return -ENOMEM;
2617
2618 if (old_prs) {
2619 if (cpumask_empty(srcp: trialcs->effective_xcpus)) {
2620 invalidate = true;
2621 cs->prs_err = PERR_INVCPUS;
2622 } else if (prstate_housekeeping_conflict(prstate: old_prs, new_cpus: trialcs->effective_xcpus)) {
2623 invalidate = true;
2624 cs->prs_err = PERR_HKEEPING;
2625 } else if (tasks_nocpu_error(parent, cs, xcpus: trialcs->effective_xcpus)) {
2626 invalidate = true;
2627 cs->prs_err = PERR_NOCPUS;
2628 }
2629
2630 if (is_remote_partition(cs)) {
2631 if (invalidate)
2632 remote_partition_disable(cs, tmp: &tmp);
2633 else
2634 remote_cpus_update(cs, newmask: trialcs->effective_xcpus,
2635 tmp: &tmp);
2636 } else if (invalidate) {
2637 update_parent_effective_cpumask(cs, cmd: partcmd_invalidate,
2638 NULL, tmp: &tmp);
2639 } else {
2640 update_parent_effective_cpumask(cs, cmd: partcmd_update,
2641 newmask: trialcs->effective_xcpus, tmp: &tmp);
2642 }
2643 } else if (!cpumask_empty(srcp: trialcs->exclusive_cpus)) {
2644 /*
2645 * Use trialcs->effective_cpus as a temp cpumask
2646 */
2647 remote_partition_check(cs, newmask: trialcs->effective_xcpus,
2648 delmask: trialcs->effective_cpus, tmp: &tmp);
2649 }
2650 spin_lock_irq(lock: &callback_lock);
2651 cpumask_copy(dstp: cs->exclusive_cpus, srcp: trialcs->exclusive_cpus);
2652 cpumask_copy(dstp: cs->effective_xcpus, srcp: trialcs->effective_xcpus);
2653 if ((old_prs > 0) && !is_partition_valid(cs))
2654 reset_partition_data(cs);
2655 spin_unlock_irq(lock: &callback_lock);
2656
2657 /*
2658 * Call update_cpumasks_hier() to update effective_cpus/effective_xcpus
2659 * of the subtree when it is a valid partition root or effective_xcpus
2660 * is updated.
2661 */
2662 if (is_partition_valid(cs) || hier_flags)
2663 update_cpumasks_hier(cs, tmp: &tmp, flags: hier_flags);
2664
2665 /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
2666 if (cs->partition_root_state)
2667 update_partition_sd_lb(cs, old_prs);
2668
2669 free_cpumasks(NULL, tmp: &tmp);
2670 return 0;
2671}
2672
/*
 * Migrate memory region from one set of nodes to another. This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management. All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
 */

struct cpuset_migrate_mm_work {
	struct work_struct	work;
	struct mm_struct	*mm;
	nodemask_t		from;
	nodemask_t		to;
};

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
	struct cpuset_migrate_mm_work *mwork =
		container_of(work, struct cpuset_migrate_mm_work, work);

	/* on a wq worker, no need to worry about %current's mems_allowed */
	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
	mmput(mwork->mm);
	kfree(mwork);
}

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct cpuset_migrate_mm_work *mwork;

	if (nodes_equal(*from, *to)) {
		mmput(mm);
		return;
	}

	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
	if (mwork) {
		mwork->mm = mm;
		mwork->from = *from;
		mwork->to = *to;
		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
		queue_work(cpuset_migrate_mm_wq, &mwork->work);
	} else {
		mmput(mm);
	}
}

static void cpuset_post_attach(void)
{
	flush_workqueue(cpuset_migrate_mm_wq);
}
2725
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind the task's mempolicy, if any. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	task_lock(tsk);

	local_irq_disable();
	write_seqcount_begin(&tsk->mems_allowed_seq);

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems);
	tsk->mems_allowed = *newmems;

	write_seqcount_end(&tsk->mems_allowed_seq);
	local_irq_enable();

	task_unlock(tsk);
}
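
/*
 * The reader side pairs with the seqcount above roughly as follows.
 * This is a sketch of the pattern the page allocator uses via
 * read_mems_allowed_begin()/read_mems_allowed_retry(), not the exact
 * allocator code (try_alloc() is a hypothetical stand-in):
 *
 *	unsigned int cookie;
 *	struct page *page;
 *
 *	do {
 *		cookie = read_mems_allowed_begin();
 *		page = try_alloc();	// against current->mems_allowed
 *	} while (!page && read_mems_allowed_retry(cookie));
 */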
2753
2754static void *cpuset_being_rebound;
2755
2756/**
2757 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
2758 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
2759 *
2760 * Iterate through each task of @cs updating its mems_allowed to the
2761 * effective cpuset's. As this function is called with cpuset_mutex held,
2762 * cpuset membership stays stable.
2763 */
2764static void update_tasks_nodemask(struct cpuset *cs)
2765{
2766 static nodemask_t newmems; /* protected by cpuset_mutex */
2767 struct css_task_iter it;
2768 struct task_struct *task;
2769
2770 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
2771
2772 guarantee_online_mems(cs, pmask: &newmems);
2773
2774 /*
2775 * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
2776 * take while holding tasklist_lock. Forks can happen - the
2777 * mpol_dup() cpuset_being_rebound check will catch such forks,
2778 * and rebind their vma mempolicies too. Because we still hold
2779 * the global cpuset_mutex, we know that no other rebind effort
2780 * will be contending for the global variable cpuset_being_rebound.
2781 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
2782 * is idempotent. Also migrate pages in each mm to new nodes.
2783 */
2784 css_task_iter_start(css: &cs->css, flags: 0, it: &it);
2785 while ((task = css_task_iter_next(it: &it))) {
2786 struct mm_struct *mm;
2787 bool migrate;
2788
2789 cpuset_change_task_nodemask(tsk: task, newmems: &newmems);
2790
2791 mm = get_task_mm(task);
2792 if (!mm)
2793 continue;
2794
2795 migrate = is_memory_migrate(cs);
2796
2797 mpol_rebind_mm(mm, new: &cs->mems_allowed);
2798 if (migrate)
2799 cpuset_migrate_mm(mm, from: &cs->old_mems_allowed, to: &newmems);
2800 else
2801 mmput(mm);
2802 }
2803 css_task_iter_end(it: &it);
2804
2805 /*
2806 * All the tasks' nodemasks have been updated, update
2807 * cs->old_mems_allowed.
2808 */
2809 cs->old_mems_allowed = newmems;
2810
2811 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
2812 cpuset_being_rebound = NULL;
2813}
2814
/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
2827static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
2828{
2829 struct cpuset *cp;
2830 struct cgroup_subsys_state *pos_css;
2831
2832 rcu_read_lock();
2833 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
2834 struct cpuset *parent = parent_cs(cs: cp);
2835
2836 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
2837
2838 /*
2839 * If it becomes empty, inherit the effective mask of the
2840 * parent, which is guaranteed to have some MEMs.
2841 */
2842 if (is_in_v2_mode() && nodes_empty(*new_mems))
2843 *new_mems = parent->effective_mems;
2844
2845 /* Skip the whole subtree if the nodemask remains the same. */
2846 if (nodes_equal(*new_mems, cp->effective_mems)) {
2847 pos_css = css_rightmost_descendant(pos: pos_css);
2848 continue;
2849 }
2850
2851 if (!css_tryget_online(css: &cp->css))
2852 continue;
2853 rcu_read_unlock();
2854
2855 spin_lock_irq(lock: &callback_lock);
2856 cp->effective_mems = *new_mems;
2857 spin_unlock_irq(lock: &callback_lock);
2858
2859 WARN_ON(!is_in_v2_mode() &&
2860 !nodes_equal(cp->mems_allowed, cp->effective_mems));
2861
2862 update_tasks_nodemask(cs: cp);
2863
2864 rcu_read_lock();
2865 css_put(css: &cp->css);
2866 }
2867 rcu_read_unlock();
2868}
2869
/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset. Needs to validate the request, update the
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies and, if the cpuset is marked 'memory_migrate',
 * migrate the task's pages to the new memory.
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_lock, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */
2883static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
2884 const char *buf)
2885{
2886 int retval;
2887
	/*
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
	 * it's read-only
	 */
2892 if (cs == &top_cpuset) {
2893 retval = -EACCES;
2894 goto done;
2895 }
2896
2897 /*
2898 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
2899 * Since nodelist_parse() fails on an empty mask, we special case
2900 * that parsing. The validate_change() call ensures that cpusets
2901 * with tasks have memory.
2902 */
2903 if (!*buf) {
2904 nodes_clear(trialcs->mems_allowed);
2905 } else {
2906 retval = nodelist_parse(buf, trialcs->mems_allowed);
2907 if (retval < 0)
2908 goto done;
2909
2910 if (!nodes_subset(trialcs->mems_allowed,
2911 top_cpuset.mems_allowed)) {
2912 retval = -EINVAL;
2913 goto done;
2914 }
2915 }
2916
2917 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
2918 retval = 0; /* Too easy - nothing to do */
2919 goto done;
2920 }
2921 retval = validate_change(cur: cs, trial: trialcs);
2922 if (retval < 0)
2923 goto done;
2924
2925 check_insane_mems_config(nodes: &trialcs->mems_allowed);
2926
2927 spin_lock_irq(lock: &callback_lock);
2928 cs->mems_allowed = trialcs->mems_allowed;
2929 spin_unlock_irq(lock: &callback_lock);
2930
2931 /* use trialcs->mems_allowed as a temp variable */
2932 update_nodemasks_hier(cs, new_mems: &trialcs->mems_allowed);
2933done:
2934 return retval;
2935}
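
/*
 * Userspace reaches update_nodemask() by writing a nodelist to the
 * corresponding cgroup file (mount point and group name illustrative):
 *
 *	echo 0-1 > /sys/fs/cgroup/grp/cpuset.mems
 */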
2936
bool current_cpuset_is_being_rebound(void)
{
	bool ret;

	rcu_read_lock();
	ret = task_cs(current) == cpuset_being_rebound;
	rcu_read_unlock();

	return ret;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}
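
/*
 * This backs the v1-only cpuset.sched_relax_domain_level file. A hedged
 * sketch of its use (legacy hierarchy mounted at /dev/cpuset, group
 * name illustrative; see Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for the exact level meanings):
 *
 *	echo -1 > /dev/cpuset/grp/cpuset.sched_relax_domain_level
 *
 * restores the system default, while small positive values widen the
 * range searched for an idle CPU when waking up or balancing a task
 * (e.g. 1 for SMT siblings, 2 for cores in a package).
 */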
2964
2965/**
2966 * update_tasks_flags - update the spread flags of tasks in the cpuset.
2967 * @cs: the cpuset in which each task's spread flags needs to be changed
2968 *
2969 * Iterate through each task of @cs updating its spread flags. As this
2970 * function is called with cpuset_mutex held, cpuset membership stays
2971 * stable.
2972 */
2973static void update_tasks_flags(struct cpuset *cs)
2974{
2975 struct css_task_iter it;
2976 struct task_struct *task;
2977
2978 css_task_iter_start(css: &cs->css, flags: 0, it: &it);
2979 while ((task = css_task_iter_next(it: &it)))
2980 cpuset_update_task_spread_flags(cs, tsk: task);
2981 css_task_iter_end(it: &it);
2982}
2983
2984/*
2985 * update_flag - read a 0 or a 1 in a file and update associated flag
2986 * bit: the bit to update (see cpuset_flagbits_t)
2987 * cs: the cpuset to update
2988 * turning_on: whether the flag is being set or cleared
2989 *
2990 * Call with cpuset_mutex held.
2991 */
2992
2993static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
2994 int turning_on)
2995{
2996 struct cpuset *trialcs;
2997 int balance_flag_changed;
2998 int spread_flag_changed;
2999 int err;
3000
3001 trialcs = alloc_trial_cpuset(cs);
3002 if (!trialcs)
3003 return -ENOMEM;
3004
3005 if (turning_on)
3006 set_bit(nr: bit, addr: &trialcs->flags);
3007 else
3008 clear_bit(nr: bit, addr: &trialcs->flags);
3009
3010 err = validate_change(cur: cs, trial: trialcs);
3011 if (err < 0)
3012 goto out;
3013
3014 balance_flag_changed = (is_sched_load_balance(cs) !=
3015 is_sched_load_balance(cs: trialcs));
3016
3017 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(cs: trialcs))
3018 || (is_spread_page(cs) != is_spread_page(cs: trialcs)));
3019
3020 spin_lock_irq(lock: &callback_lock);
3021 cs->flags = trialcs->flags;
3022 spin_unlock_irq(lock: &callback_lock);
3023
3024 if (!cpumask_empty(srcp: trialcs->cpus_allowed) && balance_flag_changed)
3025 rebuild_sched_domains_locked();
3026
3027 if (spread_flag_changed)
3028 update_tasks_flags(cs);
3029out:
3030 free_cpuset(cs: trialcs);
3031 return err;
3032}
3033
3034/**
3035 * update_prstate - update partition_root_state
3036 * @cs: the cpuset to update
3037 * @new_prs: new partition root state
3038 * Return: 0 if successful, != 0 if error
3039 *
3040 * Call with cpuset_mutex held.
3041 */
3042static int update_prstate(struct cpuset *cs, int new_prs)
3043{
3044 int err = PERR_NONE, old_prs = cs->partition_root_state;
3045 struct cpuset *parent = parent_cs(cs);
3046 struct tmpmasks tmpmask;
3047 bool new_xcpus_state = false;
3048
3049 if (old_prs == new_prs)
3050 return 0;
3051
3052 /*
3053 * Treat a previously invalid partition root as if it is a "member".
3054 */
3055 if (new_prs && is_prs_invalid(prs_state: old_prs))
3056 old_prs = PRS_MEMBER;
3057
3058 if (alloc_cpumasks(NULL, tmp: &tmpmask))
3059 return -ENOMEM;
3060
	/*
	 * Set up effective_xcpus if not properly set yet; it will be
	 * cleared later if the partition becomes invalid.
	 */
3065 if ((new_prs > 0) && cpumask_empty(srcp: cs->exclusive_cpus)) {
3066 spin_lock_irq(lock: &callback_lock);
3067 cpumask_and(dstp: cs->effective_xcpus,
3068 src1p: cs->cpus_allowed, src2p: parent->effective_xcpus);
3069 spin_unlock_irq(lock: &callback_lock);
3070 }
3071
3072 err = update_partition_exclusive(cs, new_prs);
3073 if (err)
3074 goto out;
3075
3076 if (!old_prs) {
3077 enum partition_cmd cmd = (new_prs == PRS_ROOT)
3078 ? partcmd_enable : partcmd_enablei;
3079
3080 /*
3081 * cpus_allowed cannot be empty.
3082 */
3083 if (cpumask_empty(srcp: cs->cpus_allowed)) {
3084 err = PERR_CPUSEMPTY;
3085 goto out;
3086 }
3087
3088 err = update_parent_effective_cpumask(cs, cmd, NULL, tmp: &tmpmask);
3089 /*
3090 * If an attempt to become local partition root fails,
3091 * try to become a remote partition root instead.
3092 */
3093 if (err && remote_partition_enable(cs, new_prs, tmp: &tmpmask))
3094 err = 0;
3095 } else if (old_prs && new_prs) {
3096 /*
3097 * A change in load balance state only, no change in cpumasks.
3098 */
3099 new_xcpus_state = true;
3100 } else {
3101 /*
3102 * Switching back to member is always allowed even if it
3103 * disables child partitions.
3104 */
3105 if (is_remote_partition(cs))
3106 remote_partition_disable(cs, tmp: &tmpmask);
3107 else
3108 update_parent_effective_cpumask(cs, cmd: partcmd_disable,
3109 NULL, tmp: &tmpmask);
3110
3111 /*
3112 * Invalidation of child partitions will be done in
3113 * update_cpumasks_hier().
3114 */
3115 }
3116out:
3117 /*
3118 * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error
3119 * happens.
3120 */
3121 if (err) {
3122 new_prs = -new_prs;
3123 update_partition_exclusive(cs, new_prs);
3124 }
3125
3126 spin_lock_irq(lock: &callback_lock);
3127 cs->partition_root_state = new_prs;
3128 WRITE_ONCE(cs->prs_err, err);
3129 if (!is_partition_valid(cs))
3130 reset_partition_data(cs);
3131 else if (new_xcpus_state)
3132 partition_xcpus_newstate(old_prs, new_prs, xcpus: cs->effective_xcpus);
3133 spin_unlock_irq(lock: &callback_lock);
3134 update_unbound_workqueue_cpumask(isolcpus_updated: new_xcpus_state);
3135
3136 /* Force update if switching back to member */
3137 update_cpumasks_hier(cs, tmp: &tmpmask, flags: !new_prs ? HIER_CHECKALL : 0);
3138
3139 /* Update sched domains and load balance flag */
3140 update_partition_sd_lb(cs, old_prs);
3141
3142 notify_partition_change(cs, old_prs);
3143 free_cpumasks(NULL, tmp: &tmpmask);
3144 return 0;
3145}
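
/*
 * From userspace, partition root state changes funnel into
 * update_prstate() through the cpuset.cpus.partition file (cgroup v2;
 * paths illustrative):
 *
 *	echo root > /sys/fs/cgroup/grp/cpuset.cpus.partition
 *	echo isolated > /sys/fs/cgroup/grp/cpuset.cpus.partition
 *	echo member > /sys/fs/cgroup/grp/cpuset.cpus.partition
 *
 * Reading the file back on an invalid partition reports
 * "root invalid (<reason>)", with the reason drawn from the
 * perr_strings[] table near the top of this file.
 */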
3146
3147/*
3148 * Frequency meter - How fast is some event occurring?
3149 *
3150 * These routines manage a digitally filtered, constant time based,
3151 * event frequency meter. There are four routines:
3152 * fmeter_init() - initialize a frequency meter.
3153 * fmeter_markevent() - called each time the event happens.
3154 * fmeter_getrate() - returns the recent rate of such events.
3155 * fmeter_update() - internal routine used to update fmeter.
3156 *
3157 * A common data structure is passed to each of these routines,
3158 * which is used to keep track of the state required to manage the
3159 * frequency meter and its digital filter.
3160 *
3161 * The filter works on the number of events marked per unit time.
3162 * The filter is single-pole low-pass recursive (IIR). The time unit
3163 * is 1 second. Arithmetic is done using 32-bit integers scaled to
3164 * simulate 3 decimal digits of precision (multiplied by 1000).
3165 *
3166 * With an FM_COEF of 933, and a time base of 1 second, the filter
3167 * has a half-life of 10 seconds, meaning that if the events quit
3168 * happening, then the rate returned from the fmeter_getrate()
3169 * will be cut in half each 10 seconds, until it converges to zero.
3170 *
3171 * It is not worth doing a real infinitely recursive filter. If more
3172 * than FM_MAXTICKS ticks have elapsed since the last filter event,
3173 * just compute FM_MAXTICKS ticks worth, by which point the level
3174 * will be stable.
3175 *
3176 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
3177 * arithmetic overflow in the fmeter_update() routine.
3178 *
3179 * Given the simple 32 bit integer arithmetic used, this meter works
3180 * best for reporting rates between one per millisecond (msec) and
3181 * one per 32 (approx) seconds. At constant rates faster than one
3182 * per msec it maxes out at values just under 1,000,000. At constant
3183 * rates between one per msec, and one per second it will stabilize
3184 * to a value N*1000, where N is the rate of events per second.
3185 * At constant rates between one per second and one per 32 seconds,
3186 * it will be choppy, moving up on the seconds that have an event,
3187 * and then decaying until the next event. At rates slower than
3188 * about one in 32 seconds, it decays all the way back to zero between
3189 * each event.
3190 */
3191
3192#define FM_COEF 933 /* coefficient for half-life of 10 secs */
3193#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
3194#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
3195#define FM_SCALE 1000 /* faux fixed point scale */
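
/*
 * Worked example, derivable from the constants above: at a steady rate
 * of one event per second, each one-second tick of fmeter_update()
 * computes
 *
 *	val = (933 * val) / 1000 + ((1000 - 933) * 1000) / 1000
 *
 * whose fixed point is val = 67 / 0.067 = 1000, i.e. N * FM_SCALE for
 * N = 1 event/sec. With no further events, ten ticks scale val by
 * 0.933^10 ~= 0.50, which is the documented 10 second half-life.
 */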
3196
/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time64_t now;
	u32 ticks;

	now = ktime_get_seconds();
	ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}
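
/*
 * Putting the pieces together, a hedged usage sketch (the real callers
 * are the memory pressure accounting and read handlers elsewhere in
 * this file; "fmp" stands for a cpuset's embedded frequency meter):
 *
 *	fmeter_markevent(fmp);		// once per reclaim event
 *	...
 *	rate = fmeter_getrate(fmp);	// filtered events/sec * FM_SCALE
 */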
3247
3248static struct cpuset *cpuset_attach_old_cs;
3249
3250/*
3251 * Check to see if a cpuset can accept a new task
3252 * For v1, cpus_allowed and mems_allowed can't be empty.
3253 * For v2, effective_cpus can't be empty.
3254 * Note that in v1, effective_cpus = cpus_allowed.
3255 */
3256static int cpuset_can_attach_check(struct cpuset *cs)
3257{
3258 if (cpumask_empty(srcp: cs->effective_cpus) ||
3259 (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
3260 return -ENOSPC;
3261 return 0;
3262}
3263
3264static void reset_migrate_dl_data(struct cpuset *cs)
3265{
3266 cs->nr_migrate_dl_tasks = 0;
3267 cs->sum_migrate_dl_bw = 0;
3268}
3269
3270/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
3271static int cpuset_can_attach(struct cgroup_taskset *tset)
3272{
3273 struct cgroup_subsys_state *css;
3274 struct cpuset *cs, *oldcs;
3275 struct task_struct *task;
3276 bool cpus_updated, mems_updated;
3277 int ret;
3278
3279 /* used later by cpuset_attach() */
3280 cpuset_attach_old_cs = task_cs(task: cgroup_taskset_first(tset, dst_cssp: &css));
3281 oldcs = cpuset_attach_old_cs;
3282 cs = css_cs(css);
3283
3284 mutex_lock(&cpuset_mutex);
3285
3286 /* Check to see if task is allowed in the cpuset */
3287 ret = cpuset_can_attach_check(cs);
3288 if (ret)
3289 goto out_unlock;
3290
3291 cpus_updated = !cpumask_equal(src1p: cs->effective_cpus, src2p: oldcs->effective_cpus);
3292 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3293
3294 cgroup_taskset_for_each(task, css, tset) {
3295 ret = task_can_attach(p: task);
3296 if (ret)
3297 goto out_unlock;
3298
		/*
		 * Skip rights over task check in v2 when nothing changes,
		 * migration permission derives from hierarchy ownership in
		 * cgroup_procs_write_permission().
		 */
3304 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
3305 (cpus_updated || mems_updated)) {
3306 ret = security_task_setscheduler(p: task);
3307 if (ret)
3308 goto out_unlock;
3309 }
3310
3311 if (dl_task(p: task)) {
3312 cs->nr_migrate_dl_tasks++;
3313 cs->sum_migrate_dl_bw += task->dl.dl_bw;
3314 }
3315 }
3316
3317 if (!cs->nr_migrate_dl_tasks)
3318 goto out_success;
3319
3320 if (!cpumask_intersects(src1p: oldcs->effective_cpus, src2p: cs->effective_cpus)) {
3321 int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
3322
3323 if (unlikely(cpu >= nr_cpu_ids)) {
3324 reset_migrate_dl_data(cs);
3325 ret = -EINVAL;
3326 goto out_unlock;
3327 }
3328
3329 ret = dl_bw_alloc(cpu, dl_bw: cs->sum_migrate_dl_bw);
3330 if (ret) {
3331 reset_migrate_dl_data(cs);
3332 goto out_unlock;
3333 }
3334 }
3335
3336out_success:
	/*
	 * Mark that attach is in progress. This makes validate_change()
	 * fail changes which zero cpus/mems_allowed.
	 */
3341 cs->attach_in_progress++;
3342out_unlock:
3343 mutex_unlock(lock: &cpuset_mutex);
3344 return ret;
3345}
3346
3347static void cpuset_cancel_attach(struct cgroup_taskset *tset)
3348{
3349 struct cgroup_subsys_state *css;
3350 struct cpuset *cs;
3351
3352 cgroup_taskset_first(tset, dst_cssp: &css);
3353 cs = css_cs(css);
3354
3355 mutex_lock(&cpuset_mutex);
3356 cs->attach_in_progress--;
3357 if (!cs->attach_in_progress)
3358 wake_up(&cpuset_attach_wq);
3359
3360 if (cs->nr_migrate_dl_tasks) {
3361 int cpu = cpumask_any(cs->effective_cpus);
3362
3363 dl_bw_free(cpu, dl_bw: cs->sum_migrate_dl_bw);
3364 reset_migrate_dl_data(cs);
3365 }
3366
3367 mutex_unlock(lock: &cpuset_mutex);
3368}
3369
/*
 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
 * but we can't allocate it dynamically there. Define it globally and
 * allocate it from cpuset_init().
 */
3375static cpumask_var_t cpus_attach;
3376static nodemask_t cpuset_attach_nodemask_to;
3377
3378static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
3379{
3380 lockdep_assert_held(&cpuset_mutex);
3381
3382 if (cs != &top_cpuset)
3383 guarantee_online_cpus(tsk: task, pmask: cpus_attach);
3384 else
3385 cpumask_andnot(dstp: cpus_attach, task_cpu_possible_mask(task),
3386 src2p: subpartitions_cpus);
3387 /*
3388 * can_attach beforehand should guarantee that this doesn't
3389 * fail. TODO: have a better way to handle failure here
3390 */
3391 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
3392
3393 cpuset_change_task_nodemask(tsk: task, newmems: &cpuset_attach_nodemask_to);
3394 cpuset_update_task_spread_flags(cs, tsk: task);
3395}
3396
3397static void cpuset_attach(struct cgroup_taskset *tset)
3398{
3399 struct task_struct *task;
3400 struct task_struct *leader;
3401 struct cgroup_subsys_state *css;
3402 struct cpuset *cs;
3403 struct cpuset *oldcs = cpuset_attach_old_cs;
3404 bool cpus_updated, mems_updated;
3405
3406 cgroup_taskset_first(tset, dst_cssp: &css);
3407 cs = css_cs(css);
3408
3409 lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
3410 mutex_lock(&cpuset_mutex);
3411 cpus_updated = !cpumask_equal(src1p: cs->effective_cpus,
3412 src2p: oldcs->effective_cpus);
3413 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
3414
3415 /*
3416 * In the default hierarchy, enabling cpuset in the child cgroups
3417 * will trigger a number of cpuset_attach() calls with no change
3418 * in effective cpus and mems. In that case, we can optimize out
3419 * by skipping the task iteration and update.
3420 */
3421 if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
3422 !cpus_updated && !mems_updated) {
3423 cpuset_attach_nodemask_to = cs->effective_mems;
3424 goto out;
3425 }
3426
3427 guarantee_online_mems(cs, pmask: &cpuset_attach_nodemask_to);
3428
3429 cgroup_taskset_for_each(task, css, tset)
3430 cpuset_attach_task(cs, task);
3431
3432 /*
3433 * Change mm for all threadgroup leaders. This is expensive and may
3434 * sleep and should be moved outside migration path proper. Skip it
3435 * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
3436 * not set.
3437 */
3438 cpuset_attach_nodemask_to = cs->effective_mems;
3439 if (!is_memory_migrate(cs) && !mems_updated)
3440 goto out;
3441
3442 cgroup_taskset_for_each_leader(leader, css, tset) {
3443 struct mm_struct *mm = get_task_mm(task: leader);
3444
3445 if (mm) {
3446 mpol_rebind_mm(mm, new: &cpuset_attach_nodemask_to);
3447
			/*
			 * old_mems_allowed is the same as mems_allowed
			 * here, except if this task is being moved
			 * automatically due to hotplug. In that case
			 * @mems_allowed has been updated and is empty, so
			 * @old_mems_allowed is the right nodeset that we
			 * migrate mm from.
			 */
3456 if (is_memory_migrate(cs))
3457 cpuset_migrate_mm(mm, from: &oldcs->old_mems_allowed,
3458 to: &cpuset_attach_nodemask_to);
3459 else
3460 mmput(mm);
3461 }
3462 }
3463
3464out:
3465 cs->old_mems_allowed = cpuset_attach_nodemask_to;
3466
3467 if (cs->nr_migrate_dl_tasks) {
3468 cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
3469 oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
3470 reset_migrate_dl_data(cs);
3471 }
3472
3473 cs->attach_in_progress--;
3474 if (!cs->attach_in_progress)
3475 wake_up(&cpuset_attach_wq);
3476
3477 mutex_unlock(lock: &cpuset_mutex);
3478}
3479
3480/* The various types of files and directories in a cpuset file system */
3481
3482typedef enum {
3483 FILE_MEMORY_MIGRATE,
3484 FILE_CPULIST,
3485 FILE_MEMLIST,
3486 FILE_EFFECTIVE_CPULIST,
3487 FILE_EFFECTIVE_MEMLIST,
3488 FILE_SUBPARTS_CPULIST,
3489 FILE_EXCLUSIVE_CPULIST,
3490 FILE_EFFECTIVE_XCPULIST,
3491 FILE_ISOLATED_CPULIST,
3492 FILE_CPU_EXCLUSIVE,
3493 FILE_MEM_EXCLUSIVE,
3494 FILE_MEM_HARDWALL,
3495 FILE_SCHED_LOAD_BALANCE,
3496 FILE_PARTITION_ROOT,
3497 FILE_SCHED_RELAX_DOMAIN_LEVEL,
3498 FILE_MEMORY_PRESSURE_ENABLED,
3499 FILE_MEMORY_PRESSURE,
3500 FILE_SPREAD_PAGE,
3501 FILE_SPREAD_SLAB,
3502} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
	return retval;
}
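
/*
 * Usage sketch (illustrative comment, not part of the original file):
 * the boolean files handled above take "0"/"1" on a legacy (v1) mount:
 *
 *	# mount -t cgroup -o cpuset none /sys/fs/cgroup/cpuset
 *	# echo 1 > /sys/fs/cgroup/cpuset/A/cpuset.memory_migrate
 *	# cat /sys/fs/cgroup/cpuset/A/cpuset.cpu_exclusive
 */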

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = -ENODEV;

	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	struct cpuset *trialcs;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up repeatedly removing tasks
	 * that were added after execution capability was restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection. Let's break the protection. Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_mutex anyway. This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_EXCLUSIVE_CPULIST:
		retval = update_exclusive_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_cpuset(trialcs);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	flush_workqueue(cpuset_migrate_mm_wq);
	return retval ?: nbytes;
}
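
/*
 * Usage sketch (illustrative comment, not part of the original file):
 * the masks are written in the usual cpulist/nodelist format, e.g. on
 * the default hierarchy:
 *
 *	# echo "0-3,8" > /sys/fs/cgroup/A/cpuset.cpus
 *	# echo "0-1" > /sys/fs/cgroup/A/cpuset.mems
 */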

/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map. If read in smaller
 * chunks, there is no guarantee of atomicity. Since the display format
 * used, a list of ranges of sequential numbers, is variable length,
 * and since these maps can change dynamically, one could read
 * gibberish by doing partial reads while a list is changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	int ret = 0;

	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
		break;
	case FILE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
		break;
	case FILE_EFFECTIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
		break;
	case FILE_EFFECTIVE_MEMLIST:
		seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
		break;
	case FILE_EXCLUSIVE_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
		break;
	case FILE_EFFECTIVE_XCPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
		break;
	case FILE_SUBPARTS_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
		break;
	case FILE_ISOLATED_CPULIST:
		seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
		break;
	default:
		ret = -EINVAL;
	}

	spin_unlock_irq(&callback_lock);
	return ret;
}
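
/*
 * Illustrative user-space reader (a sketch, not part of the original
 * file): to honor the single-read requirement above, size the buffer to
 * hold the entire list and fetch it with one read(2) call. The path is
 * an example.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		char buf[4096];		// big enough for the whole list
 *		ssize_t n;
 *		int fd = open("/sys/fs/cgroup/A/cpuset.cpus.effective",
 *			      O_RDONLY);
 *
 *		if (fd < 0)
 *			return 1;
 *		n = read(fd, buf, sizeof(buf) - 1);	// single call
 *		close(fd);
 *		if (n < 0)
 *			return 1;
 *		buf[n] = '\0';
 *		fputs(buf, stdout);
 *		return 0;
 *	}
 */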

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static int sched_partition_show(struct seq_file *seq, void *v)
{
	struct cpuset *cs = css_cs(seq_css(seq));
	const char *err, *type = NULL;

	switch (cs->partition_root_state) {
	case PRS_ROOT:
		seq_puts(seq, "root\n");
		break;
	case PRS_ISOLATED:
		seq_puts(seq, "isolated\n");
		break;
	case PRS_MEMBER:
		seq_puts(seq, "member\n");
		break;
	case PRS_INVALID_ROOT:
		type = "root";
		fallthrough;
	case PRS_INVALID_ISOLATED:
		if (!type)
			type = "isolated";
		err = perr_strings[READ_ONCE(cs->prs_err)];
		if (err)
			seq_printf(seq, "%s invalid (%s)\n", type, err);
		else
			seq_printf(seq, "%s invalid\n", type);
		break;
	}
	return 0;
}

static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
				     size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	int val;
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * Map the strings "root", "member" and "isolated" to their
	 * partition_root_state values.
	 */
	if (!strcmp(buf, "root"))
		val = PRS_ROOT;
	else if (!strcmp(buf, "member"))
		val = PRS_MEMBER;
	else if (!strcmp(buf, "isolated"))
		val = PRS_ISOLATED;
	else
		return -EINVAL;

	css_get(&cs->css);
	cpus_read_lock();
	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	retval = update_prstate(cs, val);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
	css_put(&cs->css);
	return retval ?: nbytes;
}
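
/*
 * Usage sketch (illustrative comment, not part of the original file):
 * a partition is managed from user space through these strings, e.g.
 *
 *	# echo "2-5" > /sys/fs/cgroup/A/cpuset.cpus
 *	# echo root > /sys/fs/cgroup/A/cpuset.cpus.partition
 *	# cat /sys/fs/cgroup/A/cpuset.cpus.partition
 *	root
 *
 * An invalid partition instead reads back as, e.g.,
 * "root invalid (Cpu list in cpuset.cpus not exclusive)".
 */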

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype legacy_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "effective_cpus",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "effective_mems",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.private = FILE_MEMORY_PRESSURE,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		/* obsolete, may be removed in the future */
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},

	{
		.name = "memory_pressure_enabled",
		.flags = CFTYPE_ONLY_ON_ROOT,
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE_ENABLED,
	},

	{ }	/* terminate */
};
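
/*
 * Note (illustrative, not part of the original file): on a legacy (v1)
 * hierarchy, cgroup core prefixes each file name above with the
 * controller name, so "cpus" shows up as "cpuset.cpus",
 * "memory_migrate" as "cpuset.memory_migrate", and so on.
 */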

/*
 * This is currently a minimal set for the default hierarchy. It can be
 * expanded later on by migrating more features and control files from v1.
 */
static struct cftype dfl_files[] = {
	{
		.name = "cpus",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "mems",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "cpus.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_CPULIST,
	},

	{
		.name = "mems.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_MEMLIST,
	},

	{
		.name = "cpus.partition",
		.seq_show = sched_partition_show,
		.write = sched_partition_write,
		.private = FILE_PARTITION_ROOT,
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct cpuset, partition_file),
	},

	{
		.name = "cpus.exclusive",
		.seq_show = cpuset_common_seq_show,
		.write = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_EXCLUSIVE_CPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "cpus.exclusive.effective",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_EFFECTIVE_XCPULIST,
		.flags = CFTYPE_NOT_ON_ROOT,
	},

	{
		.name = "cpus.subpartitions",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_SUBPARTS_CPULIST,
		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
	},

	{
		.name = "cpus.isolated",
		.seq_show = cpuset_common_seq_show,
		.private = FILE_ISOLATED_CPULIST,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},

	{ }	/* terminate */
};


/**
 * cpuset_css_alloc - Allocate a cpuset css
 * @parent_css: Parent css of the control group that the new cpuset will be
 *              part of
 * Return: cpuset css on success, -ENOMEM on failure.
 *
 * Allocate and initialize a new cpuset css for non-NULL @parent_css; return
 * the top cpuset css otherwise.
 */
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cpuset *cs;

	if (!parent_css)
		return &top_cpuset.css;

	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);

	if (alloc_cpumasks(cs, NULL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	nodes_clear(cs->mems_allowed);
	nodes_clear(cs->effective_mems);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;
	INIT_LIST_HEAD(&cs->remote_sibling);

	/* Set CS_MEMORY_MIGRATE for default hierarchy */
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
		__set_bit(CS_MEMORY_MIGRATE, &cs->flags);

	return &cs->css;
}

static int cpuset_css_online(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup_subsys_state *pos_css;

	if (!parent)
		return 0;

	cpus_read_lock();
	mutex_lock(&cpuset_mutex);

	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	cpuset_inc();

	spin_lock_irq(&callback_lock);
	if (is_in_v2_mode()) {
		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
		cs->effective_mems = parent->effective_mems;
		cs->use_parent_ecpus = true;
		parent->child_ecpus_count++;
		/*
		 * Clear CS_SCHED_LOAD_BALANCE if parent is isolated
		 */
		if (!is_sched_load_balance(parent))
			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	}

	/*
	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
	 */
	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    !is_sched_load_balance(parent))
		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);

	spin_unlock_irq(&callback_lock);

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
		goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set. This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing to admit
	 * the task, and as a result failing the sys_unshare() or clone()
	 * which initiated it. If this becomes a problem for some users
	 * who wish to allow that scenario, then this could be changed to
	 * grant parent->cpus_allowed-sibling_cpus_exclusive (and likewise
	 * for mems) to the new cgroup.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_css, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	spin_lock_irq(&callback_lock);
	cs->mems_allowed = parent->mems_allowed;
	cs->effective_mems = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
	spin_unlock_irq(&callback_lock);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
	return 0;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked(). That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate
 * turning 'sched.partition' off.
 */

static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	cpus_read_lock();
	mutex_lock(&cpuset_mutex);

	if (is_partition_valid(cs))
		update_prstate(cs, 0);

	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
	    is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	if (cs->use_parent_ecpus) {
		struct cpuset *parent = parent_cs(cs);

		cs->use_parent_ecpus = false;
		parent->child_ecpus_count--;
	}

	cpuset_dec();
	clear_bit(CS_ONLINE, &cs->flags);

	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
}

static void cpuset_css_free(struct cgroup_subsys_state *css)
{
	struct cpuset *cs = css_cs(css);

	free_cpuset(cs);
}

static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
	mutex_lock(&cpuset_mutex);
	spin_lock_irq(&callback_lock);

	if (is_in_v2_mode()) {
		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
		cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
		top_cpuset.mems_allowed = node_possible_map;
	} else {
		cpumask_copy(top_cpuset.cpus_allowed,
			     top_cpuset.effective_cpus);
		top_cpuset.mems_allowed = top_cpuset.effective_mems;
	}

	spin_unlock_irq(&callback_lock);
	mutex_unlock(&cpuset_mutex);
}

/*
 * In case the child is cloned into a cpuset different from its parent,
 * additional checks are done to see if the move is allowed.
 */
static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
{
	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
	bool same_cs;
	int ret;

	rcu_read_lock();
	same_cs = (cs == task_cs(current));
	rcu_read_unlock();

	if (same_cs)
		return 0;

	lockdep_assert_held(&cgroup_mutex);
	mutex_lock(&cpuset_mutex);

	/* Check to see if task is allowed in the cpuset */
	ret = cpuset_can_attach_check(cs);
	if (ret)
		goto out_unlock;

	ret = task_can_attach(task);
	if (ret)
		goto out_unlock;

	ret = security_task_setscheduler(task);
	if (ret)
		goto out_unlock;

	/*
	 * Mark attach is in progress. This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return ret;
}

static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
{
	struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
	bool same_cs;

	rcu_read_lock();
	same_cs = (cs == task_cs(current));
	rcu_read_unlock();

	if (same_cs)
		return;

	mutex_lock(&cpuset_mutex);
	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);
	mutex_unlock(&cpuset_mutex);
}

/*
 * Make sure the new task conforms to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */
static void cpuset_fork(struct task_struct *task)
{
	struct cpuset *cs;
	bool same_cs;

	rcu_read_lock();
	cs = task_cs(task);
	same_cs = (cs == task_cs(current));
	rcu_read_unlock();

	if (same_cs) {
		if (cs == &top_cpuset)
			return;

		set_cpus_allowed_ptr(task, current->cpus_ptr);
		task->mems_allowed = current->mems_allowed;
		return;
	}

	/* CLONE_INTO_CGROUP */
	mutex_lock(&cpuset_mutex);
	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
	cpuset_attach_task(cs, task);

	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	mutex_unlock(&cpuset_mutex);
}

struct cgroup_subsys cpuset_cgrp_subsys = {
	.css_alloc = cpuset_css_alloc,
	.css_online = cpuset_css_online,
	.css_offline = cpuset_css_offline,
	.css_free = cpuset_css_free,
	.can_attach = cpuset_can_attach,
	.cancel_attach = cpuset_cancel_attach,
	.attach = cpuset_attach,
	.post_attach = cpuset_post_attach,
	.bind = cpuset_bind,
	.can_fork = cpuset_can_fork,
	.cancel_fork = cpuset_cancel_fork,
	.fork = cpuset_fork,
	.legacy_cftypes = legacy_files,
	.dfl_cftypes = dfl_files,
	.early_init = true,
	.threaded = true,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset
 **/

int __init cpuset_init(void)
{
	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
	BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
	BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
	BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));

	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);
	cpumask_setall(top_cpuset.effective_cpus);
	cpumask_setall(top_cpuset.effective_xcpus);
	cpumask_setall(top_cpuset.exclusive_cpus);
	nodes_setall(top_cpuset.effective_mems);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;
	INIT_LIST_HEAD(&remote_children);

	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));

	return 0;
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets. If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so it can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
	       nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
	}
}

static void
hotplug_update_tasks_legacy(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
			    bool cpus_updated, bool mems_updated)
{
	bool is_empty;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, new_cpus);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->mems_allowed = *new_mems;
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	/*
	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
	 * as the tasks will be migrated to an ancestor.
	 */
	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
		update_tasks_cpumask(cs, new_cpus);
	if (mems_updated && !nodes_empty(cs->mems_allowed))
		update_tasks_nodemask(cs);

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		   nodes_empty(cs->mems_allowed);

	/*
	 * Move tasks to the nearest ancestor with execution resources.
	 * This is a full cgroup operation which will also call back into
	 * cpuset, so it should be done outside any lock.
	 */
	if (is_empty) {
		mutex_unlock(&cpuset_mutex);
		remove_tasks_in_empty_cpuset(cs);
		mutex_lock(&cpuset_mutex);
	}
}

static void
hotplug_update_tasks(struct cpuset *cs,
		     struct cpumask *new_cpus, nodemask_t *new_mems,
		     bool cpus_updated, bool mems_updated)
{
	/* A partition root is allowed to have empty effective cpus */
	if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
	if (nodes_empty(*new_mems))
		*new_mems = parent_cs(cs)->effective_mems;

	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->effective_cpus, new_cpus);
	cs->effective_mems = *new_mems;
	spin_unlock_irq(&callback_lock);

	if (cpus_updated)
		update_tasks_cpumask(cs, new_cpus);
	if (mems_updated)
		update_tasks_nodemask(cs);
}

static bool force_rebuild;

void cpuset_force_rebuild(void)
{
	force_rebuild = true;
}

/*
 * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
 * progress.
 * Return: true if successful, false otherwise
 *
 * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
 * cpus_read_trylock() is used here to acquire the lock.
 */
static bool cpuset_hotplug_cpus_read_trylock(void)
{
	int retries = 0;

	while (!cpus_read_trylock()) {
		/*
		 * CPU hotplug still in progress. Retry 5 times
		 * with a 10ms wait before bailing out.
		 */
		if (++retries > 5)
			return false;
		msleep(10);
	}
	return true;
}

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated;
	bool mems_updated;
	bool remote;
	int partcmd = -1;
	struct cpuset *parent;
retry:
	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);

	mutex_lock(&cpuset_mutex);

	/*
	 * We have raced with task attaching. We wait until attaching
	 * is finished, so we won't attach a task to an empty cpuset.
	 */
	if (cs->attach_in_progress) {
		mutex_unlock(&cpuset_mutex);
		goto retry;
	}

	parent = parent_cs(cs);
	compute_effective_cpumask(&new_cpus, cs, parent);
	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);

	if (!tmp || !cs->partition_root_state)
		goto update_tasks;

	/*
	 * Compute effective_cpus for valid partition root, may invalidate
	 * child partition roots if necessary.
	 */
	remote = is_remote_partition(cs);
	if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
		compute_partition_effective_cpumask(cs, &new_cpus);

	if (remote && cpumask_empty(&new_cpus) &&
	    partition_is_populated(cs, NULL) &&
	    cpuset_hotplug_cpus_read_trylock()) {
		remote_partition_disable(cs, tmp);
		compute_effective_cpumask(&new_cpus, cs, parent);
		remote = false;
		cpuset_force_rebuild();
		cpus_read_unlock();
	}

	/*
	 * Force the partition to become invalid if either one of
	 * the following conditions hold:
	 * 1) empty effective cpus but not valid empty partition.
	 * 2) parent is invalid or doesn't grant any cpus to child
	 *    partitions.
	 */
	if (is_local_partition(cs) && (!is_partition_valid(parent) ||
				tasks_nocpu_error(parent, cs, &new_cpus)))
		partcmd = partcmd_invalidate;
	/*
	 * On the other hand, an invalid partition root may be transitioned
	 * back to a regular one.
	 */
	else if (is_partition_valid(parent) && is_partition_invalid(cs))
		partcmd = partcmd_update;

	/*
	 * cpus_read_lock needs to be held before calling
	 * update_parent_effective_cpumask(). To avoid circular lock
	 * dependency between cpuset_mutex and cpus_read_lock,
	 * cpus_read_trylock() is used here to acquire the lock.
	 */
	if (partcmd >= 0) {
		if (!cpuset_hotplug_cpus_read_trylock())
			goto update_tasks;

		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
		cpus_read_unlock();
		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
			compute_partition_effective_cpumask(cs, &new_cpus);
			cpuset_force_rebuild();
		}
	}

update_tasks:
	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
	if (!cpus_updated && !mems_updated)
		goto unlock;	/* Hotplug doesn't affect this cpuset */

	if (mems_updated)
		check_insane_mems_config(&new_mems);

	if (is_in_v2_mode())
		hotplug_update_tasks(cs, &new_cpus, &new_mems,
				     cpus_updated, mems_updated);
	else
		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
					    cpus_updated, mems_updated);

unlock:
	mutex_unlock(&cpuset_mutex);
}

/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 * @work: unused
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly. The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining. If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored. We don't modify
 * cpusets across suspend/resume cycles at all.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus;
	static nodemask_t new_mems;
	bool cpus_updated, mems_updated;
	bool on_dfl = is_in_v2_mode();
	struct tmpmasks tmp, *ptmp = NULL;

	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
		ptmp = &tmp;

	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	/*
	 * If subpartitions_cpus is populated, it is likely that the check
	 * below will produce a false positive on cpus_updated when the cpu
	 * list hasn't changed. It is extra work, but it is better to be safe.
	 */
	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
		       !cpumask_empty(subpartitions_cpus);
	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);

	/*
	 * In the rare case that hotplug removes all the cpus in
	 * subpartitions_cpus, we assume that cpus are updated.
	 */
	if (!cpus_updated && top_cpuset.nr_subparts)
		cpus_updated = true;

	/* For v1, synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		/*
		 * Make sure that CPUs allocated to child partitions
		 * do not show up in effective_cpus. If no CPU is left,
		 * we clear the subpartitions_cpus & let the child partitions
		 * fight for the CPUs again.
		 */
		if (!cpumask_empty(subpartitions_cpus)) {
			if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
				top_cpuset.nr_subparts = 0;
				cpumask_clear(subpartitions_cpus);
			} else {
				cpumask_andnot(&new_cpus, &new_cpus,
					       subpartitions_cpus);
			}
		}
		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
		spin_unlock_irq(&callback_lock);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		spin_lock_irq(&callback_lock);
		if (!on_dfl)
			top_cpuset.mems_allowed = new_mems;
		top_cpuset.effective_mems = new_mems;
		spin_unlock_irq(&callback_lock);
		update_tasks_nodemask(&top_cpuset);
	}

	mutex_unlock(&cpuset_mutex);

	/* if cpus or mems changed, we need to propagate to descendants */
	if (cpus_updated || mems_updated) {
		struct cpuset *cs;
		struct cgroup_subsys_state *pos_css;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
				continue;
			rcu_read_unlock();

			cpuset_hotplug_update_tasks(cs, ptmp);

			rcu_read_lock();
			css_put(&cs->css);
		}
		rcu_read_unlock();
	}

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated || force_rebuild) {
		force_rebuild = false;
		rebuild_sched_domains();
	}

	free_cpumasks(NULL, ptmp);
}

void cpuset_update_active_cpus(void)
{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization. Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 */
	schedule_work(&cpuset_hotplug_work);
}

void cpuset_wait_for_hotplug(void)
{
	flush_work(&cpuset_hotplug_work);
}

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				     unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
void __init cpuset_init_smp(void)
{
	/*
	 * cpus_allowed/mems_allowed set to v2 values in the initial
	 * cpuset_bind() call will be reset to v1 values in another
	 * cpuset_bind() call when v1 cpuset is mounted.
	 */
	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;

	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
	top_cpuset.effective_mems = node_states[N_MEMORY];

	hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);

	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
	BUG_ON(!cpuset_migrate_mm_wq);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * task's cpuset, except when the task is in the top cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	unsigned long flags;
	struct cpuset *cs;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();

	cs = task_cs(tsk);
	if (cs != &top_cpuset)
		guarantee_online_cpus(tsk, pmask);
	/*
	 * Tasks in the top cpuset won't get updates to their cpumasks
	 * when a hotplug online/offline event happens. So we include all
	 * offline cpus in the allowed cpu list.
	 */
	if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
		const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);

		/*
		 * We first exclude cpus allocated to partitions. If there is no
		 * allowable online cpu left, we fall back to all possible cpus.
		 */
		cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
		if (!cpumask_intersects(pmask, cpu_online_mask))
			cpumask_copy(pmask, possible_mask);
	}

	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);
}

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 *
 * Returns true if the affinity of @tsk was changed, false otherwise.
 **/

bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
	const struct cpumask *cs_mask;
	bool changed = false;

	rcu_read_lock();
	cs_mask = task_cs(tsk)->cpus_allowed;
	if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
		do_set_cpus_allowed(tsk, cs_mask);
		changed = true;
	}
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed. However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
	return changed;
}

void __init cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * task's cpuset.
 **/

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &mask);
	rcu_read_unlock();
	spin_unlock_irqrestore(&callback_lock, flags);

	return mask;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset. Call holding
 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}

/*
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate. If @node is set in
 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes. If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current task's cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock. The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current task's mems_allowed came up empty on the first pass over
 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags. That logic and the checks below have the combined
 * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC - any node ok
 *	tsk_is_oom_victim - any node ok
 *	GFP_KERNEL - any node in enclosing hardwalled cpuset ok
 *	GFP_USER - only nodes in current task's mems_allowed ok.
 */
bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
	struct cpuset *cs;	/* current cpuset ancestors */
	bool allowed;		/* is allocation in zone z allowed? */
	unsigned long flags;

	if (in_interrupt())
		return true;
	if (node_isset(node, current->mems_allowed))
		return true;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(tsk_is_oom_victim(current)))
		return true;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return false;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return true;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	spin_lock_irqsave(&callback_lock, flags);

	rcu_read_lock();
	cs = nearest_hardwall_ancestor(task_cs(current));
	allowed = node_isset(node, cs->mems_allowed);
	rcu_read_unlock();

	spin_unlock_irqrestore(&callback_lock, flags);
	return allowed;
}
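
/*
 * Worked examples for the table above (illustrative, using a
 * hypothetical node id "nid"):
 *
 *	cpuset_node_allowed(nid, GFP_USER);	// true only if nid is in
 *						// current->mems_allowed
 *	cpuset_node_allowed(nid, GFP_KERNEL);	// may also be true if nid
 *						// is allowed in the nearest
 *						// hardwalled ancestor
 */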

/**
 * cpuset_spread_node() - On which node to begin search for a page
 * @rotor: round robin rotor
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the task's mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online. So it
 * should not be possible for the following code to return an
 * offline node. But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start. The zonelist passed to
 * __alloc_pages() will include all nodes. If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */
static int cpuset_spread_node(int *rotor)
{
	return *rotor = next_node_in(*rotor, current->mems_allowed);
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 */
int cpuset_mem_spread_node(void)
{
	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
		current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}

/**
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 */
int cpuset_slab_spread_node(void)
{
	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
		current->cpuset_slab_spread_rotor =
			node_random(&current->mems_allowed);

	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}
EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
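
/*
 * Usage sketch (illustrative comment, not part of the original file):
 * memory spreading is enabled per cpuset on a v1 hierarchy, after which
 * page cache allocations of member tasks start their node search from
 * the rotor above rather than the local node:
 *
 *	# echo 1 > /sys/fs/cgroup/cpuset/A/cpuset.memory_spread_page
 */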

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2. Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_current_mems_allowed(void)
{
	struct cgroup *cgrp;

	rcu_read_lock();

	cgrp = task_cs(current)->css.cgroup;
	pr_cont(",cpuset=");
	pr_cont_cgroup_name(cgrp);
	pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));

	rcu_read_unlock();
}

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure". Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
	rcu_read_lock();
	fmeter_markevent(&task_cs(current)->fmeter);
	rcu_read_unlock();
}
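
/*
 * Usage sketch (illustrative comment, not part of the original file):
 * collection must first be enabled in the v1 root cpuset, after which
 * the per-cpuset direct-reclaim rate can be read:
 *
 *	# echo 1 > /sys/fs/cgroup/cpuset/cpuset.memory_pressure_enabled
 *	# cat /sys/fs/cgroup/cpuset/A/cpuset.memory_pressure
 */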

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	css = task_get_css(tsk, cpuset_cgrp_id);
	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
				current->nsproxy->cgroup_ns);
	css_put(css);
	if (retval == -E2BIG)
		retval = -ENAMETOOLONG;
	if (retval < 0)
		goto out_free;
	seq_puts(m, buf);
	seq_putc(m, '\n');
	retval = 0;
out_free:
	kfree(buf);
out:
	return retval;
}
#endif	/* CONFIG_PROC_PID_CPUSET */
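
/*
 * Example (illustrative, not part of the original file):
 *
 *	$ cat /proc/self/cpuset
 *	/A/B
 *
 * i.e. the path of the task's cpuset relative to the cgroup namespace
 * root.
 */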

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
	seq_printf(m, "Mems_allowed:\t%*pb\n",
		   nodemask_pr_args(&task->mems_allowed));
	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
		   nodemask_pr_args(&task->mems_allowed));
}
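
/*
 * Example /proc/<pid>/status output (illustrative mask values, for a
 * task allowed memory nodes 0-1; the exact mask width depends on
 * MAX_NUMNODES):
 *
 *	Mems_allowed:	00000000,00000003
 *	Mems_allowed_list:	0-1
 */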