core.c source code [linux/kernel/sched/core.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* kernel/sched/core.c
4	*
5	* Core kernel CPU scheduler code
6	*
7	* Copyright (C) 1991-2002 Linus Torvalds
8	* Copyright (C) 1998-2024 Ingo Molnar, Red Hat
9	*/
10	#include <linux/highmem.h>
11	#include <linux/hrtimer_api.h>
12	#include <linux/ktime_api.h>
13	#include <linux/sched/signal.h>
14	#include <linux/syscalls_api.h>
15	#include <linux/debug_locks.h>
16	#include <linux/prefetch.h>
17	#include <linux/capability.h>
18	#include <linux/pgtable_api.h>
19	#include <linux/wait_bit.h>
20	#include <linux/jiffies.h>
21	#include <linux/spinlock_api.h>
22	#include <linux/cpumask_api.h>
23	#include <linux/lockdep_api.h>
24	#include <linux/hardirq.h>
25	#include <linux/softirq.h>
26	#include <linux/refcount_api.h>
27	#include <linux/topology.h>
28	#include <linux/sched/clock.h>
29	#include <linux/sched/cond_resched.h>
30	#include <linux/sched/cputime.h>
31	#include <linux/sched/debug.h>
32	#include <linux/sched/hotplug.h>
33	#include <linux/sched/init.h>
34	#include <linux/sched/isolation.h>
35	#include <linux/sched/loadavg.h>
36	#include <linux/sched/mm.h>
37	#include <linux/sched/nohz.h>
38	#include <linux/sched/rseq_api.h>
39	#include <linux/sched/rt.h>
40
41	#include <linux/blkdev.h>
42	#include <linux/context_tracking.h>
43	#include <linux/cpuset.h>
44	#include <linux/delayacct.h>
45	#include <linux/init_task.h>
46	#include <linux/interrupt.h>
47	#include <linux/ioprio.h>
48	#include <linux/kallsyms.h>
49	#include <linux/kcov.h>
50	#include <linux/kprobes.h>
51	#include <linux/llist_api.h>
52	#include <linux/mmu_context.h>
53	#include <linux/mmzone.h>
54	#include <linux/mutex_api.h>
55	#include <linux/nmi.h>
56	#include <linux/nospec.h>
57	#include <linux/perf_event_api.h>
58	#include <linux/profile.h>
59	#include <linux/psi.h>
60	#include <linux/rcuwait_api.h>
61	#include <linux/rseq.h>
62	#include <linux/sched/wake_q.h>
63	#include <linux/scs.h>
64	#include <linux/slab.h>
65	#include <linux/syscalls.h>
66	#include <linux/vtime.h>
67	#include <linux/wait_api.h>
68	#include <linux/workqueue_api.h>
69	#include <linux/livepatch_sched.h>
70
71	#ifdef CONFIG_PREEMPT_DYNAMIC
72	# ifdef CONFIG_GENERIC_ENTRY
73	# include <linux/entry-common.h>
74	# endif
75	#endif
76
77	#include <uapi/linux/sched/types.h>
78
79	#include <asm/irq_regs.h>
80	#include <asm/switch_to.h>
81	#include <asm/tlb.h>
82
83	#define CREATE_TRACE_POINTS
84	#include <linux/sched/rseq_api.h>
85	#include <trace/events/sched.h>
86	#include <trace/events/ipi.h>
87	#undef CREATE_TRACE_POINTS
88
89	#include "sched.h"
90	#include "stats.h"
91
92	#include "autogroup.h"
93	#include "pelt.h"
94	#include "smp.h"
95
96	#include "../workqueue_internal.h"
97	#include "../../io_uring/io-wq.h"
98	#include "../smpboot.h"
99
100	EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
101	EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
102
103	/*
104	* Export tracepoints that act as a bare tracehook (ie: have no trace event
105	* associated with them) to allow external modules to probe them.
106	*/
107	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
108	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
109	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
110	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
111	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
112	EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
113	EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
114	EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
115	EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
116	EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
117	EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
118	EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
119
120	DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
121
122	/*
123	* Debugging: various feature bits
124	*
125	* If SCHED_DEBUG is disabled, each compilation unit has its own copy of
126	* sysctl_sched_features, defined in sched.h, to allow constants propagation
127	* at compile time and compiler optimization based on features default.
128	*/
129	#define SCHED_FEAT(name, enabled) \
130	(1UL << __SCHED_FEAT_##name) * enabled \|
131	__read_mostly unsigned int sysctl_sched_features =
132	#include "features.h"
133	`0`;
134	#undef SCHED_FEAT
135
136	/*
137	* Print a warning if need_resched is set for the given duration (if
138	* LATENCY_WARN is enabled).
139	*
140	* If sysctl_resched_latency_warn_once is set, only one warning will be shown
141	* per boot.
142	*/
143	__read_mostly int sysctl_resched_latency_warn_ms = `100`;
144	__read_mostly int sysctl_resched_latency_warn_once = `1`;
145
146	/*
147	* Number of tasks to iterate in a single balance run.
148	* Limited because this is done with IRQs disabled.
149	*/
150	__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
151
152	__read_mostly int scheduler_running;
153
154	#ifdef CONFIG_SCHED_CORE
155
/* Static key toggled by __sched_core_enable() / __sched_core_disable(). */
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
157
158	/ kernel prio, less is more /
159	static inline int __task_prio(const struct task_struct *p)
160	{
161	if (p->sched_class == &stop_sched_class) / trumps deadline /
162	return -`2`;
163
164	if (p->dl_server)
165	return -`1`; / deadline /
166
167	if (rt_or_dl_prio(prio: p->prio))
168	return p->prio; / [-1, 99] /
169
170	if (p->sched_class == &idle_sched_class)
171	return MAX_RT_PRIO + NICE_WIDTH; / 140 /
172
173	if (task_on_scx(p))
174	return MAX_RT_PRIO + MAX_NICE + `1`; / 120, squash ext /
175
176	return MAX_RT_PRIO + MAX_NICE; / 119, squash fair /
177	}
178
179	/*
180	* l(a,b)
181	* le(a,b) := !l(b,a)
182	* g(a,b) := l(b,a)
183	* ge(a,b) := !l(a,b)
184	*/
185
186	/ real prio, less is less /
187	static inline bool prio_less(const struct task_struct *a,
188	const struct task_struct *b, bool in_fi)
189	{
190
191	int pa = __task_prio(p: a), pb = __task_prio(p: b);
192
193	if (-pa < -pb)
194	return true;
195
196	if (-pb < -pa)
197	return false;
198
199	if (pa == -`1`) { / dl_prio() doesn't work because of stop_class above /
200	const struct sched_dl_entity a_dl, b_dl;
201
202	a_dl = &a->dl;
203	/*
204	* Since,'a' and 'b' can be CFS tasks served by DL server,
205	* __task_prio() can return -1 (for DL) even for those. In that
206	* case, get to the dl_server's DL entity.
207	*/
208	if (a->dl_server)
209	a_dl = a->dl_server;
210
211	b_dl = &b->dl;
212	if (b->dl_server)
213	b_dl = b->dl_server;
214
215	return !dl_time_before(a: a_dl->deadline, b: b_dl->deadline);
216	}
217
218	if (pa == MAX_RT_PRIO + MAX_NICE) / fair /
219	return cfs_prio_less(a, b, fi: in_fi);
220
221	#ifdef CONFIG_SCHED_CLASS_EXT
222	if (pa == MAX_RT_PRIO + MAX_NICE + `1`) / ext /
223	return scx_prio_less(a, b, in_fi);
224	#endif
225
226	return false;
227	}
228
229	static inline bool __sched_core_less(const struct task_struct *a,
230	const struct task_struct *b)
231	{
232	if (a->core_cookie < b->core_cookie)
233	return true;
234
235	if (a->core_cookie > b->core_cookie)
236	return false;
237
238	/ flip prio, so high prio is leftmost /
239	if (prio_less(a: b, b: a, in_fi: !!task_rq(a)->core->core_forceidle_count))
240	return true;
241
242	return false;
243	}
244
/* Convert a core_node rb_node pointer back into its containing task_struct. */
#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)
246
247	static inline bool rb_sched_core_less(struct rb_node a, const* struct rb_node *b)
248	{
249	return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
250	}
251
252	static inline int rb_sched_core_cmp(const void key, const* struct rb_node *node)
253	{
254	const struct task_struct *p = __node_2_sc(node);
255	unsigned long cookie = (unsigned long)key;
256
257	if (cookie < p->core_cookie)
258	return -`1`;
259
260	if (cookie > p->core_cookie)
261	return `1`;
262
263	return `0`;
264	}
265
266	void sched_core_enqueue(struct rq rq, struct* task_struct *p)
267	{
268	if (p->se.sched_delayed)
269	return;
270
271	rq->core->core_task_seq++;
272
273	if (!p->core_cookie)
274	return;
275
276	rb_add(node: &p->core_node, tree: &rq->core_tree, less: rb_sched_core_less);
277	}
278
279	void sched_core_dequeue(struct rq rq, struct* task_struct p, int* flags)
280	{
281	if (p->se.sched_delayed)
282	return;
283
284	rq->core->core_task_seq++;
285
286	if (sched_core_enqueued(p)) {
287	rb_erase(&p->core_node, &rq->core_tree);
288	RB_CLEAR_NODE(&p->core_node);
289	}
290
291	/*
292	* Migrating the last task off the cpu, with the cpu in forced idle
293	* state. Reschedule to create an accounting edge for forced idle,
294	* and re-examine whether the core is still in forced idle state.
295	*/
296	if (!(flags & DEQUEUE_SAVE) && rq->nr_running == `1` &&
297	rq->core->core_forceidle_count && rq->curr == rq->idle)
298	resched_curr(rq);
299	}
300
301	static int sched_task_is_throttled(struct task_struct p, int* cpu)
302	{
303	if (p->sched_class->task_is_throttled)
304	return p->sched_class->task_is_throttled(p, cpu);
305
306	return `0`;
307	}
308
309	static struct task_struct sched_core_next(struct* task_struct p, unsigned* long cookie)
310	{
311	struct rb_node *node = &p->core_node;
312	int cpu = task_cpu(p);
313
314	do {
315	node = rb_next(node);
316	if (!node)
317	return NULL;
318
319	p = __node_2_sc(node);
320	if (p->core_cookie != cookie)
321	return NULL;
322
323	} while (sched_task_is_throttled(p, cpu));
324
325	return p;
326	}
327
328	/*
329	* Find left-most (aka, highest priority) and unthrottled task matching @cookie.
330	* If no suitable task is found, NULL will be returned.
331	*/
332	static struct task_struct sched_core_find(struct* rq rq, unsigned* long cookie)
333	{
334	struct task_struct *p;
335	struct rb_node *node;
336
337	node = rb_find_first(key: (void *)cookie, tree: &rq->core_tree, cmp: rb_sched_core_cmp);
338	if (!node)
339	return NULL;
340
341	p = __node_2_sc(node);
342	if (!sched_task_is_throttled(p, cpu: rq->cpu))
343	return p;
344
345	return sched_core_next(p, cookie);
346	}
347
348	/*
349	* Magic required such that:
350	*
351	* raw_spin_rq_lock(rq);
352	* ...
353	* raw_spin_rq_unlock(rq);
354	*
355	* ends up locking and unlocking the _same_ lock, and all CPUs
356	* always agree on what rq has what lock.
357	*
358	* XXX entirely possible to selectively enable cores, don't bother for now.
359	*/
360
361	static DEFINE_MUTEX(sched_core_mutex);
362	static atomic_t sched_core_count;
363	static struct cpumask sched_core_mask;
364
365	static void sched_core_lock(int cpu, unsigned long *flags)
366	{
367	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
368	int t, i = `0`;
369
370	local_irq_save(*flags);
371	for_each_cpu(t, smt_mask)
372	raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
373	}
374
375	static void sched_core_unlock(int cpu, unsigned long *flags)
376	{
377	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
378	int t;
379
380	for_each_cpu(t, smt_mask)
381	raw_spin_unlock(&cpu_rq(t)->__lock);
382	local_irq_restore(*flags);
383	}
384
385	static void __sched_core_flip(bool enabled)
386	{
387	unsigned long flags;
388	int cpu, t;
389
390	cpus_read_lock();
391
392	/*
393	* Toggle the online cores, one by one.
394	*/
395	cpumask_copy(dstp: &sched_core_mask, cpu_online_mask);
396	for_each_cpu(cpu, &sched_core_mask) {
397	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
398
399	sched_core_lock(cpu, flags: &flags);
400
401	for_each_cpu(t, smt_mask)
402	cpu_rq(t)->core_enabled = enabled;
403
404	cpu_rq(cpu)->core->core_forceidle_start = `0`;
405
406	sched_core_unlock(cpu, flags: &flags);
407
408	cpumask_andnot(dstp: &sched_core_mask, src1p: &sched_core_mask, src2p: smt_mask);
409	}
410
411	/*
412	* Toggle the offline CPUs.
413	*/
414	for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
415	cpu_rq(cpu)->core_enabled = enabled;
416
417	cpus_read_unlock();
418	}
419
420	static void sched_core_assert_empty(void)
421	{
422	int cpu;
423
424	for_each_possible_cpu(cpu)
425	WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
426	}
427
428	static void __sched_core_enable(void)
429	{
430	static_branch_enable(&__sched_core_enabled);
431	/*
432	* Ensure all previous instances of raw_spin_rq_*lock() have finished
433	* and future ones will observe !sched_core_disabled().
434	*/
435	synchronize_rcu();
436	__sched_core_flip(enabled: true);
437	sched_core_assert_empty();
438	}
439
440	static void __sched_core_disable(void)
441	{
442	sched_core_assert_empty();
443	__sched_core_flip(enabled: false);
444	static_branch_disable(&__sched_core_enabled);
445	}
446
447	void sched_core_get(void)
448	{
449	if (atomic_inc_not_zero(v: &sched_core_count))
450	return;
451
452	mutex_lock(&sched_core_mutex);
453	if (!atomic_read(v: &sched_core_count))
454	__sched_core_enable();
455
456	smp_mb__before_atomic();
457	atomic_inc(v: &sched_core_count);
458	mutex_unlock(lock: &sched_core_mutex);
459	}
460
461	static void __sched_core_put(struct work_struct *work)
462	{
463	if (atomic_dec_and_mutex_lock(cnt: &sched_core_count, lock: &sched_core_mutex)) {
464	__sched_core_disable();
465	mutex_unlock(lock: &sched_core_mutex);
466	}
467	}
468
469	void sched_core_put(void)
470	{
471	static DECLARE_WORK(_work, __sched_core_put);
472
473	/*
474	* "There can be only one"
475	*
476	* Either this is the last one, or we don't actually need to do any
477	* 'work'. If it is the last again, we rely on
478	* WORK_STRUCT_PENDING_BIT.
479	*/
480	if (!atomic_add_unless(v: &sched_core_count, a: -`1`, u: `1`))
481	schedule_work(work: &_work);
482	}
483
484	#else /* !CONFIG_SCHED_CORE */
485
/* Core scheduling compiled out: enqueue/dequeue accounting is a no-op. */
static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
489
490	#endif /* CONFIG_SCHED_CORE */
491
492	/ need a wrapper since we may need to trace from modules /
493	EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp);
494
495	/ Call via the helper macro trace_set_current_state. /
496	void __trace_set_current_state(int state_value)
497	{
498	trace_sched_set_state_tp(current, state: state_value);
499	}
500	EXPORT_SYMBOL(__trace_set_current_state);
501
502	/*
503	* Serialization rules:
504	*
505	* Lock order:
506	*
507	* p->pi_lock
508	* rq->lock
509	* hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
510	*
511	* rq1->lock
512	* rq2->lock where: rq1 < rq2
513	*
514	* Regular state:
515	*
516	* Normal scheduling state is serialized by rq->lock. __schedule() takes the
517	* local CPU's rq->lock, it optionally removes the task from the runqueue and
518	* always looks at the local rq data structures to find the most eligible task
519	* to run next.
520	*
521	* Task enqueue is also under rq->lock, possibly taken from another CPU.
522	* Wakeups from another LLC domain might use an IPI to transfer the enqueue to
523	* the local CPU to avoid bouncing the runqueue state around [ see
524	* ttwu_queue_wakelist() ]
525	*
526	* Task wakeup, specifically wakeups that involve migration, are horribly
527	* complicated to avoid having to take two rq->locks.
528	*
529	* Special state:
530	*
531	* System-calls and anything external will use task_rq_lock() which acquires
532	* both p->pi_lock and rq->lock. As a consequence the state they change is
533	* stable while holding either lock:
534	*
535	* - sched_setaffinity()/
536	* set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
537	* - set_user_nice(): p->se.load, p->*prio
538	* - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
539	* p->se.load, p->rt_priority,
540	* p->dl.dl_{runtime, deadline, period, flags, bw, density}
541	* - sched_setnuma(): p->numa_preferred_nid
542	* - sched_move_task(): p->sched_task_group
543	* - uclamp_update_active() p->uclamp*
544	*
545	* p->state <- TASK_*:
546	*
547	* is changed locklessly using set_current_state(), __set_current_state() or
548	* set_special_state(), see their respective comments, or by
549	* try_to_wake_up(). This latter uses p->pi_lock to serialize against
550	* concurrent self.
551	*
552	* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
553	*
554	* is set by activate_task() and cleared by deactivate_task(), under
555	* rq->lock. Non-zero indicates the task is runnable, the special
556	* ON_RQ_MIGRATING state is used for migration without holding both
557	* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
558	*
559	* Additionally it is possible to be ->on_rq but still be considered not
560	* runnable when p->se.sched_delayed is true. These tasks are on the runqueue
561	* but will be dequeued as soon as they get picked again. See the
562	* task_is_runnable() helper.
563	*
564	* p->on_cpu <- { 0, 1 }:
565	*
566	* is set by prepare_task() and cleared by finish_task() such that it will be
567	* set before p is scheduled-in and cleared after p is scheduled-out, both
568	* under rq->lock. Non-zero indicates the task is running on its CPU.
569	*
570	* [ The astute reader will observe that it is possible for two tasks on one
571	* CPU to have ->on_cpu = 1 at the same time. ]
572	*
573	* task_cpu(p): is changed by set_task_cpu(), the rules are:
574	*
575	* - Don't call set_task_cpu() on a blocked task:
576	*
577	* We don't care what CPU we're not running on, this simplifies hotplug,
578	* the CPU assignment of blocked tasks isn't required to be valid.
579	*
580	* - for try_to_wake_up(), called under p->pi_lock:
581	*
582	* This allows try_to_wake_up() to only take one rq->lock, see its comment.
583	*
584	* - for migration called under rq->lock:
585	* [ see task_on_rq_migrating() in task_rq_lock() ]
586	*
587	* o move_queued_task()
588	* o detach_task()
589	*
590	* - for migration called under double_rq_lock():
591	*
592	* o __migrate_swap_task()
593	* o push_rt_task() / pull_rt_task()
594	* o push_dl_task() / pull_dl_task()
595	* o dl_task_offline_migration()
596	*
597	*/
598
599	void raw_spin_rq_lock_nested(struct rq rq, int* subclass)
600	{
601	raw_spinlock_t *lock;
602
603	/ Matches synchronize_rcu() in __sched_core_enable() /
604	preempt_disable();
605	if (sched_core_disabled()) {
606	raw_spin_lock_nested(&rq->__lock, subclass);
607	/ preempt_count MUST be > 1 /
608	preempt_enable_no_resched();
609	return;
610	}
611
612	for (;;) {
613	lock = __rq_lockp(rq);
614	raw_spin_lock_nested(lock, subclass);
615	if (likely(lock == __rq_lockp(rq))) {
616	/ preempt_count MUST be > 1 /
617	preempt_enable_no_resched();
618	return;
619	}
620	raw_spin_unlock(lock);
621	}
622	}
623
624	bool raw_spin_rq_trylock(struct rq *rq)
625	{
626	raw_spinlock_t *lock;
627	bool ret;
628
629	/ Matches synchronize_rcu() in __sched_core_enable() /
630	preempt_disable();
631	if (sched_core_disabled()) {
632	ret = raw_spin_trylock(&rq->__lock);
633	preempt_enable();
634	return ret;
635	}
636
637	for (;;) {
638	lock = __rq_lockp(rq);
639	ret = raw_spin_trylock(lock);
640	if (!ret \|\| (likely(lock == __rq_lockp(rq)))) {
641	preempt_enable();
642	return ret;
643	}
644	raw_spin_unlock(lock);
645	}
646	}
647
/* Release the lock returned by rq_lockp() for @rq. */
void raw_spin_rq_unlock(struct rq *rq)
{
	raw_spin_unlock(rq_lockp(rq));
}
652
653	#ifdef CONFIG_SMP
654	/*
655	* double_rq_lock - safely lock two runqueues
656	*/
657	void double_rq_lock(struct rq rq1, struct* rq *rq2)
658	{
659	lockdep_assert_irqs_disabled();
660
661	if (rq_order_less(rq1: rq2, rq2: rq1))
662	swap(rq1, rq2);
663
664	raw_spin_rq_lock(rq: rq1);
665	if (__rq_lockp(rq: rq1) != __rq_lockp(rq: rq2))
666	raw_spin_rq_lock_nested(rq: rq2, SINGLE_DEPTH_NESTING);
667
668	double_rq_clock_clear_update(rq1, rq2);
669	}
670	#endif
671
672	/*
673	* __task_rq_lock - lock the rq @p resides on.
674	*/
675	struct rq __task_rq_lock(struct* task_struct p, struct* rq_flags *rf)
676	__acquires(rq->lock)
677	{
678	struct rq *rq;
679
680	lockdep_assert_held(&p->pi_lock);
681
682	for (;;) {
683	rq = task_rq(p);
684	raw_spin_rq_lock(rq);
685	if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
686	rq_pin_lock(rq, rf);
687	return rq;
688	}
689	raw_spin_rq_unlock(rq);
690
691	while (unlikely(task_on_rq_migrating(p)))
692	cpu_relax();
693	}
694	}
695
696	/*
697	* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
698	*/
699	struct rq task_rq_lock(struct* task_struct p, struct* rq_flags *rf)
700	__acquires(p->pi_lock)
701	__acquires(rq->lock)
702	{
703	struct rq *rq;
704
705	for (;;) {
706	raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
707	rq = task_rq(p);
708	raw_spin_rq_lock(rq);
709	/*
710	* move_queued_task() task_rq_lock()
711	*
712	* ACQUIRE (rq->lock)
713	* [S] ->on_rq = MIGRATING [L] rq = task_rq()
714	* WMB (__set_task_cpu()) ACQUIRE (rq->lock);
715	* [S] ->cpu = new_cpu [L] task_rq()
716	* [L] ->on_rq
717	* RELEASE (rq->lock)
718	*
719	* If we observe the old CPU in task_rq_lock(), the acquire of
720	* the old rq->lock will fully serialize against the stores.
721	*
722	* If we observe the new CPU in task_rq_lock(), the address
723	* dependency headed by '[L] rq = task_rq()' and the acquire
724	* will pair with the WMB to ensure we then also see migrating.
725	*/
726	if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
727	rq_pin_lock(rq, rf);
728	return rq;
729	}
730	raw_spin_rq_unlock(rq);
731	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
732
733	while (unlikely(task_on_rq_migrating(p)))
734	cpu_relax();
735	}
736	}
737
738	/*
739	* RQ-clock updating methods:
740	*/
741
742	static void update_rq_clock_task(struct rq *rq, s64 delta)
743	{
744	/*
745	* In theory, the compile should just see 0 here, and optimize out the call
746	* to sched_rt_avg_update. But I don't trust it...
747	*/
748	s64 __maybe_unused steal = `0`, irq_delta = `0`;
749
750	#ifdef CONFIG_IRQ_TIME_ACCOUNTING
751	if (irqtime_enabled()) {
752	irq_delta = irq_time_read(cpu: cpu_of(rq)) - rq->prev_irq_time;
753
754	/*
755	* Since irq_time is only updated on {soft,}irq_exit, we might run into
756	* this case when a previous update_rq_clock() happened inside a
757	* {soft,}IRQ region.
758	*
759	* When this happens, we stop ->clock_task and only update the
760	* prev_irq_time stamp to account for the part that fit, so that a next
761	* update will consume the rest. This ensures ->clock_task is
762	* monotonic.
763	*
764	* It does however cause some slight miss-attribution of {soft,}IRQ
765	* time, a more accurate solution would be to update the irq_time using
766	* the current rq->clock timestamp, except that would require using
767	* atomic ops.
768	*/
769	if (irq_delta > delta)
770	irq_delta = delta;
771
772	rq->prev_irq_time += irq_delta;
773	delta -= irq_delta;
774	delayacct_irq(task: rq->curr, delta: irq_delta);
775	}
776	#endif
777	#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
778	if (static_key_false(key: (&paravirt_steal_rq_enabled))) {
779	u64 prev_steal;
780
781	steal = prev_steal = paravirt_steal_clock(cpu: cpu_of(rq));
782	steal -= rq->prev_steal_time_rq;
783
784	if (unlikely(steal > delta))
785	steal = delta;
786
787	rq->prev_steal_time_rq = prev_steal;
788	delta -= steal;
789	}
790	#endif
791
792	rq->clock_task += delta;
793
794	#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
795	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
796	update_irq_load_avg(rq, running: irq_delta + steal);
797	#endif
798	update_rq_clock_pelt(rq, delta);
799	}
800
801	void update_rq_clock(struct rq *rq)
802	{
803	s64 delta;
804	u64 clock;
805
806	lockdep_assert_rq_held(rq);
807
808	if (rq->clock_update_flags & RQCF_ACT_SKIP)
809	return;
810
811	if (sched_feat(WARN_DOUBLE_CLOCK))
812	WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED);
813	rq->clock_update_flags \|= RQCF_UPDATED;
814
815	clock = sched_clock_cpu(cpu: cpu_of(rq));
816	scx_rq_clock_update(rq, clock);
817
818	delta = clock - rq->clock;
819	if (delta < `0`)
820	return;
821	rq->clock += delta;
822
823	update_rq_clock_task(rq, delta);
824	}
825
826	#ifdef CONFIG_SCHED_HRTICK
827	/*
828	* Use HR-timers to deliver accurate preemption points.
829	*/
830
831	static void hrtick_clear(struct rq *rq)
832	{
833	if (hrtimer_active(timer: &rq->hrtick_timer))
834	hrtimer_cancel(timer: &rq->hrtick_timer);
835	}
836
837	/*
838	* High-resolution timer tick.
839	* Runs from hardirq context with interrupts disabled.
840	*/
841	static enum hrtimer_restart hrtick(struct hrtimer *timer)
842	{
843	struct rq rq = container_of(timer, struct* rq, hrtick_timer);
844	struct rq_flags rf;
845
846	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
847
848	rq_lock(rq, rf: &rf);
849	update_rq_clock(rq);
850	rq->donor->sched_class->task_tick(rq, rq->curr, `1`);
851	rq_unlock(rq, rf: &rf);
852
853	return HRTIMER_NORESTART;
854	}
855
856	#ifdef CONFIG_SMP
857
858	static void __hrtick_restart(struct rq *rq)
859	{
860	struct hrtimer *timer = &rq->hrtick_timer;
861	ktime_t time = rq->hrtick_time;
862
863	hrtimer_start(timer, tim: time, mode: HRTIMER_MODE_ABS_PINNED_HARD);
864	}
865
866	/*
867	* called from hardirq (IPI) context
868	*/
869	static void __hrtick_start(void *arg)
870	{
871	struct rq *rq = arg;
872	struct rq_flags rf;
873
874	rq_lock(rq, rf: &rf);
875	__hrtick_restart(rq);
876	rq_unlock(rq, rf: &rf);
877	}
878
879	/*
880	* Called to set the hrtick timer state.
881	*
882	* called with rq->lock held and IRQs disabled
883	*/
884	void hrtick_start(struct rq *rq, u64 delay)
885	{
886	struct hrtimer *timer = &rq->hrtick_timer;
887	s64 delta;
888
889	/*
890	* Don't schedule slices shorter than 10000ns, that just
891	* doesn't make sense and can cause timer DoS.
892	*/
893	delta = max_t(s64, delay, `10000LL`);
894	rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
895
896	if (rq == this_rq())
897	__hrtick_restart(rq);
898	else
899	smp_call_function_single_async(cpu: cpu_of(rq), csd: &rq->hrtick_csd);
900	}
901
902	#else
903	/*
904	* Called to set the hrtick timer state.
905	*
906	* called with rq->lock held and IRQs disabled
907	*/
908	void hrtick_start(struct rq *rq, u64 delay)
909	{
910	/*
911	* Don't schedule slices shorter than 10000ns, that just
912	* doesn't make sense. Rely on vruntime for fairness.
913	*/
914	delay = max_t(u64, delay, `10000LL`);
915	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
916	HRTIMER_MODE_REL_PINNED_HARD);
917	}
918
919	#endif /* CONFIG_SMP */
920
921	static void hrtick_rq_init(struct rq *rq)
922	{
923	#ifdef CONFIG_SMP
924	INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
925	#endif
926	hrtimer_setup(timer: &rq->hrtick_timer, function: hrtick, CLOCK_MONOTONIC, mode: HRTIMER_MODE_REL_HARD);
927	}
928	#else /* CONFIG_SCHED_HRTICK */
/* CONFIG_SCHED_HRTICK disabled: the hrtick helpers are no-ops. */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_rq_init(struct rq *rq)
{
}
936	#endif /* CONFIG_SCHED_HRTICK */
937
/*
 * try_cmpxchg based fetch_or() macro so it works for different integer types:
 *
 * ORs @mask into *@ptr and evaluates to the value *@ptr held before the OR.
 * The loop body is empty on purpose: a failed try_cmpxchg() refreshes _val,
 * so the retry works on the up-to-date value.
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _val = *_ptr;				\
									\
		do {							\
		} while (!try_cmpxchg(_ptr, &_val, _val | _mask));	\
	_val;								\
})
951
952	#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
953	/*
954	* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
955	* this avoids any races wrt polling state changes and thereby avoids
956	* spurious IPIs.
957	*/
958	static inline bool set_nr_and_not_polling(struct thread_info ti, int* tif)
959	{
960	return !(fetch_or(&ti->flags, `1` << tif) & _TIF_POLLING_NRFLAG);
961	}
962
963	/*
964	* Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
965	*
966	* If this returns true, then the idle task promises to call
967	* sched_ttwu_pending() and reschedule soon.
968	*/
969	static bool set_nr_if_polling(struct task_struct *p)
970	{
971	struct thread_info *ti = task_thread_info(p);
972	typeof(ti->flags) val = READ_ONCE(ti->flags);
973
974	do {
975	if (!(val & _TIF_POLLING_NRFLAG))
976	return false;
977	if (val & _TIF_NEED_RESCHED)
978	return true;
979	} while (!try_cmpxchg(&ti->flags, &val, val \| _TIF_NEED_RESCHED));
980
981	return true;
982	}
983
984	#else
985	static inline bool set_nr_and_not_polling(struct thread_info ti, int* tif)
986	{
987	set_ti_thread_flag(ti, tif);
988	return true;
989	}
990
991	#ifdef CONFIG_SMP
992	static inline bool set_nr_if_polling(struct task_struct *p)
993	{
994	return false;
995	}
996	#endif
997	#endif
998
999	static bool __wake_q_add(struct wake_q_head head, struct* task_struct *task)
1000	{
1001	struct wake_q_node *node = &task->wake_q;
1002
1003	/*
1004	* Atomically grab the task, if ->wake_q is !nil already it means
1005	* it's already queued (either by us or someone else) and will get the
1006	* wakeup due to that.
1007	*
1008	* In order to ensure that a pending wakeup will observe our pending
1009	* state, even in the failed case, an explicit smp_mb() must be used.
1010	*/
1011	smp_mb__before_atomic();
1012	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
1013	return false;
1014
1015	/*
1016	* The head is context local, there can be no concurrency.
1017	*/
1018	*head->lastp = node;
1019	head->lastp = &node->next;
1020	return true;
1021	}
1022
/**
 * wake_q_add() - queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 *
 * A reference on @task is taken only when this call actually queued it.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	if (__wake_q_add(head, task))
		get_task_struct(task);
}
1040
/**
 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 *
 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 * that already hold reference to @task can call the 'safe' version and trust
 * wake_q to do the right thing depending whether or not the @task is already
 * queued for wakeup.
 *
 * The caller's reference is dropped here when @task was already queued.
 */
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
	if (!__wake_q_add(head, task))
		put_task_struct(task);
}
1063
1064	void wake_up_q(struct wake_q_head *head)
1065	{
1066	struct wake_q_node *node = head->first;
1067
1068	while (node != WAKE_Q_TAIL) {
1069	struct task_struct *task;
1070
1071	task = container_of(node, struct task_struct, wake_q);
1072	node = node->next;
1073	/ pairs with cmpxchg_relaxed() in __wake_q_add() /
1074	WRITE_ONCE(task->wake_q.next, NULL);
1075	/ Task can safely be re-inserted now. /
1076
1077	/*
1078	* wake_up_process() executes a full barrier, which pairs with
1079	* the queueing in wake_q_add() so as not to miss wakeups.
1080	*/
1081	wake_up_process(tsk: task);
1082	put_task_struct(t: task);
1083	}
1084	}
1085
1086	/*
1087	* resched_curr - mark rq's current task 'to be rescheduled now'.
1088	*
1089	* On UP this means the setting of the need_resched flag, on SMP it
1090	* might also involve a cross-CPU call to trigger the scheduler on
1091	* the target CPU.
1092	*/
1093	static void __resched_curr(struct rq rq, int* tif)
1094	{
1095	struct task_struct *curr = rq->curr;
1096	struct thread_info *cti = task_thread_info(curr);
1097	int cpu;
1098
1099	lockdep_assert_rq_held(rq);
1100
1101	/*
1102	* Always immediately preempt the idle task; no point in delaying doing
1103	* actual work.
1104	*/
1105	if (is_idle_task(p: curr) && tif == TIF_NEED_RESCHED_LAZY)
1106	tif = TIF_NEED_RESCHED;
1107
1108	if (cti->flags & ((`1` << tif) \| _TIF_NEED_RESCHED))
1109	return;
1110
1111	cpu = cpu_of(rq);
1112
1113	if (cpu == smp_processor_id()) {
1114	set_ti_thread_flag(ti: cti, flag: tif);
1115	if (tif == TIF_NEED_RESCHED)
1116	set_preempt_need_resched();
1117	return;
1118	}
1119
1120	if (set_nr_and_not_polling(ti: cti, tif)) {
1121	if (tif == TIF_NEED_RESCHED)
1122	smp_send_reschedule(cpu);
1123	} else {
1124	trace_sched_wake_idle_without_ipi(cpu);
1125	}
1126	}
1127
1128	void resched_curr(struct rq *rq)
1129	{
1130	__resched_curr(rq, TIF_NEED_RESCHED);
1131	}
1132
1133	#ifdef CONFIG_PREEMPT_DYNAMIC
1134	static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
1135	static __always_inline bool dynamic_preempt_lazy(void)
1136	{
1137	return static_branch_unlikely(&sk_dynamic_preempt_lazy);
1138	}
1139	#else
1140	static __always_inline bool dynamic_preempt_lazy(void)
1141	{
1142	return IS_ENABLED(CONFIG_PREEMPT_LAZY);
1143	}
1144	#endif
1145
1146	static __always_inline int get_lazy_tif_bit(void)
1147	{
1148	if (dynamic_preempt_lazy())
1149	return TIF_NEED_RESCHED_LAZY;
1150
1151	return TIF_NEED_RESCHED;
1152	}
1153
/*
 * Like resched_curr(), but the flag bit is chosen by get_lazy_tif_bit().
 */
void resched_curr_lazy(struct rq *rq)
{
	__resched_curr(rq, get_lazy_tif_bit());
}
1158
/*
 * resched_cpu - request a reschedule on @cpu.
 *
 * Takes @cpu's runqueue lock with interrupts saved/disabled and calls
 * resched_curr() if the CPU is online or is the calling CPU.
 */
void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);
	if (cpu_online(cpu) || cpu == smp_processor_id())
		resched_curr(rq);
	raw_spin_rq_unlock_irqrestore(rq, flags);
}
1169
1170	#ifdef CONFIG_SMP
1171	#ifdef CONFIG_NO_HZ_COMMON
1172	/*
1173	* In the semi idle case, use the nearest busy CPU for migrating timers
1174	* from an idle CPU. This is good for power-savings.
1175	*
1176	* We don't do similar optimization for completely idle system, as
1177	* selecting an idle CPU will add more delays to the timers than intended
1178	* (as that CPU's timer base may not be up to date wrt jiffies etc).
1179	*/
1180	int get_nohz_timer_target(void)
1181	{
1182	int i, cpu = smp_processor_id(), default_cpu = -`1`;
1183	struct sched_domain *sd;
1184	const struct cpumask *hk_mask;
1185
1186	if (housekeeping_cpu(cpu, type: HK_TYPE_KERNEL_NOISE)) {
1187	if (!idle_cpu(cpu))
1188	return cpu;
1189	default_cpu = cpu;
1190	}
1191
1192	hk_mask = housekeeping_cpumask(type: HK_TYPE_KERNEL_NOISE);
1193
1194	guard(rcu)();
1195
1196	for_each_domain(cpu, sd) {
1197	for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
1198	if (cpu == i)
1199	continue;
1200
1201	if (!idle_cpu(cpu: i))
1202	return i;
1203	}
1204	}
1205
1206	if (default_cpu == -`1`)
1207	default_cpu = housekeeping_any_cpu(type: HK_TYPE_KERNEL_NOISE);
1208
1209	return default_cpu;
1210	}
1211
1212	/*
1213	* When add_timer_on() enqueues a timer into the timer wheel of an
1214	* idle CPU then this timer might expire before the next timer event
1215	* which is scheduled to wake up that CPU. In case of a completely
1216	* idle system the next event might even be infinite time into the
1217	* future. wake_up_idle_cpu() ensures that the CPU is woken up and
1218	* leaves the inner idle loop so the newly added timer is taken into
1219	* account when the CPU goes back to idle and evaluates the timer
1220	* wheel for the next timer event.
1221	*/
1222	static void wake_up_idle_cpu(int cpu)
1223	{
1224	struct rq *rq = cpu_rq(cpu);
1225
1226	if (cpu == smp_processor_id())
1227	return;
1228
1229	/*
1230	* Set TIF_NEED_RESCHED and send an IPI if in the non-polling
1231	* part of the idle loop. This forces an exit from the idle loop
1232	* and a round trip to schedule(). Now this could be optimized
1233	* because a simple new idle loop iteration is enough to
1234	* re-evaluate the next tick. Provided some re-ordering of tick
1235	* nohz functions that would need to follow TIF_NR_POLLING
1236	* clearing:
1237	*
1238	* - On most architectures, a simple fetch_or on ti::flags with a
1239	* "0" value would be enough to know if an IPI needs to be sent.
1240	*
1241	* - x86 needs to perform a last need_resched() check between
1242	* monitor and mwait which doesn't take timers into account.
1243	* There a dedicated TIF_TIMER flag would be required to
1244	* fetch_or here and be checked along with TIF_NEED_RESCHED
1245	* before mwait().
1246	*
1247	* However, remote timer enqueue is not such a frequent event
1248	* and testing of the above solutions didn't appear to report
1249	* much benefits.
1250	*/
1251	if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
1252	smp_send_reschedule(cpu);
1253	else
1254	trace_sched_wake_idle_without_ipi(cpu);
1255	}
1256
1257	static bool wake_up_full_nohz_cpu(int cpu)
1258	{
1259	/*
1260	* We just need the target to call irq_exit() and re-evaluate
1261	* the next tick. The nohz full kick at least implies that.
1262	* If needed we can still optimize that later with an
1263	* empty IRQ.
1264	*/
1265	if (cpu_is_offline(cpu))
1266	return true; / Don't try to wake offline CPUs. /
1267	if (tick_nohz_full_cpu(cpu)) {
1268	if (cpu != smp_processor_id() \|\|
1269	tick_nohz_tick_stopped())
1270	tick_nohz_full_kick_cpu(cpu);
1271	return true;
1272	}
1273
1274	return false;
1275	}
1276
/*
 * Wake up the specified CPU. If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 *
 * The nohz_full path is tried first; the idle-CPU wakeup is only used when
 * wake_up_full_nohz_cpu() returns false.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (wake_up_full_nohz_cpu(cpu))
		return;

	wake_up_idle_cpu(cpu);
}
1287
1288	static void nohz_csd_func(void *info)
1289	{
1290	struct rq *rq = info;
1291	int cpu = cpu_of(rq);
1292	unsigned int flags;
1293
1294	/*
1295	* Release the rq::nohz_csd.
1296	*/
1297	flags = atomic_fetch_andnot(NOHZ_KICK_MASK \| NOHZ_NEWILB_KICK, nohz_flags(cpu));
1298	WARN_ON(!(flags & NOHZ_KICK_MASK));
1299
1300	rq->idle_balance = idle_cpu(cpu);
1301	if (rq->idle_balance) {
1302	rq->nohz_idle_balance = flags;
1303	__raise_softirq_irqoff(nr: SCHED_SOFTIRQ);
1304	}
1305	}
1306
1307	#endif /* CONFIG_NO_HZ_COMMON */
1308
1309	#ifdef CONFIG_NO_HZ_FULL
1310	static inline bool __need_bw_check(struct rq rq, struct* task_struct *p)
1311	{
1312	if (rq->nr_running != `1`)
1313	return false;
1314
1315	if (p->sched_class != &fair_sched_class)
1316	return false;
1317
1318	if (!task_on_rq_queued(p))
1319	return false;
1320
1321	return true;
1322	}
1323
1324	bool sched_can_stop_tick(struct rq *rq)
1325	{
1326	int fifo_nr_running;
1327
1328	/ Deadline tasks, even if single, need the tick /
1329	if (rq->dl.dl_nr_running)
1330	return false;
1331
1332	/*
1333	* If there are more than one RR tasks, we need the tick to affect the
1334	* actual RR behaviour.
1335	*/
1336	if (rq->rt.rr_nr_running) {
1337	if (rq->rt.rr_nr_running == `1`)
1338	return true;
1339	else
1340	return false;
1341	}
1342
1343	/*
1344	* If there's no RR tasks, but FIFO tasks, we can skip the tick, no
1345	* forced preemption between FIFO tasks.
1346	*/
1347	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
1348	if (fifo_nr_running)
1349	return true;
1350
1351	/*
1352	* If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
1353	* left. For CFS, if there's more than one we need the tick for
1354	* involuntary preemption. For SCX, ask.
1355	*/
1356	if (scx_enabled() && !scx_can_stop_tick(rq))
1357	return false;
1358
1359	if (rq->cfs.h_nr_queued > `1`)
1360	return false;
1361
1362	/*
1363	* If there is one task and it has CFS runtime bandwidth constraints
1364	* and it's on the cpu now we don't want to stop the tick.
1365	* This check prevents clearing the bit if a newly enqueued task here is
1366	* dequeued by migrating while the constrained task continues to run.
1367	* E.g. going from 2->1 without going through pick_next_task().
1368	*/
1369	if (__need_bw_check(rq, rq->curr)) {
1370	if (cfs_task_bw_constrained(rq->curr))
1371	return false;
1372	}
1373
1374	return true;
1375	}
1376	#endif /* CONFIG_NO_HZ_FULL */
1377	#endif /* CONFIG_SMP */
1378
1379	#if defined(CONFIG_RT_GROUP_SCHED) \|\| (defined(CONFIG_FAIR_GROUP_SCHED) && \
1380	(defined(CONFIG_SMP) \|\| defined(CONFIG_CFS_BANDWIDTH)))
1381	/*
1382	* Iterate task_group tree rooted at *from, calling @down when first entering a
1383	* node and @up when leaving it for the final time.
1384	*
1385	* Caller must hold rcu_lock or sufficient equivalent.
1386	*/
1387	int walk_tg_tree_from(struct task_group *from,
1388	tg_visitor down, tg_visitor up, void *data)
1389	{
1390	struct task_group parent, child;
1391	int ret;
1392
1393	parent = from;
1394
1395	down:
1396	ret = (*down)(parent, data);
1397	if (ret)
1398	goto out;
1399	list_for_each_entry_rcu(child, &parent->children, siblings) {
1400	parent = child;
1401	goto down;
1402
1403	up:
1404	continue;
1405	}
1406	ret = (*up)(parent, data);
1407	if (ret \|\| parent == from)
1408	goto out;
1409
1410	child = parent;
1411	parent = parent->parent;
1412	if (parent)
1413	goto up;
1414	out:
1415	return ret;
1416	}
1417
/*
 * tg_nop - visitor that does nothing.
 *
 * Ignores both arguments and always returns 0.
 */
int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
1422	#endif
1423
1424	void set_load_weight(struct task_struct *p, bool update_load)
1425	{
1426	int prio = p->static_prio - MAX_RT_PRIO;
1427	struct load_weight lw;
1428
1429	if (task_has_idle_policy(p)) {
1430	lw.weight = scale_load(WEIGHT_IDLEPRIO);
1431	lw.inv_weight = WMULT_IDLEPRIO;
1432	} else {
1433	lw.weight = scale_load(sched_prio_to_weight[prio]);
1434	lw.inv_weight = sched_prio_to_wmult[prio];
1435	}
1436
1437	/*
1438	* SCHED_OTHER tasks have to update their load when changing their
1439	* weight
1440	*/
1441	if (update_load && p->sched_class->reweight_task)
1442	p->sched_class->reweight_task(task_rq(p), p, &lw);
1443	else
1444	p->se.load = lw;
1445	}
1446
1447	#ifdef CONFIG_UCLAMP_TASK
1448	/*
1449	* Serializes updates of utilization clamp values
1450	*
1451	* The (slow-path) user-space triggers utilization clamp value updates which
1452	* can require updates on (fast-path) scheduler's data structures used to
1453	* support enqueue/dequeue operations.
1454	* While the per-CPU rq lock protects fast-path update operations, user-space
1455	* requests are serialized using a mutex to reduce the risk of conflicting
1456	* updates or API abuses.
1457	*/
1458	static __maybe_unused DEFINE_MUTEX(uclamp_mutex);
1459
1460	/ Max allowed minimum utilization /
1461	static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
1462
1463	/ Max allowed maximum utilization /
1464	static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
1465
1466	/*
1467	* By default RT tasks run at the maximum performance point/capacity of the
1468	* system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1469	* SCHED_CAPACITY_SCALE.
1470	*
1471	* This knob allows admins to change the default behavior when uclamp is being
1472	* used. In battery powered devices, particularly, running at the maximum
1473	* capacity and frequency will increase energy consumption and shorten the
1474	* battery life.
1475	*
1476	* This knob only affects RT tasks that their uclamp_se->user_defined == false.
1477	*
1478	* This knob will not override the system default sched_util_clamp_min defined
1479	* above.
1480	*/
1481	unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1482
1483	/ All clamps are required to be less or equal than these values /
1484	static struct uclamp_se uclamp_default[UCLAMP_CNT];
1485
1486	/*
1487	* This static key is used to reduce the uclamp overhead in the fast path. It
1488	* primarily disables the call to uclamp_rq_{inc, dec}() in
1489	* enqueue/dequeue_task().
1490	*
1491	* This allows users to continue to enable uclamp in their kernel config with
1492	* minimum uclamp overhead in the fast path.
1493	*
1494	* As soon as userspace modifies any of the uclamp knobs, the static key is
1495	* enabled, since we have an actual users that make use of uclamp
1496	* functionality.
1497	*
1498	* The knobs that would enable this static key are:
1499	*
1500	* * A task modifying its uclamp value with sched_setattr().
1501	* * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1502	* * An admin modifying the cgroup cpu.uclamp.{min, max}
1503	*/
1504	DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1505
1506	static inline unsigned int
1507	uclamp_idle_value(struct rq rq, enum* uclamp_id clamp_id,
1508	unsigned int clamp_value)
1509	{
1510	/*
1511	* Avoid blocked utilization pushing up the frequency when we go
1512	* idle (which drops the max-clamp) by retaining the last known
1513	* max-clamp.
1514	*/
1515	if (clamp_id == UCLAMP_MAX) {
1516	rq->uclamp_flags \|= UCLAMP_FLAG_IDLE;
1517	return clamp_value;
1518	}
1519
1520	return uclamp_none(clamp_id: UCLAMP_MIN);
1521	}
1522
1523	static inline void uclamp_idle_reset(struct rq rq, enum* uclamp_id clamp_id,
1524	unsigned int clamp_value)
1525	{
1526	/ Reset max-clamp retention only on idle exit /
1527	if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1528	return;
1529
1530	uclamp_rq_set(rq, clamp_id, value: clamp_value);
1531	}
1532
1533	static inline
1534	unsigned int uclamp_rq_max_value(struct rq rq, enum* uclamp_id clamp_id,
1535	unsigned int clamp_value)
1536	{
1537	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1538	int bucket_id = UCLAMP_BUCKETS - `1`;
1539
1540	/*
1541	* Since both min and max clamps are max aggregated, find the
1542	* top most bucket with tasks in.
1543	*/
1544	for ( ; bucket_id >= `0`; bucket_id--) {
1545	if (!bucket[bucket_id].tasks)
1546	continue;
1547	return bucket[bucket_id].value;
1548	}
1549
1550	/ No tasks -- default clamp values /
1551	return uclamp_idle_value(rq, clamp_id, clamp_value);
1552	}
1553
1554	static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1555	{
1556	unsigned int default_util_min;
1557	struct uclamp_se *uc_se;
1558
1559	lockdep_assert_held(&p->pi_lock);
1560
1561	uc_se = &p->uclamp_req[UCLAMP_MIN];
1562
1563	/ Only sync if user didn't override the default /
1564	if (uc_se->user_defined)
1565	return;
1566
1567	default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1568	uclamp_se_set(uc_se, value: default_util_min, user_defined: false);
1569	}
1570
1571	static void uclamp_update_util_min_rt_default(struct task_struct *p)
1572	{
1573	if (!rt_task(p))
1574	return;
1575
1576	/ Protect updates to p->uclamp_* /
1577	guard(task_rq_lock)(l: p);
1578	__uclamp_update_util_min_rt_default(p);
1579	}
1580
1581	static inline struct uclamp_se
1582	uclamp_tg_restrict(struct task_struct p, enum* uclamp_id clamp_id)
1583	{
1584	/ Copy by value as we could modify it /
1585	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1586	#ifdef CONFIG_UCLAMP_TASK_GROUP
1587	unsigned int tg_min, tg_max, value;
1588
1589	/*
1590	* Tasks in autogroups or root task group will be
1591	* restricted by system defaults.
1592	*/
1593	if (task_group_is_autogroup(tg: task_group(p)))
1594	return uc_req;
1595	if (task_group(p) == &root_task_group)
1596	return uc_req;
1597
1598	tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1599	tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1600	value = uc_req.value;
1601	value = clamp(value, tg_min, tg_max);
1602	uclamp_se_set(uc_se: &uc_req, value, user_defined: false);
1603	#endif
1604
1605	return uc_req;
1606	}
1607
1608	/*
1609	* The effective clamp bucket index of a task depends on, by increasing
1610	* priority:
1611	* - the task specific clamp value, when explicitly requested from userspace
1612	* - the task group effective clamp value, for tasks not either in the root
1613	* group or in an autogroup
1614	* - the system default clamp value, defined by the sysadmin
1615	*/
1616	static inline struct uclamp_se
1617	uclamp_eff_get(struct task_struct p, enum* uclamp_id clamp_id)
1618	{
1619	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1620	struct uclamp_se uc_max = uclamp_default[clamp_id];
1621
1622	/ System default restrictions always apply /
1623	if (unlikely(uc_req.value > uc_max.value))
1624	return uc_max;
1625
1626	return uc_req;
1627	}
1628
1629	unsigned long uclamp_eff_value(struct task_struct p, enum* uclamp_id clamp_id)
1630	{
1631	struct uclamp_se uc_eff;
1632
1633	/ Task currently refcounted: use back-annotated (effective) value /
1634	if (p->uclamp[clamp_id].active)
1635	return (unsigned long)p->uclamp[clamp_id].value;
1636
1637	uc_eff = uclamp_eff_get(p, clamp_id);
1638
1639	return (unsigned long)uc_eff.value;
1640	}
1641
1642	/*
1643	* When a task is enqueued on a rq, the clamp bucket currently defined by the
1644	* task's uclamp::bucket_id is refcounted on that rq. This also immediately
1645	* updates the rq's clamp value if required.
1646	*
1647	* Tasks can have a task-specific value requested from user-space, track
1648	* within each bucket the maximum value for tasks refcounted in it.
1649	* This "local max aggregation" allows to track the exact "requested" value
1650	* for each bucket when all its RUNNABLE tasks require the same clamp.
1651	*/
1652	static inline void uclamp_rq_inc_id(struct rq rq, struct* task_struct *p,
1653	enum uclamp_id clamp_id)
1654	{
1655	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1656	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1657	struct uclamp_bucket *bucket;
1658
1659	lockdep_assert_rq_held(rq);
1660
1661	/ Update task effective clamp /
1662	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1663
1664	bucket = &uc_rq->bucket[uc_se->bucket_id];
1665	bucket->tasks++;
1666	uc_se->active = true;
1667
1668	uclamp_idle_reset(rq, clamp_id, clamp_value: uc_se->value);
1669
1670	/*
1671	* Local max aggregation: rq buckets always track the max
1672	* "requested" clamp value of its RUNNABLE tasks.
1673	*/
1674	if (bucket->tasks == `1` \|\| uc_se->value > bucket->value)
1675	bucket->value = uc_se->value;
1676
1677	if (uc_se->value > uclamp_rq_get(rq, clamp_id))
1678	uclamp_rq_set(rq, clamp_id, value: uc_se->value);
1679	}
1680
1681	/*
1682	* When a task is dequeued from a rq, the clamp bucket refcounted by the task
1683	* is released. If this is the last task reference counting the rq's max
1684	* active clamp value, then the rq's clamp value is updated.
1685	*
1686	* Both refcounted tasks and rq's cached clamp values are expected to be
1687	* always valid. If it's detected they are not, as defensive programming,
1688	* enforce the expected state and warn.
1689	*/
1690	static inline void uclamp_rq_dec_id(struct rq rq, struct* task_struct *p,
1691	enum uclamp_id clamp_id)
1692	{
1693	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1694	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1695	struct uclamp_bucket *bucket;
1696	unsigned int bkt_clamp;
1697	unsigned int rq_clamp;
1698
1699	lockdep_assert_rq_held(rq);
1700
1701	/*
1702	* If sched_uclamp_used was enabled after task @p was enqueued,
1703	* we could end up with unbalanced call to uclamp_rq_dec_id().
1704	*
1705	* In this case the uc_se->active flag should be false since no uclamp
1706	* accounting was performed at enqueue time and we can just return
1707	* here.
1708	*
1709	* Need to be careful of the following enqueue/dequeue ordering
1710	* problem too
1711	*
1712	* enqueue(taskA)
1713	* // sched_uclamp_used gets enabled
1714	* enqueue(taskB)
1715	* dequeue(taskA)
1716	* // Must not decrement bucket->tasks here
1717	* dequeue(taskB)
1718	*
1719	* where we could end up with stale data in uc_se and
1720	* bucket[uc_se->bucket_id].
1721	*
1722	* The following check here eliminates the possibility of such race.
1723	*/
1724	if (unlikely(!uc_se->active))
1725	return;
1726
1727	bucket = &uc_rq->bucket[uc_se->bucket_id];
1728
1729	WARN_ON_ONCE(!bucket->tasks);
1730	if (likely(bucket->tasks))
1731	bucket->tasks--;
1732
1733	uc_se->active = false;
1734
1735	/*
1736	* Keep "local max aggregation" simple and accept to (possibly)
1737	* overboost some RUNNABLE tasks in the same bucket.
1738	* The rq clamp bucket value is reset to its base value whenever
1739	* there are no more RUNNABLE tasks refcounting it.
1740	*/
1741	if (likely(bucket->tasks))
1742	return;
1743
1744	rq_clamp = uclamp_rq_get(rq, clamp_id);
1745	/*
1746	* Defensive programming: this should never happen. If it happens,
1747	* e.g. due to future modification, warn and fix up the expected value.
1748	*/
1749	WARN_ON_ONCE(bucket->value > rq_clamp);
1750	if (bucket->value >= rq_clamp) {
1751	bkt_clamp = uclamp_rq_max_value(rq, clamp_id, clamp_value: uc_se->value);
1752	uclamp_rq_set(rq, clamp_id, value: bkt_clamp);
1753	}
1754	}
1755
1756	static inline void uclamp_rq_inc(struct rq rq, struct* task_struct p, int* flags)
1757	{
1758	enum uclamp_id clamp_id;
1759
1760	/*
1761	* Avoid any overhead until uclamp is actually used by the userspace.
1762	*
1763	* The condition is constructed such that a NOP is generated when
1764	* sched_uclamp_used is disabled.
1765	*/
1766	if (!uclamp_is_used())
1767	return;
1768
1769	if (unlikely(!p->sched_class->uclamp_enabled))
1770	return;
1771
1772	/ Only inc the delayed task which being woken up. /
1773	if (p->se.sched_delayed && !(flags & ENQUEUE_DELAYED))
1774	return;
1775
1776	for_each_clamp_id(clamp_id)
1777	uclamp_rq_inc_id(rq, p, clamp_id);
1778
1779	/ Reset clamp idle holding when there is one RUNNABLE task /
1780	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
1781	rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1782	}
1783
1784	static inline void uclamp_rq_dec(struct rq rq, struct* task_struct *p)
1785	{
1786	enum uclamp_id clamp_id;
1787
1788	/*
1789	* Avoid any overhead until uclamp is actually used by the userspace.
1790	*
1791	* The condition is constructed such that a NOP is generated when
1792	* sched_uclamp_used is disabled.
1793	*/
1794	if (!uclamp_is_used())
1795	return;
1796
1797	if (unlikely(!p->sched_class->uclamp_enabled))
1798	return;
1799
1800	if (p->se.sched_delayed)
1801	return;
1802
1803	for_each_clamp_id(clamp_id)
1804	uclamp_rq_dec_id(rq, p, clamp_id);
1805	}
1806
1807	static inline void uclamp_rq_reinc_id(struct rq rq, struct* task_struct *p,
1808	enum uclamp_id clamp_id)
1809	{
1810	if (!p->uclamp[clamp_id].active)
1811	return;
1812
1813	uclamp_rq_dec_id(rq, p, clamp_id);
1814	uclamp_rq_inc_id(rq, p, clamp_id);
1815
1816	/*
1817	* Make sure to clear the idle flag if we've transiently reached 0
1818	* active tasks on rq.
1819	*/
1820	if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
1821	rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1822	}
1823
1824	static inline void
1825	uclamp_update_active(struct task_struct *p)
1826	{
1827	enum uclamp_id clamp_id;
1828	struct rq_flags rf;
1829	struct rq *rq;
1830
1831	/*
1832	* Lock the task and the rq where the task is (or was) queued.
1833	*
1834	* We might lock the (previous) rq of a !RUNNABLE task, but that's the
1835	* price to pay to safely serialize util_{min,max} updates with
1836	* enqueues, dequeues and migration operations.
1837	* This is the same locking schema used by __set_cpus_allowed_ptr().
1838	*/
1839	rq = task_rq_lock(p, rf: &rf);
1840
1841	/*
1842	* Setting the clamp bucket is serialized by task_rq_lock().
1843	* If the task is not yet RUNNABLE and its task_struct is not
1844	* affecting a valid clamp bucket, the next time it's enqueued,
1845	* it will already see the updated clamp bucket value.
1846	*/
1847	for_each_clamp_id(clamp_id)
1848	uclamp_rq_reinc_id(rq, p, clamp_id);
1849
1850	task_rq_unlock(rq, p, rf: &rf);
1851	}
1852
#ifdef CONFIG_UCLAMP_TASK_GROUP
/*
 * Call uclamp_update_active() on every task returned by a css task
 * iterator over @css (started with flags 0).
 */
static inline void
uclamp_update_active_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *p;

	css_task_iter_start(css, 0, &it);
	while ((p = css_task_iter_next(&it)))
		uclamp_update_active(p);
	css_task_iter_end(&it);
}

/* Forward declaration; the definition is outside this part of the file. */
static void cpu_util_update_eff(struct cgroup_subsys_state *css);
#endif
1868
1869	#ifdef CONFIG_SYSCTL
#ifdef CONFIG_UCLAMP_TASK_GROUP
/*
 * Copy the sysctl min/max values into the root task group's clamp requests
 * and call cpu_util_update_eff() on the root group under an RCU guard.
 */
static void uclamp_update_root_tg(void)
{
	struct task_group *tg = &root_task_group;

	uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
		      sysctl_sched_uclamp_util_min, false);
	uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
		      sysctl_sched_uclamp_util_max, false);

	guard(rcu)();
	cpu_util_update_eff(&root_task_group.css);
}
#else
/* Without task-group clamping there is no root group state to update. */
static void uclamp_update_root_tg(void) { }
#endif
1886
1887	static void uclamp_sync_util_min_rt_default(void)
1888	{
1889	struct task_struct g, p;
1890
1891	/*
1892	* copy_process() sysctl_uclamp
1893	* uclamp_min_rt = X;
1894	* write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1895	* // link thread smp_mb__after_spinlock()
1896	* write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1897	* sched_post_fork() for_each_process_thread()
1898	* __uclamp_sync_rt() __uclamp_sync_rt()
1899	*
1900	* Ensures that either sched_post_fork() will observe the new
1901	* uclamp_min_rt or for_each_process_thread() will observe the new
1902	* task.
1903	*/
1904	read_lock(&tasklist_lock);
1905	smp_mb__after_spinlock();
1906	read_unlock(&tasklist_lock);
1907
1908	guard(rcu)();
1909	for_each_process_thread(g, p)
1910	uclamp_update_util_min_rt_default(p);
1911	}
1912
1913	static int sysctl_sched_uclamp_handler(const struct ctl_table table, int* write,
1914	void buffer, size_t lenp, loff_t *ppos)
1915	{
1916	bool update_root_tg = false;
1917	int old_min, old_max, old_min_rt;
1918	int result;
1919
1920	guard(mutex)(T: &uclamp_mutex);
1921
1922	old_min = sysctl_sched_uclamp_util_min;
1923	old_max = sysctl_sched_uclamp_util_max;
1924	old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1925
1926	result = proc_dointvec(table, write, buffer, lenp, ppos);
1927	if (result)
1928	goto undo;
1929	if (!write)
1930	return `0`;
1931
1932	if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max \|\|
1933	sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE \|\|
1934	sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1935
1936	result = -EINVAL;
1937	goto undo;
1938	}
1939
1940	if (old_min != sysctl_sched_uclamp_util_min) {
1941	uclamp_se_set(uc_se: &uclamp_default[UCLAMP_MIN],
1942	value: sysctl_sched_uclamp_util_min, user_defined: false);
1943	update_root_tg = true;
1944	}
1945	if (old_max != sysctl_sched_uclamp_util_max) {
1946	uclamp_se_set(uc_se: &uclamp_default[UCLAMP_MAX],
1947	value: sysctl_sched_uclamp_util_max, user_defined: false);
1948	update_root_tg = true;
1949	}
1950
1951	if (update_root_tg) {
1952	sched_uclamp_enable();
1953	uclamp_update_root_tg();
1954	}
1955
1956	if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1957	sched_uclamp_enable();
1958	uclamp_sync_util_min_rt_default();
1959	}
1960
1961	/*
1962	* We update all RUNNABLE tasks only when task groups are in use.
1963	* Otherwise, keep it simple and do just a lazy update at each next
1964	* task enqueue time.
1965	*/
1966	return `0`;
1967
1968	undo:
1969	sysctl_sched_uclamp_util_min = old_min;
1970	sysctl_sched_uclamp_util_max = old_max;
1971	sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1972	return result;
1973	}
1974	#endif
1975
1976	static void uclamp_fork(struct task_struct *p)
1977	{
1978	enum uclamp_id clamp_id;
1979
1980	/*
1981	* We don't need to hold task_rq_lock() when updating p->uclamp_* here
1982	* as the task is still at its early fork stages.
1983	*/
1984	for_each_clamp_id(clamp_id)
1985	p->uclamp[clamp_id].active = false;
1986
1987	if (likely(!p->sched_reset_on_fork))
1988	return;
1989
1990	for_each_clamp_id(clamp_id) {
1991	uclamp_se_set(uc_se: &p->uclamp_req[clamp_id],
1992	value: uclamp_none(clamp_id), user_defined: false);
1993	}
1994	}
1995
/*
 * Post-fork uclamp hook: forwards @p to
 * uclamp_update_util_min_rt_default().
 */
static void uclamp_post_fork(struct task_struct *p)
{
	uclamp_update_util_min_rt_default(p);
}
2000
2001	static void __init init_uclamp_rq(struct rq *rq)
2002	{
2003	enum uclamp_id clamp_id;
2004	struct uclamp_rq *uc_rq = rq->uclamp;
2005
2006	for_each_clamp_id(clamp_id) {
2007	uc_rq[clamp_id] = (struct uclamp_rq) {
2008	.value = uclamp_none(clamp_id)
2009	};
2010	}
2011
2012	rq->uclamp_flags = UCLAMP_FLAG_IDLE;
2013	}
2014
2015	static void __init init_uclamp(void)
2016	{
2017	struct uclamp_se uc_max = {};
2018	enum uclamp_id clamp_id;
2019	int cpu;
2020
2021	for_each_possible_cpu(cpu)
2022	init_uclamp_rq(cpu_rq(cpu));
2023
2024	for_each_clamp_id(clamp_id) {
2025	uclamp_se_set(uc_se: &init_task.uclamp_req[clamp_id],
2026	value: uclamp_none(clamp_id), user_defined: false);
2027	}
2028
2029	/ System defaults allow max clamp values for both indexes /
2030	uclamp_se_set(uc_se: &uc_max, value: uclamp_none(clamp_id: UCLAMP_MAX), user_defined: false);
2031	for_each_clamp_id(clamp_id) {
2032	uclamp_default[clamp_id] = uc_max;
2033	#ifdef CONFIG_UCLAMP_TASK_GROUP
2034	root_task_group.uclamp_req[clamp_id] = uc_max;
2035	root_task_group.uclamp[clamp_id] = uc_max;
2036	#endif
2037	}
2038	}
2039
2040	#else /* !CONFIG_UCLAMP_TASK */
/* No-op stand-ins used when CONFIG_UCLAMP_TASK is disabled. */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
2046	#endif /* CONFIG_UCLAMP_TASK */
2047
2048	bool sched_task_on_rq(struct task_struct *p)
2049	{
2050	return task_on_rq_queued(p);
2051	}
2052
2053	unsigned long get_wchan(struct task_struct *p)
2054	{
2055	unsigned long ip = `0`;
2056	unsigned int state;
2057
2058	if (!p \|\| p == current)
2059	return `0`;
2060
2061	/ Only get wchan if task is blocked and we can keep it that way. /
2062	raw_spin_lock_irq(&p->pi_lock);
2063	state = READ_ONCE(p->__state);
2064	smp_rmb(); / see try_to_wake_up() /
2065	if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
2066	ip = __get_wchan(p);
2067	raw_spin_unlock_irq(&p->pi_lock);
2068
2069	return ip;
2070	}
2071
2072	void enqueue_task(struct rq rq, struct* task_struct p, int* flags)
2073	{
2074	if (!(flags & ENQUEUE_NOCLOCK))
2075	update_rq_clock(rq);
2076
2077	/*
2078	* Can be before ->enqueue_task() because uclamp considers the
2079	* ENQUEUE_DELAYED task before its ->sched_delayed gets cleared
2080	* in ->enqueue_task().
2081	*/
2082	uclamp_rq_inc(rq, p, flags);
2083
2084	p->sched_class->enqueue_task(rq, p, flags);
2085
2086	psi_enqueue(p, flags);
2087
2088	if (!(flags & ENQUEUE_RESTORE))
2089	sched_info_enqueue(rq, t: p);
2090
2091	if (sched_core_enabled(rq))
2092	sched_core_enqueue(rq, p);
2093	}
2094
2095	/*
2096	* Must only return false when DEQUEUE_SLEEP.
2097	*/
2098	inline bool dequeue_task(struct rq rq, struct* task_struct p, int* flags)
2099	{
2100	if (sched_core_enabled(rq))
2101	sched_core_dequeue(rq, p, flags);
2102
2103	if (!(flags & DEQUEUE_NOCLOCK))
2104	update_rq_clock(rq);
2105
2106	if (!(flags & DEQUEUE_SAVE))
2107	sched_info_dequeue(rq, t: p);
2108
2109	psi_dequeue(p, flags);
2110
2111	/*
2112	* Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
2113	* and mark the task ->sched_delayed.
2114	*/
2115	uclamp_rq_dec(rq, p);
2116	return p->sched_class->dequeue_task(rq, p, flags);
2117	}
2118
2119	void activate_task(struct rq rq, struct* task_struct p, int* flags)
2120	{
2121	if (task_on_rq_migrating(p))
2122	flags \|= ENQUEUE_MIGRATED;
2123	if (flags & ENQUEUE_MIGRATED)
2124	sched_mm_cid_migrate_to(dst_rq: rq, t: p);
2125
2126	enqueue_task(rq, p, flags);
2127
2128	WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
2129	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
2130	}
2131
2132	void deactivate_task(struct rq rq, struct* task_struct p, int* flags)
2133	{
2134	WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
2135
2136	WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
2137	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
2138
2139	/*
2140	* Code explicitly relies on TASK_ON_RQ_MIGRATING begin set before
2141	* dequeue_task() and cleared after enqueue_task().
2142	*/
2143
2144	dequeue_task(rq, p, flags);
2145	}
2146
2147	static void block_task(struct rq rq, struct* task_struct p, int* flags)
2148	{
2149	if (dequeue_task(rq, p, DEQUEUE_SLEEP \| flags))
2150	__block_task(rq, p);
2151	}
2152
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Compares @p with the current task of the CPU reported by task_cpu(@p).
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	const struct task_struct *running = cpu_curr(task_cpu(p));

	return running == p;
}
2163
2164	/*
2165	* ->switching_to() is called with the pi_lock and rq_lock held and must not
2166	* mess with locking.
2167	*/
2168	void check_class_changing(struct rq rq, struct* task_struct *p,
2169	const struct sched_class *prev_class)
2170	{
2171	if (prev_class != p->sched_class && p->sched_class->switching_to)
2172	p->sched_class->switching_to(rq, p);
2173	}
2174
2175	/*
2176	* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
2177	* use the balance_callback list if you want balancing.
2178	*
2179	* this means any call to check_class_changed() must be followed by a call to
2180	* balance_callback().
2181	*/
2182	void check_class_changed(struct rq rq, struct* task_struct *p,
2183	const struct sched_class *prev_class,
2184	int oldprio)
2185	{
2186	if (prev_class != p->sched_class) {
2187	if (prev_class->switched_from)
2188	prev_class->switched_from(rq, p);
2189
2190	p->sched_class->switched_to(rq, p);
2191	} else if (oldprio != p->prio \|\| dl_task(p))
2192	p->sched_class->prio_changed(rq, p, oldprio);
2193	}
2194
2195	void wakeup_preempt(struct rq rq, struct* task_struct p, int* flags)
2196	{
2197	struct task_struct *donor = rq->donor;
2198
2199	if (p->sched_class == donor->sched_class)
2200	donor->sched_class->wakeup_preempt(rq, p, flags);
2201	else if (sched_class_above(p->sched_class, donor->sched_class))
2202	resched_curr(rq);
2203
2204	/*
2205	* A queue event has occurred, and we're going to schedule. In
2206	* this case, we can save a useless back to back clock update.
2207	*/
2208	if (task_on_rq_queued(p: donor) && test_tsk_need_resched(tsk: rq->curr))
2209	rq_clock_skip_update(rq);
2210	}
2211
2212	static __always_inline
2213	int __task_state_match(struct task_struct p, unsigned* int state)
2214	{
2215	if (READ_ONCE(p->__state) & state)
2216	return `1`;
2217
2218	if (READ_ONCE(p->saved_state) & state)
2219	return -`1`;
2220
2221	return `0`;
2222	}
2223
2224	static __always_inline
2225	int task_state_match(struct task_struct p, unsigned* int state)
2226	{
2227	/*
2228	* Serialize against current_save_and_set_rtlock_wait_state(),
2229	* current_restore_rtlock_saved_state(), and __refrigerator().
2230	*/
2231	guard(raw_spinlock_irq)(l: &p->pi_lock);
2232	return __task_state_match(p, state);
2233	}
2234
2235	/*
2236	* wait_task_inactive - wait for a thread to unschedule.
2237	*
2238	* Wait for the thread to block in any of the states set in @match_state.
2239	* If it changes, i.e. @p might have woken up, then return zero. When we
2240	* succeed in waiting for @p to be off its CPU, we return a positive number
2241	* (its total switch count). If a second call a short while later returns the
2242	* same number, the caller can be sure that @p has remained unscheduled the
2243	* whole time.
2244	*
2245	* The caller must ensure that the task will unschedule sometime soon,
2246	* else this function might spin for a long time. This function can't
2247	* be called with interrupts off, or it may introduce deadlock with
2248	* smp_call_function() if an IPI is sent by the same process we are
2249	* waiting to become inactive.
2250	*/
2251	unsigned long wait_task_inactive(struct task_struct p, unsigned* int match_state)
2252	{
2253	int running, queued, match;
2254	struct rq_flags rf;
2255	unsigned long ncsw;
2256	struct rq *rq;
2257
2258	for (;;) {
2259	/*
2260	* We do the initial early heuristics without holding
2261	* any task-queue locks at all. We'll only try to get
2262	* the runqueue lock when things look like they will
2263	* work out!
2264	*/
2265	rq = task_rq(p);
2266
2267	/*
2268	* If the task is actively running on another CPU
2269	* still, just relax and busy-wait without holding
2270	* any locks.
2271	*
2272	* NOTE! Since we don't hold any locks, it's not
2273	* even sure that "rq" stays as the right runqueue!
2274	* But we don't care, since "task_on_cpu()" will
2275	* return false if the runqueue has changed and p
2276	* is actually now running somewhere else!
2277	*/
2278	while (task_on_cpu(rq, p)) {
2279	if (!task_state_match(p, state: match_state))
2280	return `0`;
2281	cpu_relax();
2282	}
2283
2284	/*
2285	* Ok, time to look more closely! We need the rq
2286	* lock now, to be sure. If we're wrong, we'll
2287	* just go back and repeat.
2288	*/
2289	rq = task_rq_lock(p, rf: &rf);
2290	/*
2291	* If task is sched_delayed, force dequeue it, to avoid always
2292	* hitting the tick timeout in the queued case
2293	*/
2294	if (p->se.sched_delayed)
2295	dequeue_task(rq, p, DEQUEUE_SLEEP \| DEQUEUE_DELAYED);
2296	trace_sched_wait_task(p);
2297	running = task_on_cpu(rq, p);
2298	queued = task_on_rq_queued(p);
2299	ncsw = `0`;
2300	if ((match = __task_state_match(p, state: match_state))) {
2301	/*
2302	* When matching on p->saved_state, consider this task
2303	* still queued so it will wait.
2304	*/
2305	if (match < `0`)
2306	queued = `1`;
2307	ncsw = p->nvcsw \| LONG_MIN; / sets MSB /
2308	}
2309	task_rq_unlock(rq, p, rf: &rf);
2310
2311	/*
2312	* If it changed from the expected state, bail out now.
2313	*/
2314	if (unlikely(!ncsw))
2315	break;
2316
2317	/*
2318	* Was it really running after all now that we
2319	* checked with the proper locks actually held?
2320	*
2321	* Oops. Go back and try again..
2322	*/
2323	if (unlikely(running)) {
2324	cpu_relax();
2325	continue;
2326	}
2327
2328	/*
2329	* It's not enough that it's not actively running,
2330	* it must be off the runqueue _entirely_, and not
2331	* preempted!
2332	*
2333	* So if it was still runnable (but just not actively
2334	* running right now), it's preempted, and we should
2335	* yield - it could be a while.
2336	*/
2337	if (unlikely(queued)) {
2338	ktime_t to = NSEC_PER_SEC / HZ;
2339
2340	set_current_state(TASK_UNINTERRUPTIBLE);
2341	schedule_hrtimeout(expires: &to, mode: HRTIMER_MODE_REL_HARD);
2342	continue;
2343	}
2344
2345	/*
2346	* Ahh, all good. It wasn't running, and it wasn't
2347	* runnable, which means that it will never become
2348	* running in the future either. We're all done!
2349	*/
2350	break;
2351	}
2352
2353	return ncsw;
2354	}
2355
2356	#ifdef CONFIG_SMP
2357
/* Forward declaration; the definition follows set_cpus_allowed_common(). */
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
2360
2361	static void migrate_disable_switch(struct rq rq, struct* task_struct *p)
2362	{
2363	struct affinity_context ac = {
2364	.new_mask = cpumask_of(rq->cpu),
2365	.flags = SCA_MIGRATE_DISABLE,
2366	};
2367
2368	if (likely(!p->migration_disabled))
2369	return;
2370
2371	if (p->cpus_ptr != &p->cpus_mask)
2372	return;
2373
2374	/*
2375	* Violates locking rules! See comment in __do_set_cpus_allowed().
2376	*/
2377	__do_set_cpus_allowed(p, ctx: &ac);
2378	}
2379
2380	void migrate_disable(void)
2381	{
2382	struct task_struct *p = current;
2383
2384	if (p->migration_disabled) {
2385	#ifdef CONFIG_DEBUG_PREEMPT
2386	/*
2387	*Warn about overflow half-way through the range.
2388	*/
2389	WARN_ON_ONCE((s16)p->migration_disabled < `0`);
2390	#endif
2391	p->migration_disabled++;
2392	return;
2393	}
2394
2395	guard(preempt)();
2396	this_rq()->nr_pinned++;
2397	p->migration_disabled = `1`;
2398	}
2399	EXPORT_SYMBOL_GPL(migrate_disable);
2400
2401	void migrate_enable(void)
2402	{
2403	struct task_struct *p = current;
2404	struct affinity_context ac = {
2405	.new_mask = &p->cpus_mask,
2406	.flags = SCA_MIGRATE_ENABLE,
2407	};
2408
2409	#ifdef CONFIG_DEBUG_PREEMPT
2410	/*
2411	* Check both overflow from migrate_disable() and superfluous
2412	* migrate_enable().
2413	*/
2414	if (WARN_ON_ONCE((s16)p->migration_disabled <= `0`))
2415	return;
2416	#endif
2417
2418	if (p->migration_disabled > `1`) {
2419	p->migration_disabled--;
2420	return;
2421	}
2422
2423	/*
2424	* Ensure stop_task runs either before or after this, and that
2425	* __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
2426	*/
2427	guard(preempt)();
2428	if (p->cpus_ptr != &p->cpus_mask)
2429	__set_cpus_allowed_ptr(p, ctx: &ac);
2430	/*
2431	* Mustn't clear migration_disabled() until cpus_ptr points back at the
2432	* regular cpus_mask, otherwise things that race (eg.
2433	* select_fallback_rq) get confused.
2434	*/
2435	barrier();
2436	p->migration_disabled = `0`;
2437	this_rq()->nr_pinned--;
2438	}
2439	EXPORT_SYMBOL_GPL(migrate_enable);
2440
2441	static inline bool rq_has_pinned_tasks(struct rq *rq)
2442	{
2443	return rq->nr_pinned;
2444	}
2445
2446	/*
2447	* Per-CPU kthreads are allowed to run on !active && online CPUs, see
2448	* __set_cpus_allowed_ptr() and select_fallback_rq().
2449	*/
2450	static inline bool is_cpu_allowed(struct task_struct p, int* cpu)
2451	{
2452	/ When not in the task's cpumask, no point in looking further. /
2453	if (!task_allowed_on_cpu(p, cpu))
2454	return false;
2455
2456	/ migrate_disabled() must be allowed to finish. /
2457	if (is_migration_disabled(p))
2458	return cpu_online(cpu);
2459
2460	/ Non kernel threads are not allowed during either online or offline. /
2461	if (!(p->flags & PF_KTHREAD))
2462	return cpu_active(cpu);
2463
2464	/ KTHREAD_IS_PER_CPU is always allowed. /
2465	if (kthread_is_per_cpu(k: p))
2466	return cpu_online(cpu);
2467
2468	/ Regular kernel threads don't get to stay during offline. /
2469	if (cpu_dying(cpu))
2470	return false;
2471
2472	/ But are allowed during online. /
2473	return cpu_online(cpu);
2474	}
2475
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */
2489
2490	/*
2491	* move_queued_task - move a queued task to new rq.
2492	*
2493	* Returns (locked) new rq. Old rq's lock is released.
2494	*/
2495	static struct rq move_queued_task(struct* rq rq, struct* rq_flags *rf,
2496	struct task_struct p, int* new_cpu)
2497	{
2498	lockdep_assert_rq_held(rq);
2499
2500	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
2501	set_task_cpu(p, cpu: new_cpu);
2502	rq_unlock(rq, rf);
2503
2504	rq = cpu_rq(new_cpu);
2505
2506	rq_lock(rq, rf);
2507	WARN_ON_ONCE(task_cpu(p) != new_cpu);
2508	activate_task(rq, p, flags: `0`);
2509	wakeup_preempt(rq, p, flags: `0`);
2510
2511	return rq;
2512	}
2513
/*
 * Argument block handed to migration_cpu_stop().
 *
 * @task:     task to migrate
 * @dest_cpu: CPU the task should be moved to
 * @pending:  affinity request this migration belongs to, or NULL
 */
struct migration_arg {
	struct task_struct		*task;
	int				dest_cpu;
	struct set_affinity_pending	*pending;
};
2519
2520	/*
2521	* @refs: number of wait_for_completion()
2522	* @stop_pending: is @stop_work in use
2523	*/
2524	struct set_affinity_pending {
2525	refcount_t refs;
2526	unsigned int stop_pending;
2527	struct completion done;
2528	struct cpu_stop_work stop_work;
2529	struct migration_arg arg;
2530	};
2531
/*
 * Move (not current) task off this CPU, onto the destination CPU. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 *
 * Return: the runqueue that is locked on return; this is @rq itself when
 * @dest_cpu is not allowed for @p and nothing was moved.
 */
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
				 struct task_struct *p, int dest_cpu)
{
	/* Affinity changed (again). */
	if (!is_cpu_allowed(p, dest_cpu))
		return rq;

	rq = move_queued_task(rq, rf, p, dest_cpu);

	return rq;
}
2552
2553	/*
2554	* migration_cpu_stop - this will be executed by a high-prio stopper thread
2555	* and performs thread migration by bumping thread off CPU then
2556	* 'pushing' onto another runqueue.
2557	*/
2558	static int migration_cpu_stop(void *data)
2559	{
2560	struct migration_arg *arg = data;
2561	struct set_affinity_pending *pending = arg->pending;
2562	struct task_struct *p = arg->task;
2563	struct rq *rq = this_rq();
2564	bool complete = false;
2565	struct rq_flags rf;
2566
2567	/*
2568	* The original target CPU might have gone down and we might
2569	* be on another CPU but it doesn't matter.
2570	*/
2571	local_irq_save(rf.flags);
2572	/*
2573	* We need to explicitly wake pending tasks before running
2574	* __migrate_task() such that we will not miss enforcing cpus_ptr
2575	* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
2576	*/
2577	flush_smp_call_function_queue();
2578
2579	raw_spin_lock(&p->pi_lock);
2580	rq_lock(rq, rf: &rf);
2581
2582	/*
2583	* If we were passed a pending, then ->stop_pending was set, thus
2584	* p->migration_pending must have remained stable.
2585	*/
2586	WARN_ON_ONCE(pending && pending != p->migration_pending);
2587
2588	/*
2589	* If task_rq(p) != rq, it cannot be migrated here, because we're
2590	* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
2591	* we're holding p->pi_lock.
2592	*/
2593	if (task_rq(p) == rq) {
2594	if (is_migration_disabled(p))
2595	goto out;
2596
2597	if (pending) {
2598	p->migration_pending = NULL;
2599	complete = true;
2600
2601	if (cpumask_test_cpu(cpu: task_cpu(p), cpumask: &p->cpus_mask))
2602	goto out;
2603	}
2604
2605	if (task_on_rq_queued(p)) {
2606	update_rq_clock(rq);
2607	rq = __migrate_task(rq, rf: &rf, p, dest_cpu: arg->dest_cpu);
2608	} else {
2609	p->wake_cpu = arg->dest_cpu;
2610	}
2611
2612	/*
2613	* XXX __migrate_task() can fail, at which point we might end
2614	* up running on a dodgy CPU, AFAICT this can only happen
2615	* during CPU hotplug, at which point we'll get pushed out
2616	* anyway, so it's probably not a big deal.
2617	*/
2618
2619	} else if (pending) {
2620	/*
2621	* This happens when we get migrated between migrate_enable()'s
2622	* preempt_enable() and scheduling the stopper task. At that
2623	* point we're a regular task again and not current anymore.
2624	*
2625	* A !PREEMPT kernel has a giant hole here, which makes it far
2626	* more likely.
2627	*/
2628
2629	/*
2630	* The task moved before the stopper got to run. We're holding
2631	* ->pi_lock, so the allowed mask is stable - if it got
2632	* somewhere allowed, we're done.
2633	*/
2634	if (cpumask_test_cpu(cpu: task_cpu(p), cpumask: p->cpus_ptr)) {
2635	p->migration_pending = NULL;
2636	complete = true;
2637	goto out;
2638	}
2639
2640	/*
2641	* When migrate_enable() hits a rq mis-match we can't reliably
2642	* determine is_migration_disabled() and so have to chase after
2643	* it.
2644	*/
2645	WARN_ON_ONCE(!pending->stop_pending);
2646	preempt_disable();
2647	task_rq_unlock(rq, p, rf: &rf);
2648	stop_one_cpu_nowait(cpu: task_cpu(p), fn: migration_cpu_stop,
2649	arg: &pending->arg, work_buf: &pending->stop_work);
2650	preempt_enable();
2651	return `0`;
2652	}
2653	out:
2654	if (pending)
2655	pending->stop_pending = false;
2656	task_rq_unlock(rq, p, rf: &rf);
2657
2658	if (complete)
2659	complete_all(&pending->done);
2660
2661	return `0`;
2662	}
2663
2664	int push_cpu_stop(void *arg)
2665	{
2666	struct rq lowest_rq = NULL, rq = this_rq();
2667	struct task_struct *p = arg;
2668
2669	raw_spin_lock_irq(&p->pi_lock);
2670	raw_spin_rq_lock(rq);
2671
2672	if (task_rq(p) != rq)
2673	goto out_unlock;
2674
2675	if (is_migration_disabled(p)) {
2676	p->migration_flags \|= MDF_PUSH;
2677	goto out_unlock;
2678	}
2679
2680	p->migration_flags &= ~MDF_PUSH;
2681
2682	if (p->sched_class->find_lock_rq)
2683	lowest_rq = p->sched_class->find_lock_rq(p, rq);
2684
2685	if (!lowest_rq)
2686	goto out_unlock;
2687
2688	// XXX validate p is still the highest prio task
2689	if (task_rq(p) == rq) {
2690	move_queued_task_locked(src_rq: rq, dst_rq: lowest_rq, task: p);
2691	resched_curr(rq: lowest_rq);
2692	}
2693
2694	double_unlock_balance(this_rq: rq, busiest: lowest_rq);
2695
2696	out_unlock:
2697	rq->push_busy = false;
2698	raw_spin_rq_unlock(rq);
2699	raw_spin_unlock_irq(&p->pi_lock);
2700
2701	put_task_struct(t: p);
2702	return `0`;
2703	}
2704
2705	/*
2706	* sched_class::set_cpus_allowed must do the below, but is not required to
2707	* actually call this function.
2708	*/
2709	void set_cpus_allowed_common(struct task_struct p, struct* affinity_context *ctx)
2710	{
2711	if (ctx->flags & (SCA_MIGRATE_ENABLE \| SCA_MIGRATE_DISABLE)) {
2712	p->cpus_ptr = ctx->new_mask;
2713	return;
2714	}
2715
2716	cpumask_copy(dstp: &p->cpus_mask, srcp: ctx->new_mask);
2717	p->nr_cpus_allowed = cpumask_weight(srcp: ctx->new_mask);
2718
2719	/*
2720	* Swap in a new user_cpus_ptr if SCA_USER flag set
2721	*/
2722	if (ctx->flags & SCA_USER)
2723	swap(p->user_cpus_ptr, ctx->user_mask);
2724	}
2725
2726	static void
2727	__do_set_cpus_allowed(struct task_struct p, struct* affinity_context *ctx)
2728	{
2729	struct rq *rq = task_rq(p);
2730	bool queued, running;
2731
2732	/*
2733	* This here violates the locking rules for affinity, since we're only
2734	* supposed to change these variables while holding both rq->lock and
2735	* p->pi_lock.
2736	*
2737	* HOWEVER, it magically works, because ttwu() is the only code that
2738	* accesses these variables under p->pi_lock and only does so after
2739	* smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
2740	* before finish_task().
2741	*
2742	* XXX do further audits, this smells like something putrid.
2743	*/
2744	if (ctx->flags & SCA_MIGRATE_DISABLE)
2745	WARN_ON_ONCE(!p->on_cpu);
2746	else
2747	lockdep_assert_held(&p->pi_lock);
2748
2749	queued = task_on_rq_queued(p);
2750	running = task_current_donor(rq, p);
2751
2752	if (queued) {
2753	/*
2754	* Because __kthread_bind() calls this on blocked tasks without
2755	* holding rq->lock.
2756	*/
2757	lockdep_assert_rq_held(rq);
2758	dequeue_task(rq, p, DEQUEUE_SAVE \| DEQUEUE_NOCLOCK);
2759	}
2760	if (running)
2761	put_prev_task(rq, prev: p);
2762
2763	p->sched_class->set_cpus_allowed(p, ctx);
2764	mm_set_cpus_allowed(mm: p->mm, cpumask: ctx->new_mask);
2765
2766	if (queued)
2767	enqueue_task(rq, p, ENQUEUE_RESTORE \| ENQUEUE_NOCLOCK);
2768	if (running)
2769	set_next_task(rq, next: p);
2770	}
2771
2772	/*
2773	* Used for kthread_bind() and select_fallback_rq(), in both cases the user
2774	* affinity (if any) should be destroyed too.
2775	*/
2776	void do_set_cpus_allowed(struct task_struct p, const* struct cpumask *new_mask)
2777	{
2778	struct affinity_context ac = {
2779	.new_mask = new_mask,
2780	.user_mask = NULL,
2781	.flags = SCA_USER, / clear the user requested mask /
2782	};
2783	union cpumask_rcuhead {
2784	cpumask_t cpumask;
2785	struct rcu_head rcu;
2786	};
2787
2788	__do_set_cpus_allowed(p, ctx: &ac);
2789
2790	/*
2791	* Because this is called with p->pi_lock held, it is not possible
2792	* to use kfree() here (when PREEMPT_RT=y), therefore punt to using
2793	* kfree_rcu().
2794	*/
2795	kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
2796	}
2797
2798	int dup_user_cpus_ptr(struct task_struct dst, struct* task_struct *src,
2799	int node)
2800	{
2801	cpumask_t *user_mask;
2802	unsigned long flags;
2803
2804	/*
2805	* Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
2806	* may differ by now due to racing.
2807	*/
2808	dst->user_cpus_ptr = NULL;
2809
2810	/*
2811	* This check is racy and losing the race is a valid situation.
2812	* It is not worth the extra overhead of taking the pi_lock on
2813	* every fork/clone.
2814	*/
2815	if (data_race(!src->user_cpus_ptr))
2816	return `0`;
2817
2818	user_mask = alloc_user_cpus_ptr(node);
2819	if (!user_mask)
2820	return -ENOMEM;
2821
2822	/*
2823	* Use pi_lock to protect content of user_cpus_ptr
2824	*
2825	* Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
2826	* do_set_cpus_allowed().
2827	*/
2828	raw_spin_lock_irqsave(&src->pi_lock, flags);
2829	if (src->user_cpus_ptr) {
2830	swap(dst->user_cpus_ptr, user_mask);
2831	cpumask_copy(dstp: dst->user_cpus_ptr, srcp: src->user_cpus_ptr);
2832	}
2833	raw_spin_unlock_irqrestore(&src->pi_lock, flags);
2834
2835	if (unlikely(user_mask))
2836	kfree(objp: user_mask);
2837
2838	return `0`;
2839	}
2840
2841	static inline struct cpumask clear_user_cpus_ptr(struct* task_struct *p)
2842	{
2843	struct cpumask *user_mask = NULL;
2844
2845	swap(p->user_cpus_ptr, user_mask);
2846
2847	return user_mask;
2848	}
2849
/* Detach @p's user affinity mask and free it. */
void release_user_cpus_ptr(struct task_struct *p)
{
	kfree(clear_user_cpus_ptr(p));
}
2854
2855	/*
2856	* This function is wildly self concurrent; here be dragons.
2857	*
2858	*
2859	* When given a valid mask, __set_cpus_allowed_ptr() must block until the
2860	* designated task is enqueued on an allowed CPU. If that task is currently
2861	* running, we have to kick it out using the CPU stopper.
2862	*
2863	* Migrate-Disable comes along and tramples all over our nice sandcastle.
2864	* Consider:
2865	*
2866	* Initial conditions: P0->cpus_mask = [0, 1]
2867	*
2868	* P0@CPU0 P1
2869	*
2870	* migrate_disable();
2871	* <preempted>
2872	* set_cpus_allowed_ptr(P0, [1]);
2873	*
2874	* P1 cannot return from this set_cpus_allowed_ptr() call until P0 executes
2875	* its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
2876	* This means we need the following scheme:
2877	*
2878	* P0@CPU0 P1
2879	*
2880	* migrate_disable();
2881	* <preempted>
2882	* set_cpus_allowed_ptr(P0, [1]);
2883	* <blocks>
2884	* <resumes>
2885	* migrate_enable();
2886	* __set_cpus_allowed_ptr();
2887	* <wakes local stopper>
2888	* `--> <woken on migration completion>
2889	*
2890	* Now the fun stuff: there may be several P1-like tasks, i.e. multiple
2891	* concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any
2892	* task p are serialized by p->pi_lock, which we can leverage: the one that
2893	* should come into effect at the end of the Migrate-Disable region is the last
2894	* one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
2895	* but we still need to properly signal those waiting tasks at the appropriate
2896	* moment.
2897	*
2898	* This is implemented using struct set_affinity_pending. The first
2899	* __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
2900	* setup an instance of that struct and install it on the targeted task_struct.
2901	* Any and all further callers will reuse that instance. Those then wait for
2902	* a completion signaled at the tail of the CPU stopper callback (1), triggered
2903	* on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
2904	*
2905	*
2906	* (1) In the cases covered above. There is one more where the completion is
2907	* signaled within affine_move_task() itself: when a subsequent affinity request
2908	* occurs after the stopper bailed out due to the targeted task still being
2909	* Migrate-Disable. Consider:
2910	*
2911	* Initial conditions: P0->cpus_mask = [0, 1]
2912	*
2913	* CPU0 P1 P2
2914	* <P0>
2915	* migrate_disable();
2916	* <preempted>
2917	* set_cpus_allowed_ptr(P0, [1]);
2918	* <blocks>
2919	* <migration/0>
2920	* migration_cpu_stop()
2921	* is_migration_disabled()
2922	* <bails>
2923	* set_cpus_allowed_ptr(P0, [0, 1]);
2924	* <signal completion>
2925	* <awakes>
2926	*
2927	* Note that the above is safe vs a concurrent migrate_enable(), as any
2928	* pending affinity completion is preceded by an uninstallation of
2929	* p->migration_pending done with p->pi_lock held.
2930	*/
2931	static int affine_move_task(struct rq rq, struct* task_struct p, struct* rq_flags *rf,
2932	int dest_cpu, unsigned int flags)
2933	__releases(rq->lock)
2934	__releases(p->pi_lock)
2935	{
2936	struct set_affinity_pending my_pending = { }, *pending = NULL;
2937	bool stop_pending, complete = false;
2938
2939	/ Can the task run on the task's current CPU? If so, we're done /
2940	if (cpumask_test_cpu(cpu: task_cpu(p), cpumask: &p->cpus_mask)) {
2941	struct task_struct *push_task = NULL;
2942
2943	if ((flags & SCA_MIGRATE_ENABLE) &&
2944	(p->migration_flags & MDF_PUSH) && !rq->push_busy) {
2945	rq->push_busy = true;
2946	push_task = get_task_struct(t: p);
2947	}
2948
2949	/*
2950	* If there are pending waiters, but no pending stop_work,
2951	* then complete now.
2952	*/
2953	pending = p->migration_pending;
2954	if (pending && !pending->stop_pending) {
2955	p->migration_pending = NULL;
2956	complete = true;
2957	}
2958
2959	preempt_disable();
2960	task_rq_unlock(rq, p, rf);
2961	if (push_task) {
2962	stop_one_cpu_nowait(cpu: rq->cpu, fn: push_cpu_stop,
2963	arg: p, work_buf: &rq->push_work);
2964	}
2965	preempt_enable();
2966
2967	if (complete)
2968	complete_all(&pending->done);
2969
2970	return `0`;
2971	}
2972
2973	if (!(flags & SCA_MIGRATE_ENABLE)) {
2974	/ serialized by p->pi_lock /
2975	if (!p->migration_pending) {
2976	/ Install the request /
2977	refcount_set(r: &my_pending.refs, n: `1`);
2978	init_completion(x: &my_pending.done);
2979	my_pending.arg = (struct migration_arg) {
2980	.task = p,
2981	.dest_cpu = dest_cpu,
2982	.pending = &my_pending,
2983	};
2984
2985	p->migration_pending = &my_pending;
2986	} else {
2987	pending = p->migration_pending;
2988	refcount_inc(r: &pending->refs);
2989	/*
2990	* Affinity has changed, but we've already installed a
2991	* pending. migration_cpu_stop() must see this, else
2992	* we risk a completion of the pending despite having a
2993	* task on a disallowed CPU.
2994	*
2995	* Serialized by p->pi_lock, so this is safe.
2996	*/
2997	pending->arg.dest_cpu = dest_cpu;
2998	}
2999	}
3000	pending = p->migration_pending;
3001	/*
3002	* - !MIGRATE_ENABLE:
3003	* we'll have installed a pending if there wasn't one already.
3004	*
3005	* - MIGRATE_ENABLE:
3006	* we're here because the current CPU isn't matching anymore,
3007	* the only way that can happen is because of a concurrent
3008	* set_cpus_allowed_ptr() call, which should then still be
3009	* pending completion.
3010	*
3011	* Either way, we really should have a @pending here.
3012	*/
3013	if (WARN_ON_ONCE(!pending)) {
3014	task_rq_unlock(rq, p, rf);
3015	return -EINVAL;
3016	}
3017
3018	if (task_on_cpu(rq, p) \|\| READ_ONCE(p->__state) == TASK_WAKING) {
3019	/*
3020	* MIGRATE_ENABLE gets here because 'p == current', but for
3021	* anything else we cannot do is_migration_disabled(), punt
3022	* and have the stopper function handle it all race-free.
3023	*/
3024	stop_pending = pending->stop_pending;
3025	if (!stop_pending)
3026	pending->stop_pending = true;
3027
3028	if (flags & SCA_MIGRATE_ENABLE)
3029	p->migration_flags &= ~MDF_PUSH;
3030
3031	preempt_disable();
3032	task_rq_unlock(rq, p, rf);
3033	if (!stop_pending) {
3034	stop_one_cpu_nowait(cpu: cpu_of(rq), fn: migration_cpu_stop,
3035	arg: &pending->arg, work_buf: &pending->stop_work);
3036	}
3037	preempt_enable();
3038
3039	if (flags & SCA_MIGRATE_ENABLE)
3040	return `0`;
3041	} else {
3042
3043	if (!is_migration_disabled(p)) {
3044	if (task_on_rq_queued(p))
3045	rq = move_queued_task(rq, rf, p, new_cpu: dest_cpu);
3046
3047	if (!pending->stop_pending) {
3048	p->migration_pending = NULL;
3049	complete = true;
3050	}
3051	}
3052	task_rq_unlock(rq, p, rf);
3053
3054	if (complete)
3055	complete_all(&pending->done);
3056	}
3057
3058	wait_for_completion(&pending->done);
3059
3060	if (refcount_dec_and_test(r: &pending->refs))
3061	wake_up_var(var: &pending->refs); / No UaF, just an address /
3062
3063	/*
3064	* Block the original owner of &pending until all subsequent callers
3065	* have seen the completion and decremented the refcount
3066	*/
3067	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
3068
3069	/ ARGH /
3070	WARN_ON_ONCE(my_pending.stop_pending);
3071
3072	return `0`;
3073	}
3074
3075	/*
3076	* Called with both p->pi_lock and rq->lock held; drops both before returning.
3077	*/
3078	static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
3079	struct affinity_context *ctx,
3080	struct rq *rq,
3081	struct rq_flags *rf)
3082	__releases(rq->lock)
3083	__releases(p->pi_lock)
3084	{
3085	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
3086	const struct cpumask *cpu_valid_mask = cpu_active_mask;
3087	bool kthread = p->flags & PF_KTHREAD;
3088	unsigned int dest_cpu;
3089	int ret = `0`;
3090
3091	update_rq_clock(rq);
3092
3093	if (kthread \|\| is_migration_disabled(p)) {
3094	/*
3095	* Kernel threads are allowed on online && !active CPUs,
3096	* however, during cpu-hot-unplug, even these might get pushed
3097	* away if not KTHREAD_IS_PER_CPU.
3098	*
3099	* Specifically, migration_disabled() tasks must not fail the
3100	* cpumask_any_and_distribute() pick below, esp. so on
3101	* SCA_MIGRATE_ENABLE, otherwise we'll not call
3102	* set_cpus_allowed_common() and actually reset p->cpus_ptr.
3103	*/
3104	cpu_valid_mask = cpu_online_mask;
3105	}
3106
3107	if (!kthread && !cpumask_subset(src1p: ctx->new_mask, src2p: cpu_allowed_mask)) {
3108	ret = -EINVAL;
3109	goto out;
3110	}
3111
3112	/*
3113	* Must re-check here, to close a race against __kthread_bind(),
3114	* sched_setaffinity() is not guaranteed to observe the flag.
3115	*/
3116	if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
3117	ret = -EINVAL;
3118	goto out;
3119	}
3120
3121	if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
3122	if (cpumask_equal(src1p: &p->cpus_mask, src2p: ctx->new_mask)) {
3123	if (ctx->flags & SCA_USER)
3124	swap(p->user_cpus_ptr, ctx->user_mask);
3125	goto out;
3126	}
3127
3128	if (WARN_ON_ONCE(p == current &&
3129	is_migration_disabled(p) &&
3130	!cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
3131	ret = -EBUSY;
3132	goto out;
3133	}
3134	}
3135
3136	/*
3137	* Picking a ~random cpu helps in cases where we are changing affinity
3138	* for groups of tasks (ie. cpuset), so that load balancing is not
3139	* immediately required to distribute the tasks within their new mask.
3140	*/
3141	dest_cpu = cpumask_any_and_distribute(src1p: cpu_valid_mask, src2p: ctx->new_mask);
3142	if (dest_cpu >= nr_cpu_ids) {
3143	ret = -EINVAL;
3144	goto out;
3145	}
3146
3147	__do_set_cpus_allowed(p, ctx);
3148
3149	return affine_move_task(rq, p, rf, dest_cpu, flags: ctx->flags);
3150
3151	out:
3152	task_rq_unlock(rq, p, rf);
3153
3154	return ret;
3155	}
3156
3157	/*
3158	* Change a given task's CPU affinity. Migrate the thread to a
3159	* proper CPU and schedule it away if the CPU it's executing on
3160	* is removed from the allowed bitmask.
3161	*
3162	* NOTE: the caller must have a valid reference to the task, the
3163	* task must not exit() & deallocate itself prematurely. The
3164	* call is not atomic; no spinlocks may be held.
3165	*/
3166	int __set_cpus_allowed_ptr(struct task_struct p, struct* affinity_context *ctx)
3167	{
3168	struct rq_flags rf;
3169	struct rq *rq;
3170
3171	rq = task_rq_lock(p, rf: &rf);
3172	/*
3173	* Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
3174	* flags are set.
3175	*/
3176	if (p->user_cpus_ptr &&
3177	!(ctx->flags & (SCA_USER \| SCA_MIGRATE_ENABLE \| SCA_MIGRATE_DISABLE)) &&
3178	cpumask_and(dstp: rq->scratch_mask, src1p: ctx->new_mask, src2p: p->user_cpus_ptr))
3179	ctx->new_mask = rq->scratch_mask;
3180
3181	return __set_cpus_allowed_ptr_locked(p, ctx, rq, rf: &rf);
3182	}
3183
3184	int set_cpus_allowed_ptr(struct task_struct p, const* struct cpumask *new_mask)
3185	{
3186	struct affinity_context ac = {
3187	.new_mask = new_mask,
3188	.flags = `0`,
3189	};
3190
3191	return __set_cpus_allowed_ptr(p, ctx: &ac);
3192	}
3193	EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
3194
3195	/*
3196	* Change a given task's CPU affinity to the intersection of its current
3197	* affinity mask and @subset_mask, writing the resulting mask to @new_mask.
3198	* If user_cpus_ptr is defined, use it as the basis for restricting CPU
3199	* affinity or use cpu_online_mask instead.
3200	*
3201	* If the resulting mask is empty, leave the affinity unchanged and return
3202	* -EINVAL.
3203	*/
3204	static int restrict_cpus_allowed_ptr(struct task_struct *p,
3205	struct cpumask *new_mask,
3206	const struct cpumask *subset_mask)
3207	{
3208	struct affinity_context ac = {
3209	.new_mask = new_mask,
3210	.flags = `0`,
3211	};
3212	struct rq_flags rf;
3213	struct rq *rq;
3214	int err;
3215
3216	rq = task_rq_lock(p, rf: &rf);
3217
3218	/*
3219	* Forcefully restricting the affinity of a deadline task is
3220	* likely to cause problems, so fail and noisily override the
3221	* mask entirely.
3222	*/
3223	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
3224	err = -EPERM;
3225	goto err_unlock;
3226	}
3227
3228	if (!cpumask_and(dstp: new_mask, src1p: task_user_cpus(p), src2p: subset_mask)) {
3229	err = -EINVAL;
3230	goto err_unlock;
3231	}
3232
3233	return __set_cpus_allowed_ptr_locked(p, ctx: &ac, rq, rf: &rf);
3234
3235	err_unlock:
3236	task_rq_unlock(rq, p, rf: &rf);
3237	return err;
3238	}
3239
3240	/*
3241	* Restrict the CPU affinity of task @p so that it is a subset of
3242	* task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
3243	* old affinity mask. If the resulting mask is empty, we warn and walk
3244	* up the cpuset hierarchy until we find a suitable mask.
3245	*/
3246	void force_compatible_cpus_allowed_ptr(struct task_struct *p)
3247	{
3248	cpumask_var_t new_mask;
3249	const struct cpumask *override_mask = task_cpu_possible_mask(p);
3250
3251	alloc_cpumask_var(mask: &new_mask, GFP_KERNEL);
3252
3253	/*
3254	* __migrate_task() can fail silently in the face of concurrent
3255	* offlining of the chosen destination CPU, so take the hotplug
3256	* lock to ensure that the migration succeeds.
3257	*/
3258	cpus_read_lock();
3259	if (!cpumask_available(mask: new_mask))
3260	goto out_set_mask;
3261
3262	if (!restrict_cpus_allowed_ptr(p, new_mask, subset_mask: override_mask))
3263	goto out_free_mask;
3264
3265	/*
3266	* We failed to find a valid subset of the affinity mask for the
3267	* task, so override it based on its cpuset hierarchy.
3268	*/
3269	cpuset_cpus_allowed(p, mask: new_mask);
3270	override_mask = new_mask;
3271
3272	out_set_mask:
3273	if (printk_ratelimit()) {
3274	printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
3275	task_pid_nr(p), p->comm,
3276	cpumask_pr_args(override_mask));
3277	}
3278
3279	WARN_ON(set_cpus_allowed_ptr(p, override_mask));
3280	out_free_mask:
3281	cpus_read_unlock();
3282	free_cpumask_var(mask: new_mask);
3283	}
3284
3285	/*
3286	* Restore the affinity of a task @p which was previously restricted by a
3287	* call to force_compatible_cpus_allowed_ptr().
3288	*
3289	* It is the caller's responsibility to serialise this with any calls to
3290	* force_compatible_cpus_allowed_ptr(@p).
3291	*/
3292	void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
3293	{
3294	struct affinity_context ac = {
3295	.new_mask = task_user_cpus(p),
3296	.flags = `0`,
3297	};
3298	int ret;
3299
3300	/*
3301	* Try to restore the old affinity mask with __sched_setaffinity().
3302	* Cpuset masking will be done there too.
3303	*/
3304	ret = __sched_setaffinity(p, ctx: &ac);
3305	WARN_ON_ONCE(ret);
3306	}
3307
3308	void set_task_cpu(struct task_struct p, unsigned* int new_cpu)
3309	{
3310	unsigned int state = READ_ONCE(p->__state);
3311
3312	/*
3313	* We should never call set_task_cpu() on a blocked task,
3314	* ttwu() will sort out the placement.
3315	*/
3316	WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);
3317
3318	/*
3319	* Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
3320	* because schedstat_wait_{start,end} rebase migrating task's wait_start
3321	* time relying on p->on_rq.
3322	*/
3323	WARN_ON_ONCE(state == TASK_RUNNING &&
3324	p->sched_class == &fair_sched_class &&
3325	(p->on_rq && !task_on_rq_migrating(p)));
3326
3327	#ifdef CONFIG_LOCKDEP
3328	/*
3329	* The caller should hold either p->pi_lock or rq->lock, when changing
3330	* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
3331	*
3332	* sched_move_task() holds both and thus holding either pins the cgroup,
3333	* see task_group().
3334	*
3335	* Furthermore, all task_rq users should acquire both locks, see
3336	* task_rq_lock().
3337	*/
3338	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) \|\|
3339	lockdep_is_held(__rq_lockp(task_rq(p)))));
3340	#endif
3341	/*
3342	* Clearly, migrating tasks to offline CPUs is a fairly daft thing.
3343	*/
3344	WARN_ON_ONCE(!cpu_online(new_cpu));
3345
3346	WARN_ON_ONCE(is_migration_disabled(p));
3347
3348	trace_sched_migrate_task(p, dest_cpu: new_cpu);
3349
3350	if (task_cpu(p) != new_cpu) {
3351	if (p->sched_class->migrate_task_rq)
3352	p->sched_class->migrate_task_rq(p, new_cpu);
3353	p->se.nr_migrations++;
3354	rseq_migrate(t: p);
3355	sched_mm_cid_migrate_from(t: p);
3356	perf_event_task_migrate(task: p);
3357	}
3358
3359	__set_task_cpu(p, cpu: new_cpu);
3360	}
3361
3362	#ifdef CONFIG_NUMA_BALANCING
3363	static void __migrate_swap_task(struct task_struct p, int* cpu)
3364	{
3365	__schedstat_inc(p->stats.numa_task_swapped);
3366	count_vm_numa_event(NUMA_TASK_SWAP);
3367	count_memcg_event_mm(mm: p->mm, idx: NUMA_TASK_SWAP);
3368
3369	if (task_on_rq_queued(p)) {
3370	struct rq src_rq, dst_rq;
3371	struct rq_flags srf, drf;
3372
3373	src_rq = task_rq(p);
3374	dst_rq = cpu_rq(cpu);
3375
3376	rq_pin_lock(rq: src_rq, rf: &srf);
3377	rq_pin_lock(rq: dst_rq, rf: &drf);
3378
3379	move_queued_task_locked(src_rq, dst_rq, task: p);
3380	wakeup_preempt(rq: dst_rq, p, flags: `0`);
3381
3382	rq_unpin_lock(rq: dst_rq, rf: &drf);
3383	rq_unpin_lock(rq: src_rq, rf: &srf);
3384
3385	} else {
3386	/*
3387	* Task isn't running anymore; make it appear like we migrated
3388	* it before it went to sleep. This means on wakeup we make the
3389	* previous CPU our target instead of where it really is.
3390	*/
3391	p->wake_cpu = cpu;
3392	}
3393	}
3394
/*
 * Argument bundle handed from migrate_swap() to migrate_swap_stop():
 * the two tasks to exchange and the CPU each one is expected to be on.
 */
struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};
3399
3400	static int migrate_swap_stop(void *data)
3401	{
3402	struct migration_swap_arg *arg = data;
3403	struct rq src_rq, dst_rq;
3404
3405	if (!cpu_active(cpu: arg->src_cpu) \|\| !cpu_active(cpu: arg->dst_cpu))
3406	return -EAGAIN;
3407
3408	src_rq = cpu_rq(arg->src_cpu);
3409	dst_rq = cpu_rq(arg->dst_cpu);
3410
3411	guard(double_raw_spinlock)(lock: &arg->src_task->pi_lock, lock2: &arg->dst_task->pi_lock);
3412	guard(double_rq_lock)(lock: src_rq, lock2: dst_rq);
3413
3414	if (task_cpu(p: arg->dst_task) != arg->dst_cpu)
3415	return -EAGAIN;
3416
3417	if (task_cpu(p: arg->src_task) != arg->src_cpu)
3418	return -EAGAIN;
3419
3420	if (!cpumask_test_cpu(cpu: arg->dst_cpu, cpumask: arg->src_task->cpus_ptr))
3421	return -EAGAIN;
3422
3423	if (!cpumask_test_cpu(cpu: arg->src_cpu, cpumask: arg->dst_task->cpus_ptr))
3424	return -EAGAIN;
3425
3426	__migrate_swap_task(p: arg->src_task, cpu: arg->dst_cpu);
3427	__migrate_swap_task(p: arg->dst_task, cpu: arg->src_cpu);
3428
3429	return `0`;
3430	}
3431
3432	/*
3433	* Cross migrate two tasks
3434	*/
3435	int migrate_swap(struct task_struct cur, struct* task_struct *p,
3436	int target_cpu, int curr_cpu)
3437	{
3438	struct migration_swap_arg arg;
3439	int ret = -EINVAL;
3440
3441	arg = (struct migration_swap_arg){
3442	.src_task = cur,
3443	.src_cpu = curr_cpu,
3444	.dst_task = p,
3445	.dst_cpu = target_cpu,
3446	};
3447
3448	if (arg.src_cpu == arg.dst_cpu)
3449	goto out;
3450
3451	/*
3452	* These three tests are all lockless; this is OK since all of them
3453	* will be re-checked with proper locks held further down the line.
3454	*/
3455	if (!cpu_active(cpu: arg.src_cpu) \|\| !cpu_active(cpu: arg.dst_cpu))
3456	goto out;
3457
3458	if (!cpumask_test_cpu(cpu: arg.dst_cpu, cpumask: arg.src_task->cpus_ptr))
3459	goto out;
3460
3461	if (!cpumask_test_cpu(cpu: arg.src_cpu, cpumask: arg.dst_task->cpus_ptr))
3462	goto out;
3463
3464	trace_sched_swap_numa(src_tsk: cur, src_cpu: arg.src_cpu, dst_tsk: p, dst_cpu: arg.dst_cpu);
3465	ret = stop_two_cpus(cpu1: arg.dst_cpu, cpu2: arg.src_cpu, fn: migrate_swap_stop, arg: &arg);
3466
3467	out:
3468	return ret;
3469	}
3470	#endif /* CONFIG_NUMA_BALANCING */
3471
3472	/***
3473	* kick_process - kick a running thread to enter/exit the kernel
3474	* @p: the to-be-kicked thread
3475	*
3476	* Cause a process which is running on another CPU to enter
3477	* kernel-mode, without any delay. (to get signals handled.)
3478	*
3479	* NOTE: this function doesn't have to take the runqueue lock,
3480	* because all it wants to ensure is that the remote task enters
3481	* the kernel. If the IPI races and the task has been migrated
3482	* to another CPU then no harm is done and the purpose has been
3483	* achieved as well.
3484	*/
3485	void kick_process(struct task_struct *p)
3486	{
3487	guard(preempt)();
3488	int cpu = task_cpu(p);
3489
3490	if ((cpu != smp_processor_id()) && task_curr(p))
3491	smp_send_reschedule(cpu);
3492	}
3493	EXPORT_SYMBOL_GPL(kick_process);
3494
3495	/*
3496	* ->cpus_ptr is protected by both rq->lock and p->pi_lock
3497	*
3498	* A few notes on cpu_active vs cpu_online:
3499	*
3500	* - cpu_active must be a subset of cpu_online
3501	*
3502	* - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
3503	* see __set_cpus_allowed_ptr(). At this point the newly online
3504	* CPU isn't yet part of the sched domains, and balancing will not
3505	* see it.
3506	*
3507	* - on CPU-down we clear cpu_active() to mask the sched domains and
3508	* avoid the load balancer to place new tasks on the to be removed
3509	* CPU. Existing tasks will remain running there and will be taken
3510	* off.
3511	*
3512	* This means that fallback selection must not select !active CPUs.
3513	* And can assume that any active CPU must be online. Conversely
3514	* select_task_rq() below may allow selection of !active CPUs in order
3515	* to satisfy the above rules.
3516	*/
3517	static int select_fallback_rq(int cpu, struct task_struct *p)
3518	{
3519	int nid = cpu_to_node(cpu);
3520	const struct cpumask *nodemask = NULL;
3521	enum { cpuset, possible, fail } state = cpuset;
3522	int dest_cpu;
3523
3524	/*
3525	* If the node that the CPU is on has been offlined, cpu_to_node()
3526	* will return -1. There is no CPU on the node, and we should
3527	* select the CPU on the other node.
3528	*/
3529	if (nid != -`1`) {
3530	nodemask = cpumask_of_node(node: nid);
3531
3532	/ Look for allowed, online CPU in same node. /
3533	for_each_cpu(dest_cpu, nodemask) {
3534	if (is_cpu_allowed(p, cpu: dest_cpu))
3535	return dest_cpu;
3536	}
3537	}
3538
3539	for (;;) {
3540	/ Any allowed, online CPU? /
3541	for_each_cpu(dest_cpu, p->cpus_ptr) {
3542	if (!is_cpu_allowed(p, cpu: dest_cpu))
3543	continue;
3544
3545	goto out;
3546	}
3547
3548	/ No more Mr. Nice Guy. /
3549	switch (state) {
3550	case cpuset:
3551	if (cpuset_cpus_allowed_fallback(p)) {
3552	state = possible;
3553	break;
3554	}
3555	fallthrough;
3556	case possible:
3557	/*
3558	* XXX When called from select_task_rq() we only
3559	* hold p->pi_lock and again violate locking order.
3560	*
3561	* More yuck to audit.
3562	*/
3563	do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
3564	state = fail;
3565	break;
3566	case fail:
3567	BUG();
3568	break;
3569	}
3570	}
3571
3572	out:
3573	if (state != cpuset) {
3574	/*
3575	* Don't tell them about moving exiting tasks or
3576	* kernel threads (both mm NULL), since they never
3577	* leave kernel.
3578	*/
3579	if (p->mm && printk_ratelimit()) {
3580	printk_deferred("process %d (%s) no longer affine to cpu%d\n",
3581	task_pid_nr(p), p->comm, cpu);
3582	}
3583	}
3584
3585	return dest_cpu;
3586	}
3587
3588	/*
3589	* The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
3590	*/
3591	static inline
3592	int select_task_rq(struct task_struct p, int* cpu, int *wake_flags)
3593	{
3594	lockdep_assert_held(&p->pi_lock);
3595
3596	if (p->nr_cpus_allowed > `1` && !is_migration_disabled(p)) {
3597	cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
3598	*wake_flags \|= WF_RQ_SELECTED;
3599	} else {
3600	cpu = cpumask_any(p->cpus_ptr);
3601	}
3602
3603	/*
3604	* In order not to call set_task_cpu() on a blocking task we need
3605	* to rely on ttwu() to place the task on a valid ->cpus_ptr
3606	* CPU.
3607	*
3608	* Since this is common to all placement strategies, this lives here.
3609	*
3610	* [ this allows ->select_task() to simply return task_cpu(p) and
3611	* not worry about this generic constraint ]
3612	*/
3613	if (unlikely(!is_cpu_allowed(p, cpu)))
3614	cpu = select_fallback_rq(cpu: task_cpu(p), p);
3615
3616	return cpu;
3617	}
3618
3619	void sched_set_stop_task(int cpu, struct task_struct *stop)
3620	{
3621	static struct lock_class_key stop_pi_lock;
3622	struct sched_param param = { .sched_priority = MAX_RT_PRIO - `1` };
3623	struct task_struct *old_stop = cpu_rq(cpu)->stop;
3624
3625	if (stop) {
3626	/*
3627	* Make it appear like a SCHED_FIFO task, its something
3628	* userspace knows about and won't get confused about.
3629	*
3630	* Also, it will make PI more or less work without too
3631	* much confusion -- but then, stop work should not
3632	* rely on PI working anyway.
3633	*/
3634	sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
3635
3636	stop->sched_class = &stop_sched_class;
3637
3638	/*
3639	* The PI code calls rt_mutex_setprio() with ->pi_lock held to
3640	* adjust the effective priority of a task. As a result,
3641	* rt_mutex_setprio() can trigger (RT) balancing operations,
3642	* which can then trigger wakeups of the stop thread to push
3643	* around the current task.
3644	*
3645	* The stop task itself will never be part of the PI-chain, it
3646	* never blocks, therefore that ->pi_lock recursion is safe.
3647	* Tell lockdep about this by placing the stop->pi_lock in its
3648	* own class.
3649	*/
3650	lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
3651	}
3652
3653	cpu_rq(cpu)->stop = stop;
3654
3655	if (old_stop) {
3656	/*
3657	* Reset it back to a normal scheduling class so that
3658	* it can die in pieces.
3659	*/
3660	old_stop->sched_class = &rt_sched_class;
3661	}
3662	}
3663
3664	#else /* CONFIG_SMP */
3665
/* !CONFIG_SMP stub: nothing to do. */
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
3667
/* !CONFIG_SMP stub: always reports that @rq has no pinned tasks. */
static inline bool rq_has_pinned_tasks(struct rq *rq)
{
	return false;
}
3672
3673	#endif /* !CONFIG_SMP */
3674
3675	static void
3676	ttwu_stat(struct task_struct p, int* cpu, int wake_flags)
3677	{
3678	struct rq *rq;
3679
3680	if (!schedstat_enabled())
3681	return;
3682
3683	rq = this_rq();
3684
3685	#ifdef CONFIG_SMP
3686	if (cpu == rq->cpu) {
3687	__schedstat_inc(rq->ttwu_local);
3688	__schedstat_inc(p->stats.nr_wakeups_local);
3689	} else {
3690	struct sched_domain *sd;
3691
3692	__schedstat_inc(p->stats.nr_wakeups_remote);
3693
3694	guard(rcu)();
3695	for_each_domain(rq->cpu, sd) {
3696	if (cpumask_test_cpu(cpu, cpumask: sched_domain_span(sd))) {
3697	__schedstat_inc(sd->ttwu_wake_remote);
3698	break;
3699	}
3700	}
3701	}
3702
3703	if (wake_flags & WF_MIGRATED)
3704	__schedstat_inc(p->stats.nr_wakeups_migrate);
3705	#endif /* CONFIG_SMP */
3706
3707	__schedstat_inc(rq->ttwu_count);
3708	__schedstat_inc(p->stats.nr_wakeups);
3709
3710	if (wake_flags & WF_SYNC)
3711	__schedstat_inc(p->stats.nr_wakeups_sync);
3712	}
3713
3714	/*
3715	* Mark the task runnable.
3716	*/
3717	static inline void ttwu_do_wakeup(struct task_struct *p)
3718	{
3719	WRITE_ONCE(p->__state, TASK_RUNNING);
3720	trace_sched_wakeup(p);
3721	}
3722
3723	static void
3724	ttwu_do_activate(struct rq rq, struct* task_struct p, int* wake_flags,
3725	struct rq_flags *rf)
3726	{
3727	int en_flags = ENQUEUE_WAKEUP \| ENQUEUE_NOCLOCK;
3728
3729	lockdep_assert_rq_held(rq);
3730
3731	if (p->sched_contributes_to_load)
3732	rq->nr_uninterruptible--;
3733
3734	#ifdef CONFIG_SMP
3735	if (wake_flags & WF_RQ_SELECTED)
3736	en_flags \|= ENQUEUE_RQ_SELECTED;
3737	if (wake_flags & WF_MIGRATED)
3738	en_flags \|= ENQUEUE_MIGRATED;
3739	else
3740	#endif
3741	if (p->in_iowait) {
3742	delayacct_blkio_end(p);
3743	atomic_dec(v: &task_rq(p)->nr_iowait);
3744	}
3745
3746	activate_task(rq, p, flags: en_flags);
3747	wakeup_preempt(rq, p, flags: wake_flags);
3748
3749	ttwu_do_wakeup(p);
3750
3751	#ifdef CONFIG_SMP
3752	if (p->sched_class->task_woken) {
3753	/*
3754	* Our task @p is fully woken up and running; so it's safe to
3755	* drop the rq->lock, hereafter rq is only used for statistics.
3756	*/
3757	rq_unpin_lock(rq, rf);
3758	p->sched_class->task_woken(rq, p);
3759	rq_repin_lock(rq, rf);
3760	}
3761
3762	if (rq->idle_stamp) {
3763	u64 delta = rq_clock(rq) - rq->idle_stamp;
3764	u64 max = `2`*rq->max_idle_balance_cost;
3765
3766	update_avg(avg: &rq->avg_idle, sample: delta);
3767
3768	if (rq->avg_idle > max)
3769	rq->avg_idle = max;
3770
3771	rq->idle_stamp = `0`;
3772	}
3773	#endif
3774	}
3775
3776	/*
3777	* Consider @p being inside a wait loop:
3778	*
3779	* for (;;) {
3780	* set_current_state(TASK_UNINTERRUPTIBLE);
3781	*
3782	* if (CONDITION)
3783	* break;
3784	*
3785	* schedule();
3786	* }
3787	* __set_current_state(TASK_RUNNING);
3788	*
3789	* between set_current_state() and schedule(). In this case @p is still
3790	* runnable, so all that needs doing is change p->state back to TASK_RUNNING in
3791	* an atomic manner.
3792	*
3793	* By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
3794	* then schedule() must still happen and p->state can be changed to
3795	* TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
3796	* need to do a full wakeup with enqueue.
3797	*
3798	* Returns: %true when the wakeup is done,
3799	* %false otherwise.
3800	*/
3801	static int ttwu_runnable(struct task_struct p, int* wake_flags)
3802	{
3803	struct rq_flags rf;
3804	struct rq *rq;
3805	int ret = `0`;
3806
3807	rq = __task_rq_lock(p, rf: &rf);
3808	if (task_on_rq_queued(p)) {
3809	update_rq_clock(rq);
3810	if (p->se.sched_delayed)
3811	enqueue_task(rq, p, ENQUEUE_NOCLOCK \| ENQUEUE_DELAYED);
3812	if (!task_on_cpu(rq, p)) {
3813	/*
3814	* When on_rq && !on_cpu the task is preempted, see if
3815	* it should preempt the task that is current now.
3816	*/
3817	wakeup_preempt(rq, p, flags: wake_flags);
3818	}
3819	ttwu_do_wakeup(p);
3820	ret = `1`;
3821	}
3822	__task_rq_unlock(rq, rf: &rf);
3823
3824	return ret;
3825	}
3826
3827	#ifdef CONFIG_SMP
3828	void sched_ttwu_pending(void *arg)
3829	{
3830	struct llist_node *llist = arg;
3831	struct rq *rq = this_rq();
3832	struct task_struct p, t;
3833	struct rq_flags rf;
3834
3835	if (!llist)
3836	return;
3837
3838	rq_lock_irqsave(rq, rf: &rf);
3839	update_rq_clock(rq);
3840
3841	llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
3842	if (WARN_ON_ONCE(p->on_cpu))
3843	smp_cond_load_acquire(&p->on_cpu, !VAL);
3844
3845	if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
3846	set_task_cpu(p, new_cpu: cpu_of(rq));
3847
3848	ttwu_do_activate(rq, p, wake_flags: p->sched_remote_wakeup ? WF_MIGRATED : `0`, rf: &rf);
3849	}
3850
3851	/*
3852	* Must be after enqueueing at least once task such that
3853	* idle_cpu() does not observe a false-negative -- if it does,
3854	* it is possible for select_idle_siblings() to stack a number
3855	* of tasks on this CPU during that window.
3856	*
3857	* It is OK to clear ttwu_pending when another task pending.
3858	* We will receive IPI after local IRQ enabled and then enqueue it.
3859	* Since now nr_running > 0, idle_cpu() will always get correct result.
3860	*/
3861	WRITE_ONCE(rq->ttwu_pending, `0`);
3862	rq_unlock_irqrestore(rq, rf: &rf);
3863	}
3864
3865	/*
3866	* Prepare the scene for sending an IPI for a remote smp_call
3867	*
3868	* Returns true if the caller can proceed with sending the IPI.
3869	* Returns false otherwise.
3870	*/
3871	bool call_function_single_prep_ipi(int cpu)
3872	{
3873	if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
3874	trace_sched_wake_idle_without_ipi(cpu);
3875	return false;
3876	}
3877
3878	return true;
3879	}
3880
3881	/*
3882	* Queue a task on the target CPUs wake_list and wake the CPU via IPI if
3883	* necessary. The wakee CPU on receipt of the IPI will queue the task
3884	* via sched_ttwu_wakeup() for activation so the wakee incurs the cost
3885	* of the wakeup instead of the waker.
3886	*/
3887	static void __ttwu_queue_wakelist(struct task_struct p, int* cpu, int wake_flags)
3888	{
3889	struct rq *rq = cpu_rq(cpu);
3890
3891	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
3892
3893	WRITE_ONCE(rq->ttwu_pending, `1`);
3894	__smp_call_single_queue(cpu, node: &p->wake_entry.llist);
3895	}
3896
3897	void wake_up_if_idle(int cpu)
3898	{
3899	struct rq *rq = cpu_rq(cpu);
3900
3901	guard(rcu)();
3902	if (is_idle_task(rcu_dereference(rq->curr))) {
3903	guard(rq_lock_irqsave)(l: rq);
3904	if (is_idle_task(p: rq->curr))
3905	resched_curr(rq);
3906	}
3907	}
3908
3909	bool cpus_equal_capacity(int this_cpu, int that_cpu)
3910	{
3911	if (!sched_asym_cpucap_active())
3912	return true;
3913
3914	if (this_cpu == that_cpu)
3915	return true;
3916
3917	return arch_scale_cpu_capacity(cpu: this_cpu) == arch_scale_cpu_capacity(cpu: that_cpu);
3918	}
3919
3920	bool cpus_share_cache(int this_cpu, int that_cpu)
3921	{
3922	if (this_cpu == that_cpu)
3923	return true;
3924
3925	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
3926	}
3927
3928	/*
3929	* Whether CPUs are share cache resources, which means LLC on non-cluster
3930	* machines and LLC tag or L2 on machines with clusters.
3931	*/
3932	bool cpus_share_resources(int this_cpu, int that_cpu)
3933	{
3934	if (this_cpu == that_cpu)
3935	return true;
3936
3937	return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
3938	}
3939
3940	static inline bool ttwu_queue_cond(struct task_struct p, int* cpu)
3941	{
3942	/ See SCX_OPS_ALLOW_QUEUED_WAKEUP. /
3943	if (!scx_allow_ttwu_queue(p))
3944	return false;
3945
3946	/*
3947	* Do not complicate things with the async wake_list while the CPU is
3948	* in hotplug state.
3949	*/
3950	if (!cpu_active(cpu))
3951	return false;
3952
3953	/ Ensure the task will still be allowed to run on the CPU. /
3954	if (!cpumask_test_cpu(cpu, cpumask: p->cpus_ptr))
3955	return false;
3956
3957	/*
3958	* If the CPU does not share cache, then queue the task on the
3959	* remote rqs wakelist to avoid accessing remote data.
3960	*/
3961	if (!cpus_share_cache(smp_processor_id(), that_cpu: cpu))
3962	return true;
3963
3964	if (cpu == smp_processor_id())
3965	return false;
3966
3967	/*
3968	* If the wakee cpu is idle, or the task is descheduling and the
3969	* only running task on the CPU, then use the wakelist to offload
3970	* the task activation to the idle (or soon-to-be-idle) CPU as
3971	* the current CPU is likely busy. nr_running is checked to
3972	* avoid unnecessary task stacking.
3973	*
3974	* Note that we can only get here with (wakee) p->on_rq=0,
3975	* p->on_cpu can be whatever, we've done the dequeue, so
3976	* the wakee has been accounted out of ->nr_running.
3977	*/
3978	if (!cpu_rq(cpu)->nr_running)
3979	return true;
3980
3981	return false;
3982	}
3983
3984	static bool ttwu_queue_wakelist(struct task_struct p, int* cpu, int wake_flags)
3985	{
3986	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
3987	sched_clock_cpu(cpu); / Sync clocks across CPUs /
3988	__ttwu_queue_wakelist(p, cpu, wake_flags);
3989	return true;
3990	}
3991
3992	return false;
3993	}
3994
3995	#else /* !CONFIG_SMP */
3996
/* !CONFIG_SMP stub: there is no remote wake list, so never queue. */
static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	return false;
}
4001
4002	#endif /* CONFIG_SMP */
4003
4004	static void ttwu_queue(struct task_struct p, int* cpu, int wake_flags)
4005	{
4006	struct rq *rq = cpu_rq(cpu);
4007	struct rq_flags rf;
4008
4009	if (ttwu_queue_wakelist(p, cpu, wake_flags))
4010	return;
4011
4012	rq_lock(rq, rf: &rf);
4013	update_rq_clock(rq);
4014	ttwu_do_activate(rq, p, wake_flags, rf: &rf);
4015	rq_unlock(rq, rf: &rf);
4016	}
4017
4018	/*
4019	* Invoked from try_to_wake_up() to check whether the task can be woken up.
4020	*
4021	* The caller holds p::pi_lock if p != current or has preemption
4022	* disabled when p == current.
4023	*
4024	* The rules of saved_state:
4025	*
4026	* The related locking code always holds p::pi_lock when updating
4027	* p::saved_state, which means the code is fully serialized in both cases.
4028	*
4029	* For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT.
4030	* No other bits set. This allows to distinguish all wakeup scenarios.
4031	*
4032	* For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This
4033	* allows us to prevent early wakeup of tasks before they can be run on
4034	* asymmetric ISA architectures (eg ARMv9).
4035	*/
4036	static __always_inline
4037	bool ttwu_state_match(struct task_struct p, unsigned* int state, int *success)
4038	{
4039	int match;
4040
4041	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
4042	WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
4043	state != TASK_RTLOCK_WAIT);
4044	}
4045
4046	*success = !!(match = __task_state_match(p, state));
4047
4048	/*
4049	* Saved state preserves the task state across blocking on
4050	* an RT lock or TASK_FREEZABLE tasks. If the state matches,
4051	* set p::saved_state to TASK_RUNNING, but do not wake the task
4052	* because it waits for a lock wakeup or __thaw_task(). Also
4053	* indicate success because from the regular waker's point of
4054	* view this has succeeded.
4055	*
4056	* After acquiring the lock the task will restore p::__state
4057	* from p::saved_state which ensures that the regular
4058	* wakeup is not lost. The restore will also set
4059	* p::saved_state to TASK_RUNNING so any further tests will
4060	* not result in false positives vs. @success
4061	*/
4062	if (match < `0`)
4063	p->saved_state = TASK_RUNNING;
4064
4065	return match > `0`;
4066	}
4067
4068	/*
4069	* Notes on Program-Order guarantees on SMP systems.
4070	*
4071	* MIGRATION
4072	*
4073	* The basic program-order guarantee on SMP systems is that when a task [t]
4074	* migrates, all its activity on its old CPU [c0] happens-before any subsequent
4075	* execution on its new CPU [c1].
4076	*
4077	* For migration (of runnable tasks) this is provided by the following means:
4078	*
4079	* A) UNLOCK of the rq(c0)->lock scheduling out task t
4080	* B) migration for t is required to synchronize both rq(c0)->lock and
4081	* rq(c1)->lock (if not at the same time, then in that order).
4082	* C) LOCK of the rq(c1)->lock scheduling in task
4083	*
4084	* Release/acquire chaining guarantees that B happens after A and C after B.
4085	* Note: the CPU doing B need not be c0 or c1
4086	*
4087	* Example:
4088	*
 *   CPU0            CPU1            CPU2
 *
 *   LOCK rq(0)->lock
 *   sched-out X
 *   sched-in Y
 *   UNLOCK rq(0)->lock
 *
 *                                   LOCK rq(0)->lock // orders against CPU0
 *                                   dequeue X
 *                                   UNLOCK rq(0)->lock
 *
 *                                   LOCK rq(1)->lock
 *                                   enqueue X
 *                                   UNLOCK rq(1)->lock
 *
 *                   LOCK rq(1)->lock // orders against CPU2
 *                   sched-out Z
 *                   sched-in X
 *                   UNLOCK rq(1)->lock
4108	*
4109	*
4110	* BLOCKING -- aka. SLEEP + WAKEUP
4111	*
4112	* For blocking we (obviously) need to provide the same guarantee as for
4113	* migration. However the means are completely different as there is no lock
4114	* chain to provide order. Instead we do:
4115	*
4116	* 1) smp_store_release(X->on_cpu, 0) -- finish_task()
4117	* 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
4118	*
4119	* Example:
4120	*
 *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
 *
 *   LOCK rq(0)->lock LOCK X->pi_lock
 *   dequeue X
 *   sched-out X
 *   smp_store_release(X->on_cpu, 0);
 *
 *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
 *                    X->state = WAKING
 *                    set_task_cpu(X,2)
 *
 *                    LOCK rq(2)->lock
 *                    enqueue X
 *                    X->state = RUNNING
 *                    UNLOCK rq(2)->lock
 *
 *                                          LOCK rq(2)->lock // orders against CPU1
 *                                          sched-out Z
 *                                          sched-in X
 *                                          UNLOCK rq(2)->lock
 *
 *                    UNLOCK X->pi_lock
 *   UNLOCK rq(0)->lock
4144	*
4145	*
4146	* However, for wakeups there is a second guarantee we must provide, namely we
4147	* must ensure that CONDITION=1 done by the caller can not be reordered with
4148	* accesses to the task state; see try_to_wake_up() and set_current_state().
4149	*/
4150
4151	/**
4152	* try_to_wake_up - wake up a thread
4153	* @p: the thread to be awakened
4154	* @state: the mask of task states that can be woken
4155	* @wake_flags: wake modifier flags (WF_*)
4156	*
4157	* Conceptually does:
4158	*
4159	* If (@state & @p->state) @p->state = TASK_RUNNING.
4160	*
4161	* If the task was not queued/runnable, also place it back on a runqueue.
4162	*
4163	* This function is atomic against schedule() which would dequeue the task.
4164	*
4165	* It issues a full memory barrier before accessing @p->state, see the comment
4166	* with set_current_state().
4167	*
4168	* Uses p->pi_lock to serialize against concurrent wake-ups.
4169	*
4170	* Relies on p->pi_lock stabilizing:
4171	* - p->sched_class
4172	* - p->cpus_ptr
4173	* - p->sched_task_group
4174	* in order to do migration, see its use of select_task_rq()/set_task_cpu().
4175	*
4176	* Tries really hard to only take one task_rq(p)->lock for performance.
4177	* Takes rq->lock in:
4178	* - ttwu_runnable() -- old rq, unavoidable, see comment there;
4179	* - ttwu_queue() -- new rq, for enqueue of the task;
4180	* - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
4181	*
4182	* As a consequence we race really badly with just about everything. See the
4183	* many memory barriers and their comments for details.
4184	*
4185	* Return: %true if @p->state changes (an actual wakeup was done),
4186	* %false otherwise.
4187	*/
4188	int try_to_wake_up(struct task_struct p, unsigned* int state, int wake_flags)
4189	{
4190	guard(preempt)();
4191	int cpu, success = `0`;
4192
4193	wake_flags \|= WF_TTWU;
4194
4195	if (p == current) {
4196	/*
4197	* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
4198	* == smp_processor_id()'. Together this means we can special
4199	* case the whole 'p->on_rq && ttwu_runnable()' case below
4200	* without taking any locks.
4201	*
4202	* Specifically, given current runs ttwu() we must be before
4203	* schedule()'s block_task(), as such this must not observe
4204	* sched_delayed.
4205	*
4206	* In particular:
4207	* - we rely on Program-Order guarantees for all the ordering,
4208	* - we're serialized against set_special_state() by virtue of
4209	* it disabling IRQs (this allows not taking ->pi_lock).
4210	*/
4211	WARN_ON_ONCE(p->se.sched_delayed);
4212	if (!ttwu_state_match(p, state, success: &success))
4213	goto out;
4214
4215	trace_sched_waking(p);
4216	ttwu_do_wakeup(p);
4217	goto out;
4218	}
4219
4220	/*
4221	* If we are going to wake up a thread waiting for CONDITION we
4222	* need to ensure that CONDITION=1 done by the caller can not be
4223	* reordered with p->state check below. This pairs with smp_store_mb()
4224	* in set_current_state() that the waiting thread does.
4225	*/
4226	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
4227	smp_mb__after_spinlock();
4228	if (!ttwu_state_match(p, state, success: &success))
4229	break;
4230
4231	trace_sched_waking(p);
4232
4233	/*
4234	* Ensure we load p->on_rq _after_ p->state, otherwise it would
4235	* be possible to, falsely, observe p->on_rq == 0 and get stuck
4236	* in smp_cond_load_acquire() below.
4237	*
4238	* sched_ttwu_pending() try_to_wake_up()
4239	* STORE p->on_rq = 1 LOAD p->state
4240	* UNLOCK rq->lock
4241	*
4242	* __schedule() (switch to task 'p')
4243	* LOCK rq->lock smp_rmb();
4244	* smp_mb__after_spinlock();
4245	* UNLOCK rq->lock
4246	*
4247	* [task p]
4248	* STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
4249	*
4250	* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
4251	* __schedule(). See the comment for smp_mb__after_spinlock().
4252	*
4253	* A similar smp_rmb() lives in __task_needs_rq_lock().
4254	*/
4255	smp_rmb();
4256	if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
4257	break;
4258
4259	#ifdef CONFIG_SMP
4260	/*
4261	* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
4262	* possible to, falsely, observe p->on_cpu == 0.
4263	*
4264	* One must be running (->on_cpu == 1) in order to remove oneself
4265	* from the runqueue.
4266	*
4267	* __schedule() (switch to task 'p') try_to_wake_up()
4268	* STORE p->on_cpu = 1 LOAD p->on_rq
4269	* UNLOCK rq->lock
4270	*
4271	* __schedule() (put 'p' to sleep)
4272	* LOCK rq->lock smp_rmb();
4273	* smp_mb__after_spinlock();
4274	* STORE p->on_rq = 0 LOAD p->on_cpu
4275	*
4276	* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
4277	* __schedule(). See the comment for smp_mb__after_spinlock().
4278	*
4279	* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
4280	* schedule()'s deactivate_task() has 'happened' and p will no longer
4281	* care about it's own p->state. See the comment in __schedule().
4282	*/
4283	smp_acquire__after_ctrl_dep();
4284
4285	/*
4286	* We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
4287	* == 0), which means we need to do an enqueue, change p->state to
4288	* TASK_WAKING such that we can unlock p->pi_lock before doing the
4289	* enqueue, such as ttwu_queue_wakelist().
4290	*/
4291	WRITE_ONCE(p->__state, TASK_WAKING);
4292
4293	/*
4294	* If the owning (remote) CPU is still in the middle of schedule() with
4295	* this task as prev, considering queueing p on the remote CPUs wake_list
4296	* which potentially sends an IPI instead of spinning on p->on_cpu to
4297	* let the waker make forward progress. This is safe because IRQs are
4298	* disabled and the IPI will deliver after on_cpu is cleared.
4299	*
4300	* Ensure we load task_cpu(p) after p->on_cpu:
4301	*
4302	* set_task_cpu(p, cpu);
4303	* STORE p->cpu = @cpu
4304	* __schedule() (switch to task 'p')
4305	* LOCK rq->lock
4306	* smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
4307	* STORE p->on_cpu = 1 LOAD p->cpu
4308	*
4309	* to ensure we observe the correct CPU on which the task is currently
4310	* scheduling.
4311	*/
4312	if (smp_load_acquire(&p->on_cpu) &&
4313	ttwu_queue_wakelist(p, cpu: task_cpu(p), wake_flags))
4314	break;
4315
4316	/*
4317	* If the owning (remote) CPU is still in the middle of schedule() with
4318	* this task as prev, wait until it's done referencing the task.
4319	*
4320	* Pairs with the smp_store_release() in finish_task().
4321	*
4322	* This ensures that tasks getting woken will be fully ordered against
4323	* their previous state and preserve Program Order.
4324	*/
4325	smp_cond_load_acquire(&p->on_cpu, !VAL);
4326
4327	cpu = select_task_rq(p, cpu: p->wake_cpu, wake_flags: &wake_flags);
4328	if (task_cpu(p) != cpu) {
4329	if (p->in_iowait) {
4330	delayacct_blkio_end(p);
4331	atomic_dec(v: &task_rq(p)->nr_iowait);
4332	}
4333
4334	wake_flags \|= WF_MIGRATED;
4335	psi_ttwu_dequeue(p);
4336	set_task_cpu(p, new_cpu: cpu);
4337	}
4338	#else
4339	cpu = task_cpu(p);
4340	#endif /* CONFIG_SMP */
4341
4342	ttwu_queue(p, cpu, wake_flags);
4343	}
4344	out:
4345	if (success)
4346	ttwu_stat(p, cpu: task_cpu(p), wake_flags);
4347
4348	return success;
4349	}
4350
4351	static bool __task_needs_rq_lock(struct task_struct *p)
4352	{
4353	unsigned int state = READ_ONCE(p->__state);
4354
4355	/*
4356	* Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
4357	* the task is blocked. Make sure to check @state since ttwu() can drop
4358	* locks at the end, see ttwu_queue_wakelist().
4359	*/
4360	if (state == TASK_RUNNING \|\| state == TASK_WAKING)
4361	return true;
4362
4363	/*
4364	* Ensure we load p->on_rq after p->__state, otherwise it would be
4365	* possible to, falsely, observe p->on_rq == 0.
4366	*
4367	* See try_to_wake_up() for a longer comment.
4368	*/
4369	smp_rmb();
4370	if (p->on_rq)
4371	return true;
4372
4373	#ifdef CONFIG_SMP
4374	/*
4375	* Ensure the task has finished __schedule() and will not be referenced
4376	* anymore. Again, see try_to_wake_up() for a longer comment.
4377	*/
4378	smp_rmb();
4379	smp_cond_load_acquire(&p->on_cpu, !VAL);
4380	#endif
4381
4382	return false;
4383	}
4384
4385	/**
4386	* task_call_func - Invoke a function on task in fixed state
4387	* @p: Process for which the function is to be invoked, can be @current.
4388	* @func: Function to invoke.
4389	* @arg: Argument to function.
4390	*
4391	* Fix the task in it's current state by avoiding wakeups and or rq operations
4392	* and call @func(@arg) on it. This function can use task_is_runnable() and
4393	* task_curr() to work out what the state is, if required. Given that @func
4394	* can be invoked with a runqueue lock held, it had better be quite
4395	* lightweight.
4396	*
4397	* Returns:
4398	* Whatever @func returns
4399	*/
4400	int task_call_func(struct task_struct p, task_call_f func, void* *arg)
4401	{
4402	struct rq *rq = NULL;
4403	struct rq_flags rf;
4404	int ret;
4405
4406	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4407
4408	if (__task_needs_rq_lock(p))
4409	rq = __task_rq_lock(p, rf: &rf);
4410
4411	/*
4412	* At this point the task is pinned; either:
4413	* - blocked and we're holding off wakeups (pi->lock)
4414	* - woken, and we're holding off enqueue (rq->lock)
4415	* - queued, and we're holding off schedule (rq->lock)
4416	* - running, and we're holding off de-schedule (rq->lock)
4417	*
4418	* The called function (@func) can use: task_curr(), p->on_rq and
4419	* p->__state to differentiate between these states.
4420	*/
4421	ret = func(p, arg);
4422
4423	if (rq)
4424	rq_unlock(rq, rf: &rf);
4425
4426	raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
4427	return ret;
4428	}
4429
4430	/**
4431	* cpu_curr_snapshot - Return a snapshot of the currently running task
4432	* @cpu: The CPU on which to snapshot the task.
4433	*
4434	* Returns the task_struct pointer of the task "currently" running on
4435	* the specified CPU.
4436	*
4437	* If the specified CPU was offline, the return value is whatever it
4438	* is, perhaps a pointer to the task_struct structure of that CPU's idle
4439	* task, but there is no guarantee. Callers wishing a useful return
4440	* value must take some action to ensure that the specified CPU remains
4441	* online throughout.
4442	*
4443	* This function executes full memory barriers before and after fetching
4444	* the pointer, which permits the caller to confine this function's fetch
4445	* with respect to the caller's accesses to other shared variables.
4446	*/
4447	struct task_struct cpu_curr_snapshot(int* cpu)
4448	{
4449	struct rq *rq = cpu_rq(cpu);
4450	struct task_struct *t;
4451	struct rq_flags rf;
4452
4453	rq_lock_irqsave(rq, rf: &rf);
4454	smp_mb__after_spinlock(); / Pairing determined by caller's synchronization design. /
4455	t = rcu_dereference(cpu_curr(cpu));
4456	rq_unlock_irqrestore(rq, rf: &rf);
4457	smp_mb(); / Pairing determined by caller's synchronization design. /
4458
4459	return t;
4460	}
4461
4462	/**
4463	* wake_up_process - Wake up a specific process
4464	* @p: The process to be woken up.
4465	*
4466	* Attempt to wake up the nominated process and move it to the set of runnable
4467	* processes.
4468	*
4469	* Return: 1 if the process was woken up, 0 if it was already running.
4470	*
4471	* This function executes a full memory barrier before accessing the task state.
4472	*/
4473	int wake_up_process(struct task_struct *p)
4474	{
4475	return try_to_wake_up(p, TASK_NORMAL, wake_flags: `0`);
4476	}
4477	EXPORT_SYMBOL(wake_up_process);
4478
/*
 * wake_up_state - wake up @p using a caller-supplied state mask.
 *
 * Same as wake_up_process() except that @state, rather than TASK_NORMAL,
 * is passed to try_to_wake_up(). Returns whatever try_to_wake_up() returns.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}
4483
4484	/*
4485	* Perform scheduler related setup for a newly forked process p.
4486	* p is forked by current.
4487	*
4488	* __sched_fork() is basic setup which is also used by sched_init() to
4489	* initialize the boot CPU's idle task.
4490	*/
4491	static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
4492	{
4493	p->on_rq = `0`;
4494
4495	p->se.on_rq = `0`;
4496	p->se.exec_start = `0`;
4497	p->se.sum_exec_runtime = `0`;
4498	p->se.prev_sum_exec_runtime = `0`;
4499	p->se.nr_migrations = `0`;
4500	p->se.vruntime = `0`;
4501	p->se.vlag = `0`;
4502	INIT_LIST_HEAD(list: &p->se.group_node);
4503
4504	/ A delayed task cannot be in clone(). /
4505	WARN_ON_ONCE(p->se.sched_delayed);
4506
4507	#ifdef CONFIG_FAIR_GROUP_SCHED
4508	p->se.cfs_rq = NULL;
4509	#endif
4510
4511	#ifdef CONFIG_SCHEDSTATS
4512	/ Even if schedstat is disabled, there should not be garbage /
4513	memset(&p->stats, `0`, sizeof(p->stats));
4514	#endif
4515
4516	init_dl_entity(dl_se: &p->dl);
4517
4518	INIT_LIST_HEAD(list: &p->rt.run_list);
4519	p->rt.timeout = `0`;
4520	p->rt.time_slice = sched_rr_timeslice;
4521	p->rt.on_rq = `0`;
4522	p->rt.on_list = `0`;
4523
4524	#ifdef CONFIG_SCHED_CLASS_EXT
4525	init_scx_entity(&p->scx);
4526	#endif
4527
4528	#ifdef CONFIG_PREEMPT_NOTIFIERS
4529	INIT_HLIST_HEAD(&p->preempt_notifiers);
4530	#endif
4531
4532	#ifdef CONFIG_COMPACTION
4533	p->capture_control = NULL;
4534	#endif
4535	init_numa_balancing(clone_flags, p);
4536	#ifdef CONFIG_SMP
4537	p->wake_entry.u_flags = CSD_TYPE_TTWU;
4538	p->migration_pending = NULL;
4539	#endif
4540	init_sched_mm_cid(t: p);
4541	}
4542
4543	DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
4544
4545	#ifdef CONFIG_NUMA_BALANCING
4546
4547	int sysctl_numa_balancing_mode;
4548
4549	static void __set_numabalancing_state(bool enabled)
4550	{
4551	if (enabled)
4552	static_branch_enable(&sched_numa_balancing);
4553	else
4554	static_branch_disable(&sched_numa_balancing);
4555	}
4556
4557	void set_numabalancing_state(bool enabled)
4558	{
4559	if (enabled)
4560	sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
4561	else
4562	sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
4563	__set_numabalancing_state(enabled);
4564	}
4565
4566	#ifdef CONFIG_PROC_SYSCTL
4567	static void reset_memory_tiering(void)
4568	{
4569	struct pglist_data *pgdat;
4570
4571	for_each_online_pgdat(pgdat) {
4572	pgdat->nbp_threshold = `0`;
4573	pgdat->nbp_th_nr_cand = node_page_state(pgdat, item: PGPROMOTE_CANDIDATE);
4574	pgdat->nbp_th_start = jiffies_to_msecs(j: jiffies);
4575	}
4576	}
4577
4578	static int sysctl_numa_balancing(const struct ctl_table table, int* write,
4579	void buffer, size_t lenp, loff_t *ppos)
4580	{
4581	struct ctl_table t;
4582	int err;
4583	int state = sysctl_numa_balancing_mode;
4584
4585	if (write && !capable(CAP_SYS_ADMIN))
4586	return -EPERM;
4587
4588	t = *table;
4589	t.data = &state;
4590	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4591	if (err < `0`)
4592	return err;
4593	if (write) {
4594	if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
4595	(state & NUMA_BALANCING_MEMORY_TIERING))
4596	reset_memory_tiering();
4597	sysctl_numa_balancing_mode = state;
4598	__set_numabalancing_state(enabled: state);
4599	}
4600	return err;
4601	}
4602	#endif
4603	#endif
4604
4605	#ifdef CONFIG_SCHEDSTATS
4606
4607	DEFINE_STATIC_KEY_FALSE(sched_schedstats);
4608
4609	static void set_schedstats(bool enabled)
4610	{
4611	if (enabled)
4612	static_branch_enable(&sched_schedstats);
4613	else
4614	static_branch_disable(&sched_schedstats);
4615	}
4616
4617	void force_schedstat_enabled(void)
4618	{
4619	if (!schedstat_enabled()) {
4620	pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
4621	static_branch_enable(&sched_schedstats);
4622	}
4623	}
4624
4625	static int __init setup_schedstats(char *str)
4626	{
4627	int ret = `0`;
4628	if (!str)
4629	goto out;
4630
4631	if (!strcmp(str, "enable")) {
4632	set_schedstats(true);
4633	ret = `1`;
4634	} else if (!strcmp(str, "disable")) {
4635	set_schedstats(false);
4636	ret = `1`;
4637	}
4638	out:
4639	if (!ret)
4640	pr_warn("Unable to parse schedstats=\n");
4641
4642	return ret;
4643	}
4644	__setup("schedstats=", setup_schedstats);
4645
4646	#ifdef CONFIG_PROC_SYSCTL
4647	static int sysctl_schedstats(const struct ctl_table table, int* write, void *buffer,
4648	size_t lenp, loff_t ppos)
4649	{
4650	struct ctl_table t;
4651	int err;
4652	int state = static_branch_likely(&sched_schedstats);
4653
4654	if (write && !capable(CAP_SYS_ADMIN))
4655	return -EPERM;
4656
4657	t = *table;
4658	t.data = &state;
4659	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
4660	if (err < `0`)
4661	return err;
4662	if (write)
4663	set_schedstats(state);
4664	return err;
4665	}
4666	#endif /* CONFIG_PROC_SYSCTL */
4667	#endif /* CONFIG_SCHEDSTATS */
4668
4669	#ifdef CONFIG_SYSCTL
4670	static const struct ctl_table sched_core_sysctls[] = {
4671	#ifdef CONFIG_SCHEDSTATS
4672	{
4673	.procname = "sched_schedstats",
4674	.data = NULL,
4675	.maxlen = sizeof(unsigned int),
4676	.mode = `0644`,
4677	.proc_handler = sysctl_schedstats,
4678	.extra1 = SYSCTL_ZERO,
4679	.extra2 = SYSCTL_ONE,
4680	},
4681	#endif /* CONFIG_SCHEDSTATS */
4682	#ifdef CONFIG_UCLAMP_TASK
4683	{
4684	.procname = "sched_util_clamp_min",
4685	.data = &sysctl_sched_uclamp_util_min,
4686	.maxlen = sizeof(unsigned int),
4687	.mode = `0644`,
4688	.proc_handler = sysctl_sched_uclamp_handler,
4689	},
4690	{
4691	.procname = "sched_util_clamp_max",
4692	.data = &sysctl_sched_uclamp_util_max,
4693	.maxlen = sizeof(unsigned int),
4694	.mode = `0644`,
4695	.proc_handler = sysctl_sched_uclamp_handler,
4696	},
4697	{
4698	.procname = "sched_util_clamp_min_rt_default",
4699	.data = &sysctl_sched_uclamp_util_min_rt_default,
4700	.maxlen = sizeof(unsigned int),
4701	.mode = `0644`,
4702	.proc_handler = sysctl_sched_uclamp_handler,
4703	},
4704	#endif /* CONFIG_UCLAMP_TASK */
4705	#ifdef CONFIG_NUMA_BALANCING
4706	{
4707	.procname = "numa_balancing",
4708	.data = NULL, / filled in by handler /
4709	.maxlen = sizeof(unsigned int),
4710	.mode = `0644`,
4711	.proc_handler = sysctl_numa_balancing,
4712	.extra1 = SYSCTL_ZERO,
4713	.extra2 = SYSCTL_FOUR,
4714	},
4715	#endif /* CONFIG_NUMA_BALANCING */
4716	};
4717	static int __init sched_core_sysctl_init(void)
4718	{
4719	register_sysctl_init("kernel", sched_core_sysctls);
4720	return `0`;
4721	}
4722	late_initcall(sched_core_sysctl_init);
4723	#endif /* CONFIG_SYSCTL */
4724
4725	/*
4726	* fork()/clone()-time setup:
4727	*/
4728	int sched_fork(unsigned long clone_flags, struct task_struct *p)
4729	{
4730	__sched_fork(clone_flags, p);
4731	/*
4732	* We mark the process as NEW here. This guarantees that
4733	* nobody will actually run it, and a signal or other external
4734	* event cannot wake it up and insert it on the runqueue either.
4735	*/
4736	p->__state = TASK_NEW;
4737
4738	/*
4739	* Make sure we do not leak PI boosting priority to the child.
4740	*/
4741	p->prio = current->normal_prio;
4742
4743	uclamp_fork(p);
4744
4745	/*
4746	* Revert to default priority/policy on fork if requested.
4747	*/
4748	if (unlikely(p->sched_reset_on_fork)) {
4749	if (task_has_dl_policy(p) \|\| task_has_rt_policy(p)) {
4750	p->policy = SCHED_NORMAL;
4751	p->static_prio = NICE_TO_PRIO(`0`);
4752	p->rt_priority = `0`;
4753	} else if (PRIO_TO_NICE(p->static_prio) < `0`)
4754	p->static_prio = NICE_TO_PRIO(`0`);
4755
4756	p->prio = p->normal_prio = p->static_prio;
4757	set_load_weight(p, update_load: false);
4758	p->se.custom_slice = `0`;
4759	p->se.slice = sysctl_sched_base_slice;
4760
4761	/*
4762	* We don't need the reset flag anymore after the fork. It has
4763	* fulfilled its duty:
4764	*/
4765	p->sched_reset_on_fork = `0`;
4766	}
4767
4768	if (dl_prio(prio: p->prio))
4769	return -EAGAIN;
4770
4771	scx_pre_fork(p);
4772
4773	if (rt_prio(prio: p->prio)) {
4774	p->sched_class = &rt_sched_class;
4775	#ifdef CONFIG_SCHED_CLASS_EXT
4776	} else if (task_should_scx(p->policy)) {
4777	p->sched_class = &ext_sched_class;
4778	#endif
4779	} else {
4780	p->sched_class = &fair_sched_class;
4781	}
4782
4783	init_entity_runnable_average(se: &p->se);
4784
4785
4786	#ifdef CONFIG_SCHED_INFO
4787	if (likely(sched_info_on()))
4788	memset(&p->sched_info, `0`, sizeof(p->sched_info));
4789	#endif
4790	#if defined(CONFIG_SMP)
4791	p->on_cpu = `0`;
4792	#endif
4793	init_task_preempt_count(p);
4794	#ifdef CONFIG_SMP
4795	plist_node_init(node: &p->pushable_tasks, MAX_PRIO);
4796	RB_CLEAR_NODE(&p->pushable_dl_tasks);
4797	#endif
4798	return `0`;
4799	}
4800
4801	int sched_cgroup_fork(struct task_struct p, struct* kernel_clone_args *kargs)
4802	{
4803	unsigned long flags;
4804
4805	/*
4806	* Because we're not yet on the pid-hash, p->pi_lock isn't strictly
4807	* required yet, but lockdep gets upset if rules are violated.
4808	*/
4809	raw_spin_lock_irqsave(&p->pi_lock, flags);
4810	#ifdef CONFIG_CGROUP_SCHED
4811	if (`1`) {
4812	struct task_group *tg;
4813	tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
4814	struct task_group, css);
4815	tg = autogroup_task_group(p, tg);
4816	p->sched_task_group = tg;
4817	}
4818	#endif
4819	rseq_migrate(t: p);
4820	/*
4821	* We're setting the CPU for the first time, we don't migrate,
4822	* so use __set_task_cpu().
4823	*/
4824	__set_task_cpu(p, smp_processor_id());
4825	if (p->sched_class->task_fork)
4826	p->sched_class->task_fork(p);
4827	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4828
4829	return scx_fork(p);
4830	}
4831
/* Scheduler hook for a cancelled fork of @p: forwards to scx_cancel_fork(). */
void sched_cancel_fork(struct task_struct *p)
{
	scx_cancel_fork(p);
}
4836
/*
 * Scheduler hook run after fork for @p: calls the uclamp and scx
 * post-fork handlers, in that order.
 */
void sched_post_fork(struct task_struct *p)
{
	uclamp_post_fork(p);
	scx_post_fork(p);
}
4842
4843	unsigned long to_ratio(u64 period, u64 runtime)
4844	{
4845	if (runtime == RUNTIME_INF)
4846	return BW_UNIT;
4847
4848	/*
4849	* Doing this here saves a lot of checks in all
4850	* the calling paths, and returning zero seems
4851	* safe for them anyway.
4852	*/
4853	if (period == `0`)
4854	return `0`;
4855
4856	return div64_u64(dividend: runtime << BW_SHIFT, divisor: period);
4857	}
4858
4859	/*
4860	* wake_up_new_task - wake up a newly created task for the first time.
4861	*
4862	* This function will do some initial scheduler statistics housekeeping
4863	* that must be done for every newly created context, then puts the task
4864	* on the runqueue and wakes it.
4865	*/
4866	void wake_up_new_task(struct task_struct *p)
4867	{
4868	struct rq_flags rf;
4869	struct rq *rq;
4870	int wake_flags = WF_FORK;
4871
4872	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
4873	WRITE_ONCE(p->__state, TASK_RUNNING);
4874	#ifdef CONFIG_SMP
4875	/*
4876	* Fork balancing, do it here and not earlier because:
4877	* - cpus_ptr can change in the fork path
4878	* - any previously selected CPU might disappear through hotplug
4879	*
4880	* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
4881	* as we're not fully set-up yet.
4882	*/
4883	p->recent_used_cpu = task_cpu(p);
4884	rseq_migrate(t: p);
4885	__set_task_cpu(p, cpu: select_task_rq(p, cpu: task_cpu(p), wake_flags: &wake_flags));
4886	#endif
4887	rq = __task_rq_lock(p, rf: &rf);
4888	update_rq_clock(rq);
4889	post_init_entity_util_avg(p);
4890
4891	activate_task(rq, p, ENQUEUE_NOCLOCK \| ENQUEUE_INITIAL);
4892	trace_sched_wakeup_new(p);
4893	wakeup_preempt(rq, p, flags: wake_flags);
4894	#ifdef CONFIG_SMP
4895	if (p->sched_class->task_woken) {
4896	/*
4897	* Nothing relies on rq->lock after this, so it's fine to
4898	* drop it.
4899	*/
4900	rq_unpin_lock(rq, rf: &rf);
4901	p->sched_class->task_woken(rq, p);
4902	rq_repin_lock(rq, rf: &rf);
4903	}
4904	#endif
4905	task_rq_unlock(rq, p, rf: &rf);
4906	}
4907
4908	#ifdef CONFIG_PREEMPT_NOTIFIERS
4909
4910	static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
4911
4912	void preempt_notifier_inc(void)
4913	{
4914	static_branch_inc(&preempt_notifier_key);
4915	}
4916	EXPORT_SYMBOL_GPL(preempt_notifier_inc);
4917
4918	void preempt_notifier_dec(void)
4919	{
4920	static_branch_dec(&preempt_notifier_key);
4921	}
4922	EXPORT_SYMBOL_GPL(preempt_notifier_dec);
4923
4924	/**
4925	* preempt_notifier_register - tell me when current is being preempted & rescheduled
4926	* @notifier: notifier struct to register
4927	*/
4928	void preempt_notifier_register(struct preempt_notifier *notifier)
4929	{
4930	if (!static_branch_unlikely(&preempt_notifier_key))
4931	WARN(`1`, "registering preempt_notifier while notifiers disabled\n");
4932
4933	hlist_add_head(n: &notifier->link, h: &current->preempt_notifiers);
4934	}
4935	EXPORT_SYMBOL_GPL(preempt_notifier_register);
4936
4937	/**
4938	* preempt_notifier_unregister - no longer interested in preemption notifications
4939	* @notifier: notifier struct to unregister
4940	*
4941	* This is not safe to call from within a preemption notifier.
4942	*/
4943	void preempt_notifier_unregister(struct preempt_notifier *notifier)
4944	{
4945	hlist_del(n: &notifier->link);
4946	}
4947	EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
4948
4949	static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
4950	{
4951	struct preempt_notifier *notifier;
4952
4953	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4954	notifier->ops->sched_in(notifier, raw_smp_processor_id());
4955	}
4956
4957	static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
4958	{
4959	if (static_branch_unlikely(&preempt_notifier_key))
4960	__fire_sched_in_preempt_notifiers(curr);
4961	}
4962
4963	static void
4964	__fire_sched_out_preempt_notifiers(struct task_struct *curr,
4965	struct task_struct *next)
4966	{
4967	struct preempt_notifier *notifier;
4968
4969	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
4970	notifier->ops->sched_out(notifier, next);
4971	}
4972
4973	static __always_inline void
4974	fire_sched_out_preempt_notifiers(struct task_struct *curr,
4975	struct task_struct *next)
4976	{
4977	if (static_branch_unlikely(&preempt_notifier_key))
4978	__fire_sched_out_preempt_notifiers(curr, next);
4979	}
4980
4981	#else /* !CONFIG_PREEMPT_NOTIFIERS */
4982
/* No-op when CONFIG_PREEMPT_NOTIFIERS is disabled. */
static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
4986
/* No-op when CONFIG_PREEMPT_NOTIFIERS is disabled. */
static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
}
4992
4993	#endif /* CONFIG_PREEMPT_NOTIFIERS */
4994
/* Mark @next as on-CPU before switching to it (SMP only; no-op otherwise). */
static inline void prepare_task(struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * Claim the task as running, we do this before switching to it
	 * such that any running task will have this set.
	 *
	 * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
	 * its ordering comment.
	 */
	WRITE_ONCE(next->on_cpu, 1);
#endif
}
5008
/* Release @prev by clearing its on_cpu flag (SMP only; no-op otherwise). */
static inline void finish_task(struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * This must be the very last reference to @prev from this CPU. After
	 * p->on_cpu is cleared, the task can be moved to a different CPU. We
	 * must ensure this doesn't happen until the switch is completely
	 * finished.
	 *
	 * In particular, the load of prev->state in finish_task_switch() must
	 * happen before this.
	 *
	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
	 */
	smp_store_release(&prev->on_cpu, 0);
#endif
}
5026
5027	#ifdef CONFIG_SMP
5028
5029	static void do_balance_callbacks(struct rq rq, struct* balance_callback *head)
5030	{
5031	void (func)(struct* rq *rq);
5032	struct balance_callback *next;
5033
5034	lockdep_assert_rq_held(rq);
5035
5036	while (head) {
5037	func = (void ()(struct* rq *))head->func;
5038	next = head->next;
5039	head->next = NULL;
5040	head = next;
5041
5042	func(rq);
5043	}
5044	}
5045
5046	static void balance_push(struct rq *rq);
5047
5048	/*
5049	* balance_push_callback is a right abuse of the callback interface and plays
5050	* by significantly different rules.
5051	*
5052	* Where the normal balance_callback's purpose is to be ran in the same context
5053	* that queued it (only later, when it's safe to drop rq->lock again),
5054	* balance_push_callback is specifically targeted at __schedule().
5055	*
5056	* This abuse is tolerated because it places all the unlikely/odd cases behind
5057	* a single test, namely: rq->balance_callback == NULL.
5058	*/
5059	struct balance_callback balance_push_callback = {
5060	.next = NULL,
5061	.func = balance_push,
5062	};
5063
5064	static inline struct balance_callback *
5065	__splice_balance_callbacks(struct rq *rq, bool split)
5066	{
5067	struct balance_callback *head = rq->balance_callback;
5068
5069	if (likely(!head))
5070	return NULL;
5071
5072	lockdep_assert_rq_held(rq);
5073	/*
5074	* Must not take balance_push_callback off the list when
5075	* splice_balance_callbacks() and balance_callbacks() are not
5076	* in the same rq->lock section.
5077	*
5078	* In that case it would be possible for __schedule() to interleave
5079	* and observe the list empty.
5080	*/
5081	if (split && head == &balance_push_callback)
5082	head = NULL;
5083	else
5084	rq->balance_callback = NULL;
5085
5086	return head;
5087	}
5088
5089	struct balance_callback splice_balance_callbacks(struct* rq *rq)
5090	{
5091	return __splice_balance_callbacks(rq, split: true);
5092	}
5093
5094	static void __balance_callbacks(struct rq *rq)
5095	{
5096	do_balance_callbacks(rq, head: __splice_balance_callbacks(rq, split: false));
5097	}
5098
/*
 * Run a previously spliced callback list @head against @rq. Takes the
 * rq lock (IRQ-safe) around the run; does nothing when @head is NULL.
 */
void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
	unsigned long flags;

	if (unlikely(head)) {
		raw_spin_rq_lock_irqsave(rq, flags);
		do_balance_callbacks(rq, head);
		raw_spin_rq_unlock_irqrestore(rq, flags);
	}
}
5109
5110	#else
5111
/* No-op when CONFIG_SMP is disabled. */
static inline void __balance_callbacks(struct rq *rq)
{
}
5115
5116	#endif
5117
5118	static inline void
5119	prepare_lock_switch(struct rq rq, struct* task_struct next, struct* rq_flags *rf)
5120	{
5121	/*
5122	* Since the runqueue lock will be released by the next
5123	* task (which is an invalid locking op but in the case
5124	* of the scheduler it's an obvious special-case), so we
5125	* do an early lockdep release here:
5126	*/
5127	rq_unpin_lock(rq, rf);
5128	spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_);
5129	#ifdef CONFIG_DEBUG_SPINLOCK
5130	/ this is a valid case when another task releases the spinlock /
5131	rq_lockp(rq)->owner = next;
5132	#endif
5133	}
5134
5135	static inline void finish_lock_switch(struct rq *rq)
5136	{
5137	/*
5138	* If we are tracking spinlock dependencies then we have to
5139	* fix up the runqueue lock - which gets 'carried over' from
5140	* prev into current:
5141	*/
5142	spin_acquire(&__rq_lockp(rq)->dep_map, `0`, `0`, _THIS_IP_);
5143	__balance_callbacks(rq);
5144	raw_spin_rq_unlock_irq(rq);
5145	}
5146
/*
 * Fallbacks for the optional architecture context-switch hooks: each
 * expands to an empty statement unless the arch already defined it.
 */

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif

#ifndef finish_arch_post_lock_switch
# define finish_arch_post_lock_switch()	do { } while (0)
#endif
5158
/*
 * With CONFIG_KMAP_LOCAL, call __kmap_local_sched_out() when current's
 * kmap_ctrl.idx is non-zero; otherwise a no-op.
 */
static inline void kmap_local_sched_out(void)
{
#ifdef CONFIG_KMAP_LOCAL
	if (unlikely(current->kmap_ctrl.idx))
		__kmap_local_sched_out();
#endif
}
5166
/*
 * With CONFIG_KMAP_LOCAL, call __kmap_local_sched_in() when current's
 * kmap_ctrl.idx is non-zero; otherwise a no-op.
 */
static inline void kmap_local_sched_in(void)
{
#ifdef CONFIG_KMAP_LOCAL
	if (unlikely(current->kmap_ctrl.idx))
		__kmap_local_sched_in();
#endif
}
5174
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * This is called with the rq lock held and interrupts off. It must
 * be paired with a subsequent finish_task_switch after the context
 * switch.
 *
 * prepare_task_switch sets up locking and calls architecture specific
 * hooks.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	kcov_prepare_switch(prev);
	sched_info_switch(rq, prev, next);
	perf_event_task_sched_out(prev, next);
	rseq_preempt(prev);
	fire_sched_out_preempt_notifiers(prev, next);
	kmap_local_sched_out();
	prepare_task(next);
	prepare_arch_switch(next);
}
5201
5202	/**
5203	* finish_task_switch - clean up after a task-switch
5204	* @prev: the thread we just switched away from.
5205	*
5206	* finish_task_switch must be called after the context switch, paired
5207	* with a prepare_task_switch call before the context switch.
5208	* finish_task_switch will reconcile locking set up by prepare_task_switch,
5209	* and do any other architecture-specific cleanup actions.
5210	*
5211	* Note that we may have delayed dropping an mm in context_switch(). If
5212	* so, we finish that here outside of the runqueue lock. (Doing it
5213	* with the lock held can cause deadlocks; see schedule() for
5214	* details.)
5215	*
5216	* The context switch have flipped the stack from under us and restored the
5217	* local variables which were saved when this task called schedule() in the
5218	* past. 'prev == current' is still correct but we need to recalculate this_rq
5219	* because prev may have moved to another CPU.
5220	*/
5221	static struct rq finish_task_switch(struct* task_struct *prev)
5222	__releases(rq->lock)
5223	{
5224	struct rq *rq = this_rq();
5225	struct mm_struct *mm = rq->prev_mm;
5226	unsigned int prev_state;
5227
5228	/*
5229	* The previous task will have left us with a preempt_count of 2
5230	* because it left us after:
5231	*
5232	* schedule()
5233	* preempt_disable(); // 1
5234	* __schedule()
5235	* raw_spin_lock_irq(&rq->lock) // 2
5236	*
5237	* Also, see FORK_PREEMPT_COUNT.
5238	*/
5239	if (WARN_ONCE(preempt_count() != `2`*PREEMPT_DISABLE_OFFSET,
5240	"corrupted preempt_count: %s/%d/0x%x\n",
5241	current->comm, current->pid, preempt_count()))
5242	preempt_count_set(FORK_PREEMPT_COUNT);
5243
5244	rq->prev_mm = NULL;
5245
5246	/*
5247	* A task struct has one reference for the use as "current".
5248	* If a task dies, then it sets TASK_DEAD in tsk->state and calls
5249	* schedule one last time. The schedule call will never return, and
5250	* the scheduled task must drop that reference.
5251	*
5252	* We must observe prev->state before clearing prev->on_cpu (in
5253	* finish_task), otherwise a concurrent wakeup can get prev
5254	* running on another CPU and we could rave with its RUNNING -> DEAD
5255	* transition, resulting in a double drop.
5256	*/
5257	prev_state = READ_ONCE(prev->__state);
5258	vtime_task_switch(prev);
5259	perf_event_task_sched_in(prev, current);
5260	finish_task(prev);
5261	tick_nohz_task_switch();
5262	finish_lock_switch(rq);
5263	finish_arch_post_lock_switch();
5264	kcov_finish_switch(current);
5265	/*
5266	* kmap_local_sched_out() is invoked with rq::lock held and
5267	* interrupts disabled. There is no requirement for that, but the
5268	* sched out code does not have an interrupt enabled section.
5269	* Restoring the maps on sched in does not require interrupts being
5270	* disabled either.
5271	*/
5272	kmap_local_sched_in();
5273
5274	fire_sched_in_preempt_notifiers(current);
5275	/*
5276	* When switching through a kernel thread, the loop in
5277	* membarrier_{private,global}_expedited() may have observed that
5278	* kernel thread and not issued an IPI. It is therefore possible to
5279	* schedule between user->kernel->user threads without passing though
5280	* switch_mm(). Membarrier requires a barrier after storing to
5281	* rq->curr, before returning to userspace, so provide them here:
5282	*
5283	* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
5284	* provided by mmdrop_lazy_tlb(),
5285	* - a sync_core for SYNC_CORE.
5286	*/
5287	if (mm) {
5288	membarrier_mm_sync_core_before_usermode(mm);
5289	mmdrop_lazy_tlb_sched(mm);
5290	}
5291
5292	if (unlikely(prev_state == TASK_DEAD)) {
5293	if (prev->sched_class->task_dead)
5294	prev->sched_class->task_dead(prev);
5295
5296	/ Task is done with its stack. /
5297	put_task_stack(tsk: prev);
5298
5299	put_task_struct_rcu_user(task: prev);
5300	}
5301
5302	return rq;
5303	}
5304
5305	/**
5306	* schedule_tail - first thing a freshly forked thread must call.
5307	* @prev: the thread we just switched away from.
5308	*/
5309	asmlinkage __visible void schedule_tail(struct task_struct *prev)
5310	__releases(rq->lock)
5311	{
5312	/*
5313	* New tasks start with FORK_PREEMPT_COUNT, see there and
5314	* finish_task_switch() for details.
5315	*
5316	* finish_task_switch() will drop rq->lock() and lower preempt_count
5317	* and the preempt_enable() will end up enabling preemption (on
5318	* PREEMPT_COUNT kernels).
5319	*/
5320
5321	finish_task_switch(prev);
5322	/*
5323	* This is a special case: the newly created task has just
5324	* switched the context for the first time. It is returning from
5325	* schedule for the first time in this path.
5326	*/
5327	trace_sched_exit_tp(is_switch: true, CALLER_ADDR0);
5328	preempt_enable();
5329
5330	if (current->set_child_tid)
5331	put_user(task_pid_vnr(current), current->set_child_tid);
5332
5333	calculate_sigpending();
5334	}
5335
5336	/*
5337	* context_switch - switch to the new MM and the new thread's register state.
5338	*/
5339	static __always_inline struct rq *
5340	context_switch(struct rq rq, struct* task_struct *prev,
5341	struct task_struct next, struct* rq_flags *rf)
5342	{
5343	prepare_task_switch(rq, prev, next);
5344
5345	/*
5346	* For paravirt, this is coupled with an exit in switch_to to
5347	* combine the page table reload and the switch backend into
5348	* one hypercall.
5349	*/
5350	arch_start_context_switch(prev);
5351
5352	/*
5353	* kernel -> kernel lazy + transfer active
5354	* user -> kernel lazy + mmgrab_lazy_tlb() active
5355	*
5356	* kernel -> user switch + mmdrop_lazy_tlb() active
5357	* user -> user switch
5358	*
5359	* switch_mm_cid() needs to be updated if the barriers provided
5360	* by context_switch() are modified.
5361	*/
5362	if (!next->mm) { // to kernel
5363	enter_lazy_tlb(mm: prev->active_mm, tsk: next);
5364
5365	next->active_mm = prev->active_mm;
5366	if (prev->mm) // from user
5367	mmgrab_lazy_tlb(mm: prev->active_mm);
5368	else
5369	prev->active_mm = NULL;
5370	} else { // to user
5371	membarrier_switch_mm(rq, prev_mm: prev->active_mm, next_mm: next->mm);
5372	/*
5373	* sys_membarrier() requires an smp_mb() between setting
5374	* rq->curr / membarrier_switch_mm() and returning to userspace.
5375	*
5376	* The below provides this either through switch_mm(), or in
5377	* case 'prev->active_mm == next->mm' through
5378	* finish_task_switch()'s mmdrop().
5379	*/
5380	switch_mm_irqs_off(prev: prev->active_mm, next: next->mm, tsk: next);
5381	lru_gen_use_mm(mm: next->mm);
5382
5383	if (!prev->mm) { // from kernel
5384	/ will mmdrop_lazy_tlb() in finish_task_switch(). /
5385	rq->prev_mm = prev->active_mm;
5386	prev->active_mm = NULL;
5387	}
5388	}
5389
5390	/ switch_mm_cid() requires the memory barriers above. /
5391	switch_mm_cid(rq, prev, next);
5392
5393	prepare_lock_switch(rq, next, rf);
5394
5395	/ Here we just switch the register state and the stack. /
5396	switch_to(prev, next, prev);
5397	barrier();
5398
5399	return finish_task_switch(prev);
5400	}
5401
5402	/*
5403	* nr_running and nr_context_switches:
5404	*
5405	* externally visible scheduler statistics: current number of runnable
5406	* threads, total number of context switches performed since bootup.
5407	*/
5408	unsigned int nr_running(void)
5409	{
5410	unsigned int i, sum = `0`;
5411
5412	for_each_online_cpu(i)
5413	sum += cpu_rq(i)->nr_running;
5414
5415	return sum;
5416	}
5417
5418	/*
5419	* Check if only the current task is running on the CPU.
5420	*
5421	* Caution: this function does not check that the caller has disabled
5422	* preemption, thus the result might have a time-of-check-to-time-of-use
5423	* race. The caller is responsible to use it correctly, for example:
5424	*
5425	* - from a non-preemptible section (of course)
5426	*
5427	* - from a thread that is bound to a single CPU
5428	*
5429	* - in a loop with very short iterations (e.g. a polling loop)
5430	*/
5431	bool single_task_running(void)
5432	{
5433	return raw_rq()->nr_running == `1`;
5434	}
5435	EXPORT_SYMBOL(single_task_running);
5436
5437	unsigned long long nr_context_switches_cpu(int cpu)
5438	{
5439	return cpu_rq(cpu)->nr_switches;
5440	}
5441
5442	unsigned long long nr_context_switches(void)
5443	{
5444	int i;
5445	unsigned long long sum = `0`;
5446
5447	for_each_possible_cpu(i)
5448	sum += cpu_rq(i)->nr_switches;
5449
5450	return sum;
5451	}
5452
5453	/*
5454	* Consumers of these two interfaces, like for example the cpuidle menu
5455	* governor, are using nonsensical data. Preferring shallow idle state selection
5456	* for a CPU that has IO-wait which might not even end up running the task when
5457	* it does become runnable.
5458	*/
5459
5460	unsigned int nr_iowait_cpu(int cpu)
5461	{
5462	return atomic_read(v: &cpu_rq(cpu)->nr_iowait);
5463	}
5464
5465	/*
5466	* IO-wait accounting, and how it's mostly bollocks (on SMP).
5467	*
5468	* The idea behind IO-wait account is to account the idle time that we could
5469	* have spend running if it were not for IO. That is, if we were to improve the
5470	* storage performance, we'd have a proportional reduction in IO-wait time.
5471	*
5472	* This all works nicely on UP, where, when a task blocks on IO, we account
5473	* idle time as IO-wait, because if the storage were faster, it could've been
5474	* running and we'd not be idle.
5475	*
5476	* This has been extended to SMP, by doing the same for each CPU. This however
5477	* is broken.
5478	*
5479	* Imagine for instance the case where two tasks block on one CPU, only the one
5480	* CPU will have IO-wait accounted, while the other has regular idle. Even
5481	* though, if the storage were faster, both could've ran at the same time,
5482	* utilising both CPUs.
5483	*
5484	* This means, that when looking globally, the current IO-wait accounting on
5485	* SMP is a lower bound, by reason of under accounting.
5486	*
5487	* Worse, since the numbers are provided per CPU, they are sometimes
5488	* interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
5489	* associated with any one particular CPU, it can wake to another CPU than it
5490	* blocked on. This means the per CPU IO-wait number is meaningless.
5491	*
5492	* Task CPU affinities can make all that even more 'interesting'.
5493	*/
5494
5495	unsigned int nr_iowait(void)
5496	{
5497	unsigned int i, sum = `0`;
5498
5499	for_each_possible_cpu(i)
5500	sum += nr_iowait_cpu(cpu: i);
5501
5502	return sum;
5503	}
5504
5505	#ifdef CONFIG_SMP
5506
5507	/*
5508	* sched_exec - execve() is a valuable balancing opportunity, because at
5509	* this point the task has the smallest effective memory and cache footprint.
5510	*/
5511	void sched_exec(void)
5512	{
5513	struct task_struct *p = current;
5514	struct migration_arg arg;
5515	int dest_cpu;
5516
5517	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
5518	dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
5519	if (dest_cpu == smp_processor_id())
5520	return;
5521
5522	if (unlikely(!cpu_active(dest_cpu)))
5523	return;
5524
5525	arg = (struct migration_arg){ p, dest_cpu };
5526	}
5527	stop_one_cpu(cpu: task_cpu(p), fn: migration_cpu_stop, arg: &arg);
5528	}
5529
5530	#endif
5531
5532	DEFINE_PER_CPU(struct kernel_stat, kstat);
5533	DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
5534
5535	EXPORT_PER_CPU_SYMBOL(kstat);
5536	EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
5537
5538	/*
5539	* The function fair_sched_class.update_curr accesses the struct curr
5540	* and its field curr->exec_start; when called from task_sched_runtime(),
5541	* we observe a high rate of cache misses in practice.
5542	* Prefetching this data results in improved performance.
5543	*/
5544	static inline void prefetch_curr_exec_start(struct task_struct *p)
5545	{
5546	#ifdef CONFIG_FAIR_GROUP_SCHED
5547	struct sched_entity *curr = p->se.cfs_rq->curr;
5548	#else
5549	struct sched_entity *curr = task_rq(p)->cfs.curr;
5550	#endif
5551	prefetch(curr);
5552	prefetch(&curr->exec_start);
5553	}
5554
5555	/*
5556	* Return accounted runtime for the task.
5557	* In case the task is currently running, return the runtime plus current's
5558	* pending runtime that have not been accounted yet.
5559	*/
5560	unsigned long long task_sched_runtime(struct task_struct *p)
5561	{
5562	struct rq_flags rf;
5563	struct rq *rq;
5564	u64 ns;
5565
5566	#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
5567	/*
5568	* 64-bit doesn't need locks to atomically read a 64-bit value.
5569	* So we have a optimization chance when the task's delta_exec is 0.
5570	* Reading ->on_cpu is racy, but this is OK.
5571	*
5572	* If we race with it leaving CPU, we'll take a lock. So we're correct.
5573	* If we race with it entering CPU, unaccounted time is 0. This is
5574	* indistinguishable from the read occurring a few cycles earlier.
5575	* If we see ->on_cpu without ->on_rq, the task is leaving, and has
5576	* been accounted, so we're correct here as well.
5577	*/
5578	if (!p->on_cpu \|\| !task_on_rq_queued(p))
5579	return p->se.sum_exec_runtime;
5580	#endif
5581
5582	rq = task_rq_lock(p, rf: &rf);
5583	/*
5584	* Must be ->curr _and_ ->on_rq. If dequeued, we would
5585	* project cycles that may never be accounted to this
5586	* thread, breaking clock_gettime().
5587	*/
5588	if (task_current_donor(rq, p) && task_on_rq_queued(p)) {
5589	prefetch_curr_exec_start(p);
5590	update_rq_clock(rq);
5591	p->sched_class->update_curr(rq);
5592	}
5593	ns = p->se.sum_exec_runtime;
5594	task_rq_unlock(rq, p, rf: &rf);
5595
5596	return ns;
5597	}
5598
5599	static u64 cpu_resched_latency(struct rq *rq)
5600	{
5601	int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
5602	u64 resched_latency, now = rq_clock(rq);
5603	static bool warned_once;
5604
5605	if (sysctl_resched_latency_warn_once && warned_once)
5606	return `0`;
5607
5608	if (!need_resched() \|\| !latency_warn_ms)
5609	return `0`;
5610
5611	if (system_state == SYSTEM_BOOTING)
5612	return `0`;
5613
5614	if (!rq->last_seen_need_resched_ns) {
5615	rq->last_seen_need_resched_ns = now;
5616	rq->ticks_without_resched = `0`;
5617	return `0`;
5618	}
5619
5620	rq->ticks_without_resched++;
5621	resched_latency = now - rq->last_seen_need_resched_ns;
5622	if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
5623	return `0`;
5624
5625	warned_once = true;
5626
5627	return resched_latency;
5628	}
5629
5630	static int __init setup_resched_latency_warn_ms(char *str)
5631	{
5632	long val;
5633
5634	if ((kstrtol(s: str, base: `0`, res: &val))) {
5635	pr_warn("Unable to set resched_latency_warn_ms\n");
5636	return `1`;
5637	}
5638
5639	sysctl_resched_latency_warn_ms = val;
5640	return `1`;
5641	}
5642	__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
5643
5644	/*
5645	* This function gets called by the timer code, with HZ frequency.
5646	* We call it with interrupts disabled.
5647	*/
5648	void sched_tick(void)
5649	{
5650	int cpu = smp_processor_id();
5651	struct rq *rq = cpu_rq(cpu);
5652	/ accounting goes to the donor task /
5653	struct task_struct *donor;
5654	struct rq_flags rf;
5655	unsigned long hw_pressure;
5656	u64 resched_latency;
5657
5658	if (housekeeping_cpu(cpu, type: HK_TYPE_KERNEL_NOISE))
5659	arch_scale_freq_tick();
5660
5661	sched_clock_tick();
5662
5663	rq_lock(rq, rf: &rf);
5664	donor = rq->donor;
5665
5666	psi_account_irqtime(rq, curr: donor, NULL);
5667
5668	update_rq_clock(rq);
5669	hw_pressure = arch_scale_hw_pressure(cpu: cpu_of(rq));
5670	update_hw_load_avg(now: rq_clock_task(rq), rq, capacity: hw_pressure);
5671
5672	if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
5673	resched_curr(rq);
5674
5675	donor->sched_class->task_tick(rq, donor, `0`);
5676	if (sched_feat(LATENCY_WARN))
5677	resched_latency = cpu_resched_latency(rq);
5678	calc_global_load_tick(this_rq: rq);
5679	sched_core_tick(rq);
5680	task_tick_mm_cid(rq, curr: donor);
5681	scx_tick(rq);
5682
5683	rq_unlock(rq, rf: &rf);
5684
5685	if (sched_feat(LATENCY_WARN) && resched_latency)
5686	resched_latency_warn(cpu, latency: resched_latency);
5687
5688	perf_event_task_tick();
5689
5690	if (donor->flags & PF_WQ_WORKER)
5691	wq_worker_tick(task: donor);
5692
5693	#ifdef CONFIG_SMP
5694	if (!scx_switched_all()) {
5695	rq->idle_balance = idle_cpu(cpu);
5696	sched_balance_trigger(rq);
5697	}
5698	#endif
5699	}
5700
5701	#ifdef CONFIG_NO_HZ_FULL
5702
5703	struct tick_work {
5704	int cpu;
5705	atomic_t state;
5706	struct delayed_work work;
5707	};
5708	/ Values for ->state, see diagram below. /
5709	#define TICK_SCHED_REMOTE_OFFLINE 0
5710	#define TICK_SCHED_REMOTE_OFFLINING 1
5711	#define TICK_SCHED_REMOTE_RUNNING 2
5712
/*
 * State diagram for ->state:
 *
 *
 *          TICK_SCHED_REMOTE_OFFLINE
 *                    |   ^
 *                    |   |
 *                    |   | sched_tick_remote()
 *                    |   |
 *                    |   |
 *                    +--TICK_SCHED_REMOTE_OFFLINING
 *                    |   ^
 *                    |   |
 * sched_tick_start() |   | sched_tick_stop()
 *                    |   |
 *                    V   |
 *          TICK_SCHED_REMOTE_RUNNING
 *
 *
 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
 * and sched_tick_start() are happy to leave the state in RUNNING.
 */
5735
/*
 * Per-CPU array of tick_work, allocated by sched_tick_offload_init() and
 * looked up with per_cpu_ptr() by sched_tick_start()/sched_tick_stop().
 */
static struct tick_work __percpu *tick_work_cpu;
5737
5738	static void sched_tick_remote(struct work_struct *work)
5739	{
5740	struct delayed_work *dwork = to_delayed_work(work);
5741	struct tick_work twork = container_of(dwork, struct* tick_work, work);
5742	int cpu = twork->cpu;
5743	struct rq *rq = cpu_rq(cpu);
5744	int os;
5745
5746	/*
5747	* Handle the tick only if it appears the remote CPU is running in full
5748	* dynticks mode. The check is racy by nature, but missing a tick or
5749	* having one too much is no big deal because the scheduler tick updates
5750	* statistics and checks timeslices in a time-independent way, regardless
5751	* of when exactly it is running.
5752	*/
5753	if (tick_nohz_tick_stopped_cpu(cpu)) {
5754	guard(rq_lock_irq)(rq);
5755	struct task_struct *curr = rq->curr;
5756
5757	if (cpu_online(cpu)) {
5758	/*
5759	* Since this is a remote tick for full dynticks mode,
5760	* we are always sure that there is no proxy (only a
5761	* single task is running).
5762	*/
5763	WARN_ON_ONCE(rq->curr != rq->donor);
5764	update_rq_clock(rq);
5765
5766	if (!is_idle_task(curr)) {
5767	/*
5768	* Make sure the next tick runs within a
5769	* reasonable amount of time.
5770	*/
5771	u64 delta = rq_clock_task(rq) - curr->se.exec_start;
5772	WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * `3`);
5773	}
5774	curr->sched_class->task_tick(rq, curr, `0`);
5775
5776	calc_load_nohz_remote(rq);
5777	}
5778	}
5779
5780	/*
5781	* Run the remote tick once per second (1Hz). This arbitrary
5782	* frequency is large enough to avoid overload but short enough
5783	* to keep scheduler internal stats reasonably up to date. But
5784	* first update state to reflect hotplug activity if required.
5785	*/
5786	os = atomic_fetch_add_unless(&twork->state, -`1`, TICK_SCHED_REMOTE_RUNNING);
5787	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
5788	if (os == TICK_SCHED_REMOTE_RUNNING)
5789	queue_delayed_work(system_unbound_wq, dwork, HZ);
5790	}
5791
5792	static void sched_tick_start(int cpu)
5793	{
5794	int os;
5795	struct tick_work *twork;
5796
5797	if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
5798	return;
5799
5800	WARN_ON_ONCE(!tick_work_cpu);
5801
5802	twork = per_cpu_ptr(tick_work_cpu, cpu);
5803	os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
5804	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
5805	if (os == TICK_SCHED_REMOTE_OFFLINE) {
5806	twork->cpu = cpu;
5807	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
5808	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
5809	}
5810	}
5811
5812	#ifdef CONFIG_HOTPLUG_CPU
5813	static void sched_tick_stop(int cpu)
5814	{
5815	struct tick_work *twork;
5816	int os;
5817
5818	if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
5819	return;
5820
5821	WARN_ON_ONCE(!tick_work_cpu);
5822
5823	twork = per_cpu_ptr(tick_work_cpu, cpu);
5824	/ There cannot be competing actions, but don't rely on stop-machine. /
5825	os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
5826	WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
5827	/ Don't cancel, as this would mess up the state machine. /
5828	}
5829	#endif /* CONFIG_HOTPLUG_CPU */
5830
5831	int __init sched_tick_offload_init(void)
5832	{
5833	tick_work_cpu = alloc_percpu(struct tick_work);
5834	BUG_ON(!tick_work_cpu);
5835	return `0`;
5836	}
5837
5838	#else /* !CONFIG_NO_HZ_FULL */
/* Without CONFIG_NO_HZ_FULL there is no remote tick to start or stop. */
static inline void sched_tick_start(int cpu)
{
}

static inline void sched_tick_stop(int cpu)
{
}
5841	#endif
5842
5843	#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) \|\| \
5844	defined(CONFIG_TRACE_PREEMPT_TOGGLE))
5845	/*
5846	* If the value passed in is equal to the current preempt count
5847	* then we just disabled preemption. Start timing the latency.
5848	*/
5849	static inline void preempt_latency_start(int val)
5850	{
5851	if (preempt_count() == val) {
5852	unsigned long ip = get_lock_parent_ip();
5853	#ifdef CONFIG_DEBUG_PREEMPT
5854	current->preempt_disable_ip = ip;
5855	#endif
5856	trace_preempt_off(CALLER_ADDR0, a1: ip);
5857	}
5858	}
5859
/*
 * Add @val to the preempt count.  With CONFIG_DEBUG_PREEMPT the count is
 * sanity-checked before (negative count: bail out without adding) and
 * after (preempt portion close to overflowing) the update.  Finally, start
 * latency tracing if this call is what disabled preemption.
 */
void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Underflow?
	 */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
		return;
#endif
	__preempt_count_add(val);
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Spinlock count overflowing soon?
	 */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif
	preempt_latency_start(val);
}
EXPORT_SYMBOL(preempt_count_add);
NOKPROBE_SYMBOL(preempt_count_add);
5881
5882	/*
5883	* If the value passed in equals to the current preempt count
5884	* then we just enabled preemption. Stop timing the latency.
5885	*/
5886	static inline void preempt_latency_stop(int val)
5887	{
5888	if (preempt_count() == val)
5889	trace_preempt_on(CALLER_ADDR0, a1: get_lock_parent_ip());
5890	}
5891
/*
 * Subtract @val from the preempt count.  With CONFIG_DEBUG_PREEMPT the
 * request is refused (after a warning) when it would underflow the whole
 * count, or when it targets the preempt portion while that portion is
 * already zero.  Latency tracing is stopped before the count is lowered.
 */
void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/*
	 * Two underflow checks, evaluated in order; the second one (spinlock
	 * portion underflowing) is only reached when the first did not fire.
	 */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()) ||
	    DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
				!(preempt_count() & PREEMPT_MASK)))
		return;
#endif

	preempt_latency_stop(val);
	__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
NOKPROBE_SYMBOL(preempt_count_sub);
5913
5914	#else
/* Preempt latency tracing is compiled out: both hooks are no-ops. */
static inline void preempt_latency_start(int val)
{
}

static inline void preempt_latency_stop(int val)
{
}
5917	#endif
5918
/*
 * Return the IP recorded in @p->preempt_disable_ip, or 0 when
 * CONFIG_DEBUG_PREEMPT is not enabled and the field is unavailable.
 */
static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
{
#ifdef CONFIG_DEBUG_PREEMPT
	return p->preempt_disable_ip;
#else
	return 0;
#endif
}
5927
5928	/*
5929	* Print scheduling while atomic bug:
5930	*/
5931	static noinline void __schedule_bug(struct task_struct *prev)
5932	{
5933	/ Save this before calling printk(), since that will clobber it /
5934	unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
5935
5936	if (oops_in_progress)
5937	return;
5938
5939	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5940	prev->comm, prev->pid, preempt_count());
5941
5942	debug_show_held_locks(task: prev);
5943	print_modules();
5944	if (irqs_disabled())
5945	print_irqtrace_events(curr: prev);
5946	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
5947	pr_err("Preemption disabled at:");
5948	print_ip_sym(KERN_ERR, ip: preempt_disable_ip);
5949	}
5950	check_panic_on_warn(origin: "scheduling while atomic");
5951
5952	dump_stack();
5953	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5954	}
5955
5956	/*
5957	* Various schedule()-time debugging checks and statistics:
5958	*/
5959	static inline void schedule_debug(struct task_struct *prev, bool preempt)
5960	{
5961	#ifdef CONFIG_SCHED_STACK_END_CHECK
5962	if (task_stack_end_corrupted(prev))
5963	panic(fmt: "corrupted stack end detected inside scheduler\n");
5964
5965	if (task_scs_end_corrupted(tsk: prev))
5966	panic(fmt: "corrupted shadow stack detected inside scheduler\n");
5967	#endif
5968
5969	#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
5970	if (!preempt && READ_ONCE(prev->__state) && prev->non_block_count) {
5971	printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
5972	prev->comm, prev->pid, prev->non_block_count);
5973	dump_stack();
5974	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
5975	}
5976	#endif
5977
5978	if (unlikely(in_atomic_preempt_off())) {
5979	__schedule_bug(prev);
5980	preempt_count_set(PREEMPT_DISABLED);
5981	}
5982	rcu_sleep_check();
5983	WARN_ON_ONCE(ct_state() == CT_STATE_USER);
5984
5985	profile_hit(SCHED_PROFILING, ip: __builtin_return_address(`0`));
5986
5987	schedstat_inc(this_rq()->sched_count);
5988	}
5989
5990	static void prev_balance(struct rq rq, struct* task_struct *prev,
5991	struct rq_flags *rf)
5992	{
5993	const struct sched_class *start_class = prev->sched_class;
5994	const struct sched_class *class;
5995
5996	#ifdef CONFIG_SCHED_CLASS_EXT
5997	/*
5998	* SCX requires a balance() call before every pick_task() including when
5999	* waking up from SCHED_IDLE. If @start_class is below SCX, start from
6000	* SCX instead. Also, set a flag to detect missing balance() call.
6001	*/
6002	if (scx_enabled()) {
6003	rq->scx.flags \|= SCX_RQ_BAL_PENDING;
6004	if (sched_class_above(&ext_sched_class, start_class))
6005	start_class = &ext_sched_class;
6006	}
6007	#endif
6008
6009	/*
6010	* We must do the balancing pass before put_prev_task(), such
6011	* that when we release the rq->lock the task is in the same
6012	* state as before we took rq->lock.
6013	*
6014	* We can terminate the balance pass as soon as we know there is
6015	* a runnable task of @class priority or higher.
6016	*/
6017	for_active_class_range(class, start_class, &idle_sched_class) {
6018	if (class->balance && class->balance(rq, prev, rf))
6019	break;
6020	}
6021	}
6022
6023	/*
6024	* Pick up the highest-prio task:
6025	*/
6026	static inline struct task_struct *
6027	__pick_next_task(struct rq rq, struct* task_struct prev, struct* rq_flags *rf)
6028	{
6029	const struct sched_class *class;
6030	struct task_struct *p;
6031
6032	rq->dl_server = NULL;
6033
6034	if (scx_enabled())
6035	goto restart;
6036
6037	/*
6038	* Optimization: we know that if all tasks are in the fair class we can
6039	* call that function directly, but only if the @prev task wasn't of a
6040	* higher scheduling class, because otherwise those lose the
6041	* opportunity to pull in more work from other CPUs.
6042	*/
6043	if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
6044	rq->nr_running == rq->cfs.h_nr_queued)) {
6045
6046	p = pick_next_task_fair(rq, prev, rf);
6047	if (unlikely(p == RETRY_TASK))
6048	goto restart;
6049
6050	/ Assume the next prioritized class is idle_sched_class /
6051	if (!p) {
6052	p = pick_task_idle(rq);
6053	put_prev_set_next_task(rq, prev, next: p);
6054	}
6055
6056	return p;
6057	}
6058
6059	restart:
6060	prev_balance(rq, prev, rf);
6061
6062	for_each_active_class(class) {
6063	if (class->pick_next_task) {
6064	p = class->pick_next_task(rq, prev);
6065	if (p)
6066	return p;
6067	} else {
6068	p = class->pick_task(rq);
6069	if (p) {
6070	put_prev_set_next_task(rq, prev, next: p);
6071	return p;
6072	}
6073	}
6074	}
6075
6076	BUG(); / The idle class should always have a runnable task. /
6077	}
6078
6079	#ifdef CONFIG_SCHED_CORE
6080	static inline bool is_task_rq_idle(struct task_struct *t)
6081	{
6082	return (task_rq(t)->idle == t);
6083	}
6084
6085	static inline bool cookie_equals(struct task_struct a, unsigned* long cookie)
6086	{
6087	return is_task_rq_idle(t: a) \|\| (a->core_cookie == cookie);
6088	}
6089
6090	static inline bool cookie_match(struct task_struct a, struct* task_struct *b)
6091	{
6092	if (is_task_rq_idle(t: a) \|\| is_task_rq_idle(t: b))
6093	return true;
6094
6095	return a->core_cookie == b->core_cookie;
6096	}
6097
6098	static inline struct task_struct pick_task(struct* rq *rq)
6099	{
6100	const struct sched_class *class;
6101	struct task_struct *p;
6102
6103	rq->dl_server = NULL;
6104
6105	for_each_active_class(class) {
6106	p = class->pick_task(rq);
6107	if (p)
6108	return p;
6109	}
6110
6111	BUG(); / The idle class should always have a runnable task. /
6112	}
6113
6114	extern void task_vruntime_update(struct rq rq, struct* task_struct *p, bool in_fi);
6115
6116	static void queue_core_balance(struct rq *rq);
6117
6118	static struct task_struct *
6119	pick_next_task(struct rq rq, struct* task_struct prev, struct* rq_flags *rf)
6120	{
6121	struct task_struct next, p, *max = NULL;
6122	const struct cpumask *smt_mask;
6123	bool fi_before = false;
6124	bool core_clock_updated = (rq == rq->core);
6125	unsigned long cookie;
6126	int i, cpu, occ = `0`;
6127	struct rq *rq_i;
6128	bool need_sync;
6129
6130	if (!sched_core_enabled(rq))
6131	return __pick_next_task(rq, prev, rf);
6132
6133	cpu = cpu_of(rq);
6134
6135	/ Stopper task is switching into idle, no need core-wide selection. /
6136	if (cpu_is_offline(cpu)) {
6137	/*
6138	* Reset core_pick so that we don't enter the fastpath when
6139	* coming online. core_pick would already be migrated to
6140	* another cpu during offline.
6141	*/
6142	rq->core_pick = NULL;
6143	rq->core_dl_server = NULL;
6144	return __pick_next_task(rq, prev, rf);
6145	}
6146
6147	/*
6148	* If there were no {en,de}queues since we picked (IOW, the task
6149	* pointers are all still valid), and we haven't scheduled the last
6150	* pick yet, do so now.
6151	*
6152	* rq->core_pick can be NULL if no selection was made for a CPU because
6153	* it was either offline or went offline during a sibling's core-wide
6154	* selection. In this case, do a core-wide selection.
6155	*/
6156	if (rq->core->core_pick_seq == rq->core->core_task_seq &&
6157	rq->core->core_pick_seq != rq->core_sched_seq &&
6158	rq->core_pick) {
6159	WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq);
6160
6161	next = rq->core_pick;
6162	rq->dl_server = rq->core_dl_server;
6163	rq->core_pick = NULL;
6164	rq->core_dl_server = NULL;
6165	goto out_set_next;
6166	}
6167
6168	prev_balance(rq, prev, rf);
6169
6170	smt_mask = cpu_smt_mask(cpu);
6171	need_sync = !!rq->core->core_cookie;
6172
6173	/ reset state /
6174	rq->core->core_cookie = `0UL`;
6175	if (rq->core->core_forceidle_count) {
6176	if (!core_clock_updated) {
6177	update_rq_clock(rq: rq->core);
6178	core_clock_updated = true;
6179	}
6180	sched_core_account_forceidle(rq);
6181	/ reset after accounting force idle /
6182	rq->core->core_forceidle_start = `0`;
6183	rq->core->core_forceidle_count = `0`;
6184	rq->core->core_forceidle_occupation = `0`;
6185	need_sync = true;
6186	fi_before = true;
6187	}
6188
6189	/*
6190	* core->core_task_seq, core->core_pick_seq, rq->core_sched_seq
6191	*
6192	* @task_seq guards the task state ({en,de}queues)
6193	* @pick_seq is the @task_seq we did a selection on
6194	* @sched_seq is the @pick_seq we scheduled
6195	*
6196	* However, preemptions can cause multiple picks on the same task set.
6197	* 'Fix' this by also increasing @task_seq for every pick.
6198	*/
6199	rq->core->core_task_seq++;
6200
6201	/*
6202	* Optimize for common case where this CPU has no cookies
6203	* and there are no cookied tasks running on siblings.
6204	*/
6205	if (!need_sync) {
6206	next = pick_task(rq);
6207	if (!next->core_cookie) {
6208	rq->core_pick = NULL;
6209	rq->core_dl_server = NULL;
6210	/*
6211	* For robustness, update the min_vruntime_fi for
6212	* unconstrained picks as well.
6213	*/
6214	WARN_ON_ONCE(fi_before);
6215	task_vruntime_update(rq, p: next, in_fi: false);
6216	goto out_set_next;
6217	}
6218	}
6219
6220	/*
6221	* For each thread: do the regular task pick and find the max prio task
6222	* amongst them.
6223	*
6224	* Tie-break prio towards the current CPU
6225	*/
6226	for_each_cpu_wrap(i, smt_mask, cpu) {
6227	rq_i = cpu_rq(i);
6228
6229	/*
6230	* Current cpu always has its clock updated on entrance to
6231	* pick_next_task(). If the current cpu is not the core,
6232	* the core may also have been updated above.
6233	*/
6234	if (i != cpu && (rq_i != rq->core \|\| !core_clock_updated))
6235	update_rq_clock(rq: rq_i);
6236
6237	rq_i->core_pick = p = pick_task(rq: rq_i);
6238	rq_i->core_dl_server = rq_i->dl_server;
6239
6240	if (!max \|\| prio_less(a: max, b: p, in_fi: fi_before))
6241	max = p;
6242	}
6243
6244	cookie = rq->core->core_cookie = max->core_cookie;
6245
6246	/*
6247	* For each thread: try and find a runnable task that matches @max or
6248	* force idle.
6249	*/
6250	for_each_cpu(i, smt_mask) {
6251	rq_i = cpu_rq(i);
6252	p = rq_i->core_pick;
6253
6254	if (!cookie_equals(a: p, cookie)) {
6255	p = NULL;
6256	if (cookie)
6257	p = sched_core_find(rq: rq_i, cookie);
6258	if (!p)
6259	p = idle_sched_class.pick_task(rq_i);
6260	}
6261
6262	rq_i->core_pick = p;
6263	rq_i->core_dl_server = NULL;
6264
6265	if (p == rq_i->idle) {
6266	if (rq_i->nr_running) {
6267	rq->core->core_forceidle_count++;
6268	if (!fi_before)
6269	rq->core->core_forceidle_seq++;
6270	}
6271	} else {
6272	occ++;
6273	}
6274	}
6275
6276	if (schedstat_enabled() && rq->core->core_forceidle_count) {
6277	rq->core->core_forceidle_start = rq_clock(rq: rq->core);
6278	rq->core->core_forceidle_occupation = occ;
6279	}
6280
6281	rq->core->core_pick_seq = rq->core->core_task_seq;
6282	next = rq->core_pick;
6283	rq->core_sched_seq = rq->core->core_pick_seq;
6284
6285	/ Something should have been selected for current CPU /
6286	WARN_ON_ONCE(!next);
6287
6288	/*
6289	* Reschedule siblings
6290	*
6291	* NOTE: L1TF -- at this point we're no longer running the old task and
6292	* sending an IPI (below) ensures the sibling will no longer be running
6293	* their task. This ensures there is no inter-sibling overlap between
6294	* non-matching user state.
6295	*/
6296	for_each_cpu(i, smt_mask) {
6297	rq_i = cpu_rq(i);
6298
6299	/*
6300	* An online sibling might have gone offline before a task
6301	* could be picked for it, or it might be offline but later
6302	* happen to come online, but its too late and nothing was
6303	* picked for it. That's Ok - it will pick tasks for itself,
6304	* so ignore it.
6305	*/
6306	if (!rq_i->core_pick)
6307	continue;
6308
6309	/*
6310	* Update for new !FI->FI transitions, or if continuing to be in !FI:
6311	* fi_before fi update?
6312	* 0 0 1
6313	* 0 1 1
6314	* 1 0 1
6315	* 1 1 0
6316	*/
6317	if (!(fi_before && rq->core->core_forceidle_count))
6318	task_vruntime_update(rq: rq_i, p: rq_i->core_pick, in_fi: !!rq->core->core_forceidle_count);
6319
6320	rq_i->core_pick->core_occupation = occ;
6321
6322	if (i == cpu) {
6323	rq_i->core_pick = NULL;
6324	rq_i->core_dl_server = NULL;
6325	continue;
6326	}
6327
6328	/ Did we break L1TF mitigation requirements? /
6329	WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
6330
6331	if (rq_i->curr == rq_i->core_pick) {
6332	rq_i->core_pick = NULL;
6333	rq_i->core_dl_server = NULL;
6334	continue;
6335	}
6336
6337	resched_curr(rq: rq_i);
6338	}
6339
6340	out_set_next:
6341	put_prev_set_next_task(rq, prev, next);
6342	if (rq->core->core_forceidle_count && next == rq->idle)
6343	queue_core_balance(rq);
6344
6345	return next;
6346	}
6347
6348	static bool try_steal_cookie(int this, int that)
6349	{
6350	struct rq dst = cpu_rq(this), src = cpu_rq(that);
6351	struct task_struct *p;
6352	unsigned long cookie;
6353	bool success = false;
6354
6355	guard(irq)();
6356	guard(double_rq_lock)(lock: dst, lock2: src);
6357
6358	cookie = dst->core->core_cookie;
6359	if (!cookie)
6360	return false;
6361
6362	if (dst->curr != dst->idle)
6363	return false;
6364
6365	p = sched_core_find(rq: src, cookie);
6366	if (!p)
6367	return false;
6368
6369	do {
6370	if (p == src->core_pick \|\| p == src->curr)
6371	goto next;
6372
6373	if (!is_cpu_allowed(p, cpu: this))
6374	goto next;
6375
6376	if (p->core_occupation > dst->idle->core_occupation)
6377	goto next;
6378	/*
6379	* sched_core_find() and sched_core_next() will ensure
6380	* that task @p is not throttled now, we also need to
6381	* check whether the runqueue of the destination CPU is
6382	* being throttled.
6383	*/
6384	if (sched_task_is_throttled(p, cpu: this))
6385	goto next;
6386
6387	move_queued_task_locked(src_rq: src, dst_rq: dst, task: p);
6388	resched_curr(rq: dst);
6389
6390	success = true;
6391	break;
6392
6393	next:
6394	p = sched_core_next(p, cookie);
6395	} while (p);
6396
6397	return success;
6398	}
6399
6400	static bool steal_cookie_task(int cpu, struct sched_domain *sd)
6401	{
6402	int i;
6403
6404	for_each_cpu_wrap(i, sched_domain_span(sd), cpu + `1`) {
6405	if (i == cpu)
6406	continue;
6407
6408	if (need_resched())
6409	break;
6410
6411	if (try_steal_cookie(this: cpu, that: i))
6412	return true;
6413	}
6414
6415	return false;
6416	}
6417
6418	static void sched_core_balance(struct rq *rq)
6419	{
6420	struct sched_domain *sd;
6421	int cpu = cpu_of(rq);
6422
6423	guard(preempt)();
6424	guard(rcu)();
6425
6426	raw_spin_rq_unlock_irq(rq);
6427	for_each_domain(cpu, sd) {
6428	if (need_resched())
6429	break;
6430
6431	if (steal_cookie_task(cpu, sd))
6432	break;
6433	}
6434	raw_spin_rq_lock_irq(rq);
6435	}
6436
6437	static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
6438
6439	static void queue_core_balance(struct rq *rq)
6440	{
6441	if (!sched_core_enabled(rq))
6442	return;
6443
6444	if (!rq->core->core_cookie)
6445	return;
6446
6447	if (!rq->nr_running) / not forced idle /
6448	return;
6449
6450	queue_balance_callback(rq, head: &per_cpu(core_balance_head, rq->cpu), func: sched_core_balance);
6451	}
6452
6453	DEFINE_LOCK_GUARD_1(core_lock, int,
6454	sched_core_lock(*_T->lock, &_T->flags),
6455	sched_core_unlock(*_T->lock, &_T->flags),
6456	unsigned long flags)
6457
6458	static void sched_core_cpu_starting(unsigned int cpu)
6459	{
6460	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
6461	struct rq rq = cpu_rq(cpu), core_rq = NULL;
6462	int t;
6463
6464	guard(core_lock)(l: &cpu);
6465
6466	WARN_ON_ONCE(rq->core != rq);
6467
6468	/ if we're the first, we'll be our own leader /
6469	if (cpumask_weight(srcp: smt_mask) == `1`)
6470	return;
6471
6472	/ find the leader /
6473	for_each_cpu(t, smt_mask) {
6474	if (t == cpu)
6475	continue;
6476	rq = cpu_rq(t);
6477	if (rq->core == rq) {
6478	core_rq = rq;
6479	break;
6480	}
6481	}
6482
6483	if (WARN_ON_ONCE(!core_rq)) / whoopsie /
6484	return;
6485
6486	/ install and validate core_rq /
6487	for_each_cpu(t, smt_mask) {
6488	rq = cpu_rq(t);
6489
6490	if (t == cpu)
6491	rq->core = core_rq;
6492
6493	WARN_ON_ONCE(rq->core != core_rq);
6494	}
6495	}
6496
6497	static void sched_core_cpu_deactivate(unsigned int cpu)
6498	{
6499	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
6500	struct rq rq = cpu_rq(cpu), core_rq = NULL;
6501	int t;
6502
6503	guard(core_lock)(l: &cpu);
6504
6505	/ if we're the last man standing, nothing to do /
6506	if (cpumask_weight(srcp: smt_mask) == `1`) {
6507	WARN_ON_ONCE(rq->core != rq);
6508	return;
6509	}
6510
6511	/ if we're not the leader, nothing to do /
6512	if (rq->core != rq)
6513	return;
6514
6515	/ find a new leader /
6516	for_each_cpu(t, smt_mask) {
6517	if (t == cpu)
6518	continue;
6519	core_rq = cpu_rq(t);
6520	break;
6521	}
6522
6523	if (WARN_ON_ONCE(!core_rq)) / impossible /
6524	return;
6525
6526	/ copy the shared state to the new leader /
6527	core_rq->core_task_seq = rq->core_task_seq;
6528	core_rq->core_pick_seq = rq->core_pick_seq;
6529	core_rq->core_cookie = rq->core_cookie;
6530	core_rq->core_forceidle_count = rq->core_forceidle_count;
6531	core_rq->core_forceidle_seq = rq->core_forceidle_seq;
6532	core_rq->core_forceidle_occupation = rq->core_forceidle_occupation;
6533
6534	/*
6535	* Accounting edge for forced idle is handled in pick_next_task().
6536	* Don't need another one here, since the hotplug thread shouldn't
6537	* have a cookie.
6538	*/
6539	core_rq->core_forceidle_start = `0`;
6540
6541	/ install new leader /
6542	for_each_cpu(t, smt_mask) {
6543	rq = cpu_rq(t);
6544	rq->core = core_rq;
6545	}
6546	}
6547
6548	static inline void sched_core_cpu_dying(unsigned int cpu)
6549	{
6550	struct rq *rq = cpu_rq(cpu);
6551
6552	if (rq->core != rq)
6553	rq->core = rq;
6554	}
6555
6556	#else /* !CONFIG_SCHED_CORE */
6557
/* Without CONFIG_SCHED_CORE there is no per-core state to maintain on hotplug. */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
static inline void sched_core_cpu_dying(unsigned int cpu) {}
6561
/*
 * Without core scheduling, picking the next task is a plain forward to
 * __pick_next_task() with the same arguments.
 */
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	return __pick_next_task(rq, prev, rf);
}
6567
6568	#endif /* CONFIG_SCHED_CORE */
6569
/*
 * Constants for the sched_mode argument of __schedule().
 *
 * The mode argument allows RT enabled kernels to differentiate a
 * preemption from blocking on a 'sleeping' spin/rwlock.
 *
 * SM_IDLE        - passed by schedule_idle()
 * SM_NONE        - passed by schedule(), do_task_dead(), rt_mutex_schedule()
 * SM_PREEMPT     - passed by the preempt_schedule*() entry points
 * SM_RTLOCK_WAIT - passed by schedule_rtlock() on PREEMPT_RT
 *
 * __schedule() treats any mode greater than SM_NONE as a preemption for
 * tracing, schedule_debug() and RCU, but only SM_PREEMPT as a preemption
 * when deciding whether to block the previous task.
 */
#define SM_IDLE			(-1)
#define SM_NONE			0
#define SM_PREEMPT		1
#define SM_RTLOCK_WAIT		2
6580
6581	/*
6582	* Helper function for __schedule()
6583	*
6584	* If a task does not have signals pending, deactivate it
6585	* Otherwise marks the task's __state as RUNNING
6586	*/
6587	static bool try_to_block_task(struct rq rq, struct* task_struct *p,
6588	unsigned long *task_state_p)
6589	{
6590	unsigned long task_state = *task_state_p;
6591	int flags = DEQUEUE_NOCLOCK;
6592
6593	if (signal_pending_state(state: task_state, p)) {
6594	WRITE_ONCE(p->__state, TASK_RUNNING);
6595	*task_state_p = TASK_RUNNING;
6596	return false;
6597	}
6598
6599	p->sched_contributes_to_load =
6600	(task_state & TASK_UNINTERRUPTIBLE) &&
6601	!(task_state & TASK_NOLOAD) &&
6602	!(task_state & TASK_FROZEN);
6603
6604	if (unlikely(is_special_task_state(task_state)))
6605	flags \|= DEQUEUE_SPECIAL;
6606
6607	/*
6608	* __schedule() ttwu()
6609	* prev_state = prev->state; if (p->on_rq && ...)
6610	* if (prev_state) goto out;
6611	* p->on_rq = 0; smp_acquire__after_ctrl_dep();
6612	* p->state = TASK_WAKING
6613	*
6614	* Where __schedule() and ttwu() have matching control dependencies.
6615	*
6616	* After this, schedule() must not care about p->state any more.
6617	*/
6618	block_task(rq, p, flags);
6619	return true;
6620	}
6621
6622	/*
6623	* __schedule() is the main scheduler function.
6624	*
6625	* The main means of driving the scheduler and thus entering this function are:
6626	*
6627	* 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
6628	*
6629	* 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
6630	* paths. For example, see arch/x86/entry_64.S.
6631	*
6632	* To drive preemption between tasks, the scheduler sets the flag in timer
6633	* interrupt handler sched_tick().
6634	*
6635	* 3. Wakeups don't really cause entry into schedule(). They add a
6636	* task to the run-queue and that's it.
6637	*
6638	* Now, if the new task added to the run-queue preempts the current
6639	* task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
6640	* called on the nearest possible occasion:
6641	*
6642	* - If the kernel is preemptible (CONFIG_PREEMPTION=y):
6643	*
6644	* - in syscall or exception context, at the next outmost
6645	* preempt_enable(). (this might be as soon as the wake_up()'s
6646	* spin_unlock()!)
6647	*
6648	* - in IRQ context, return from interrupt-handler to
6649	* preemptible context
6650	*
6651	* - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
6652	* then at the next:
6653	*
6654	* - cond_resched() call
6655	* - explicit schedule() call
6656	* - return from syscall or exception to user-space
6657	* - return from interrupt-handler to user-space
6658	*
6659	* WARNING: must be called with preemption disabled!
6660	*/
6661	static void __sched notrace __schedule(int sched_mode)
6662	{
6663	struct task_struct prev, next;
6664	/*
6665	* On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
6666	* as a preemption by schedule_debug() and RCU.
6667	*/
6668	bool preempt = sched_mode > SM_NONE;
6669	bool is_switch = false;
6670	unsigned long *switch_count;
6671	unsigned long prev_state;
6672	struct rq_flags rf;
6673	struct rq *rq;
6674	int cpu;
6675
6676	trace_sched_entry_tp(preempt, CALLER_ADDR0);
6677
6678	cpu = smp_processor_id();
6679	rq = cpu_rq(cpu);
6680	prev = rq->curr;
6681
6682	schedule_debug(prev, preempt);
6683
6684	if (sched_feat(HRTICK) \|\| sched_feat(HRTICK_DL))
6685	hrtick_clear(rq);
6686
6687	klp_sched_try_switch(curr: prev);
6688
6689	local_irq_disable();
6690	rcu_note_context_switch(preempt);
6691
6692	/*
6693	* Make sure that signal_pending_state()->signal_pending() below
6694	* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
6695	* done by the caller to avoid the race with signal_wake_up():
6696	*
6697	* __set_current_state(@state) signal_wake_up()
6698	* schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
6699	* wake_up_state(p, state)
6700	* LOCK rq->lock LOCK p->pi_state
6701	* smp_mb__after_spinlock() smp_mb__after_spinlock()
6702	* if (signal_pending_state()) if (p->state & @state)
6703	*
6704	* Also, the membarrier system call requires a full memory barrier
6705	* after coming from user-space, before storing to rq->curr; this
6706	* barrier matches a full barrier in the proximity of the membarrier
6707	* system call exit.
6708	*/
6709	rq_lock(rq, rf: &rf);
6710	smp_mb__after_spinlock();
6711
6712	/ Promote REQ to ACT /
6713	rq->clock_update_flags <<= `1`;
6714	update_rq_clock(rq);
6715	rq->clock_update_flags = RQCF_UPDATED;
6716
6717	switch_count = &prev->nivcsw;
6718
6719	/ Task state changes only considers SM_PREEMPT as preemption /
6720	preempt = sched_mode == SM_PREEMPT;
6721
6722	/*
6723	* We must load prev->state once (task_struct::state is volatile), such
6724	* that we form a control dependency vs deactivate_task() below.
6725	*/
6726	prev_state = READ_ONCE(prev->__state);
6727	if (sched_mode == SM_IDLE) {
6728	/ SCX must consult the BPF scheduler to tell if rq is empty /
6729	if (!rq->nr_running && !scx_enabled()) {
6730	next = prev;
6731	goto picked;
6732	}
6733	} else if (!preempt && prev_state) {
6734	try_to_block_task(rq, p: prev, task_state_p: &prev_state);
6735	switch_count = &prev->nvcsw;
6736	}
6737
6738	next = pick_next_task(rq, prev, rf: &rf);
6739	rq_set_donor(rq, t: next);
6740	picked:
6741	clear_tsk_need_resched(tsk: prev);
6742	clear_preempt_need_resched();
6743	rq->last_seen_need_resched_ns = `0`;
6744
6745	is_switch = prev != next;
6746	if (likely(is_switch)) {
6747	rq->nr_switches++;
6748	/*
6749	* RCU users of rcu_dereference(rq->curr) may not see
6750	* changes to task_struct made by pick_next_task().
6751	*/
6752	RCU_INIT_POINTER(rq->curr, next);
6753	/*
6754	* The membarrier system call requires each architecture
6755	* to have a full memory barrier after updating
6756	* rq->curr, before returning to user-space.
6757	*
6758	* Here are the schemes providing that barrier on the
6759	* various architectures:
6760	* - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
6761	* RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
6762	* on PowerPC and on RISC-V.
6763	* - finish_lock_switch() for weakly-ordered
6764	* architectures where spin_unlock is a full barrier,
6765	* - switch_to() for arm64 (weakly-ordered, spin_unlock
6766	* is a RELEASE barrier),
6767	*
6768	* The barrier matches a full barrier in the proximity of
6769	* the membarrier system call entry.
6770	*
6771	* On RISC-V, this barrier pairing is also needed for the
6772	* SYNC_CORE command when switching between processes, cf.
6773	* the inline comments in membarrier_arch_switch_mm().
6774	*/
6775	++*switch_count;
6776
6777	migrate_disable_switch(rq, p: prev);
6778	psi_account_irqtime(rq, curr: prev, prev: next);
6779	psi_sched_switch(prev, next, sleep: !task_on_rq_queued(p: prev) \|\|
6780	prev->se.sched_delayed);
6781
6782	trace_sched_switch(preempt, prev, next, prev_state);
6783
6784	/ Also unlocks the rq: /
6785	rq = context_switch(rq, prev, next, rf: &rf);
6786	} else {
6787	rq_unpin_lock(rq, rf: &rf);
6788	__balance_callbacks(rq);
6789	raw_spin_rq_unlock_irq(rq);
6790	}
6791	trace_sched_exit_tp(is_switch, CALLER_ADDR0);
6792	}
6793
6794	void __noreturn do_task_dead(void)
6795	{
6796	/ Causes final put_task_struct in finish_task_switch(): /
6797	set_special_state(TASK_DEAD);
6798
6799	/ Tell freezer to ignore us: /
6800	current->flags \|= PF_NOFREEZE;
6801
6802	__schedule(SM_NONE);
6803	BUG();
6804
6805	/ Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: /
6806	for (;;)
6807	cpu_relax();
6808	}
6809
6810	static inline void sched_submit_work(struct task_struct *tsk)
6811	{
6812	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
6813	unsigned int task_flags;
6814
6815	/*
6816	* Establish LD_WAIT_CONFIG context to ensure none of the code called
6817	* will use a blocking primitive -- which would lead to recursion.
6818	*/
6819	lock_map_acquire_try(&sched_map);
6820
6821	task_flags = tsk->flags;
6822	/*
6823	* If a worker goes to sleep, notify and ask workqueue whether it
6824	* wants to wake up a task to maintain concurrency.
6825	*/
6826	if (task_flags & PF_WQ_WORKER)
6827	wq_worker_sleeping(task: tsk);
6828	else if (task_flags & PF_IO_WORKER)
6829	io_wq_worker_sleeping(tsk);
6830
6831	/*
6832	* spinlock and rwlock must not flush block requests. This will
6833	* deadlock if the callback attempts to acquire a lock which is
6834	* already acquired.
6835	*/
6836	WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT);
6837
6838	/*
6839	* If we are going to sleep and we have plugged IO queued,
6840	* make sure to submit it to avoid deadlocks.
6841	*/
6842	blk_flush_plug(plug: tsk->plug, async: true);
6843
6844	lock_map_release(&sched_map);
6845	}
6846
6847	static void sched_update_worker(struct task_struct *tsk)
6848	{
6849	if (tsk->flags & (PF_WQ_WORKER \| PF_IO_WORKER \| PF_BLOCK_TS)) {
6850	if (tsk->flags & PF_BLOCK_TS)
6851	blk_plug_invalidate_ts(tsk);
6852	if (tsk->flags & PF_WQ_WORKER)
6853	wq_worker_running(task: tsk);
6854	else if (tsk->flags & PF_IO_WORKER)
6855	io_wq_worker_running(tsk);
6856	}
6857	}
6858
6859	static __always_inline void __schedule_loop(int sched_mode)
6860	{
6861	do {
6862	preempt_disable();
6863	__schedule(sched_mode);
6864	sched_preempt_enable_no_resched();
6865	} while (need_resched());
6866	}
6867
6868	asmlinkage __visible void __sched schedule(void)
6869	{
6870	struct task_struct *tsk = current;
6871
6872	#ifdef CONFIG_RT_MUTEXES
6873	lockdep_assert(!tsk->sched_rt_mutex);
6874	#endif
6875
6876	if (!task_is_running(tsk))
6877	sched_submit_work(tsk);
6878	__schedule_loop(SM_NONE);
6879	sched_update_worker(tsk);
6880	}
6881	EXPORT_SYMBOL(schedule);
6882
6883	/*
6884	* synchronize_rcu_tasks() makes sure that no task is stuck in preempted
6885	* state (have scheduled out non-voluntarily) by making sure that all
6886	* tasks have either left the run queue or have gone into user space.
6887	* As idle tasks do not do either, they must not ever be preempted
6888	* (schedule out non-voluntarily).
6889	*
6890	* schedule_idle() is similar to schedule_preempt_disable() except that it
6891	* never enables preemption because it does not call sched_submit_work().
6892	*/
6893	void __sched schedule_idle(void)
6894	{
6895	/*
6896	* As this skips calling sched_submit_work(), which the idle task does
6897	* regardless because that function is a NOP when the task is in a
6898	* TASK_RUNNING state, make sure this isn't used someplace that the
6899	* current task can be in any other state. Note, idle is always in the
6900	* TASK_RUNNING state.
6901	*/
6902	WARN_ON_ONCE(current->__state);
6903	do {
6904	__schedule(SM_IDLE);
6905	} while (need_resched());
6906	}
6907
#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
/*
 * schedule_user - schedule() wrapped in an exception_enter()/exit() pair
 *
 * Only built when user context tracking is enabled and is not handled
 * off-stack.
 */
asmlinkage __visible void __sched schedule_user(void)
{
	/*
	 * If we come here after a random call to set_need_resched(),
	 * or we have been woken up remotely but the IPI has not yet arrived,
	 * we haven't yet exited the RCU idle mode. Do it here manually until
	 * we find a better solution.
	 *
	 * NB: There are buggy callers of this function.  Ideally we
	 * should warn if prev_state != CT_STATE_USER, but that will trigger
	 * too frequently to make sense yet.
	 */
	enum ctx_state prev_state = exception_enter();
	schedule();
	exception_exit(prev_state);
}
#endif
6926
6927	/**
6928	* schedule_preempt_disabled - called with preemption disabled
6929	*
6930	* Returns with preemption disabled. Note: preempt_count must be 1
6931	*/
6932	void __sched schedule_preempt_disabled(void)
6933	{
6934	sched_preempt_enable_no_resched();
6935	schedule();
6936	preempt_disable();
6937	}
6938
#ifdef CONFIG_PREEMPT_RT
/*
 * schedule_rtlock - scheduling entry point that passes SM_RTLOCK_WAIT
 *
 * Unlike schedule(), this does not call sched_submit_work() or
 * sched_update_worker().
 */
void __sched notrace schedule_rtlock(void)
{
	__schedule_loop(SM_RTLOCK_WAIT);
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif
6946
6947	static void __sched notrace preempt_schedule_common(void)
6948	{
6949	do {
6950	/*
6951	* Because the function tracer can trace preempt_count_sub()
6952	* and it also uses preempt_enable/disable_notrace(), if
6953	* NEED_RESCHED is set, the preempt_enable_notrace() called
6954	* by the function tracer will call this function again and
6955	* cause infinite recursion.
6956	*
6957	* Preemption must be disabled here before the function
6958	* tracer can trace. Break up preempt_disable() into two
6959	* calls. One to disable preemption without fear of being
6960	* traced. The other to still record the preemption latency,
6961	* which can also be traced by the function tracer.
6962	*/
6963	preempt_disable_notrace();
6964	preempt_latency_start(val: `1`);
6965	__schedule(SM_PREEMPT);
6966	preempt_latency_stop(val: `1`);
6967	preempt_enable_no_resched_notrace();
6968
6969	/*
6970	* Check again in case we missed a preemption opportunity
6971	* between schedule and now.
6972	*/
6973	} while (need_resched());
6974	}
6975
6976	#ifdef CONFIG_PREEMPTION
6977	/*
6978	* This is the entry point to schedule() from in-kernel preemption
6979	* off of preempt_enable.
6980	*/
6981	asmlinkage __visible void __sched notrace preempt_schedule(void)
6982	{
6983	/*
6984	* If there is a non-zero preempt_count or interrupts are disabled,
6985	* we do not want to preempt the current task. Just return..
6986	*/
6987	if (likely(!preemptible()))
6988	return;
6989	preempt_schedule_common();
6990	}
6991	NOKPROBE_SYMBOL(preempt_schedule);
6992	EXPORT_SYMBOL(preempt_schedule);
6993
#ifdef CONFIG_PREEMPT_DYNAMIC
/*
 * Runtime-switchable preempt_schedule(): either a static call that defaults
 * to preempt_schedule(), or a static key (default true) gating a wrapper.
 */
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#ifndef preempt_schedule_dynamic_enabled
#define preempt_schedule_dynamic_enabled	preempt_schedule
#define preempt_schedule_dynamic_disabled	NULL
#endif
DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
void __sched notrace dynamic_preempt_schedule(void)
{
	if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
		return;
	preempt_schedule();
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule);
EXPORT_SYMBOL(dynamic_preempt_schedule);
#endif
#endif
7014
7015	/**
7016	* preempt_schedule_notrace - preempt_schedule called by tracing
7017	*
7018	* The tracing infrastructure uses preempt_enable_notrace to prevent
7019	* recursion and tracing preempt enabling caused by the tracing
7020	* infrastructure itself. But as tracing can happen in areas coming
7021	* from userspace or just about to enter userspace, a preempt enable
7022	* can occur before user_exit() is called. This will cause the scheduler
7023	* to be called when the system is still in usermode.
7024	*
7025	* To prevent this, the preempt_enable_notrace will use this function
7026	* instead of preempt_schedule() to exit user context if needed before
7027	* calling the scheduler.
7028	*/
7029	asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
7030	{
7031	enum ctx_state prev_ctx;
7032
7033	if (likely(!preemptible()))
7034	return;
7035
7036	do {
7037	/*
7038	* Because the function tracer can trace preempt_count_sub()
7039	* and it also uses preempt_enable/disable_notrace(), if
7040	* NEED_RESCHED is set, the preempt_enable_notrace() called
7041	* by the function tracer will call this function again and
7042	* cause infinite recursion.
7043	*
7044	* Preemption must be disabled here before the function
7045	* tracer can trace. Break up preempt_disable() into two
7046	* calls. One to disable preemption without fear of being
7047	* traced. The other to still record the preemption latency,
7048	* which can also be traced by the function tracer.
7049	*/
7050	preempt_disable_notrace();
7051	preempt_latency_start(val: `1`);
7052	/*
7053	* Needs preempt disabled in case user_exit() is traced
7054	* and the tracer calls preempt_enable_notrace() causing
7055	* an infinite recursion.
7056	*/
7057	prev_ctx = exception_enter();
7058	__schedule(SM_PREEMPT);
7059	exception_exit(prev_ctx);
7060
7061	preempt_latency_stop(val: `1`);
7062	preempt_enable_no_resched_notrace();
7063	} while (need_resched());
7064	}
7065	EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
7066
#ifdef CONFIG_PREEMPT_DYNAMIC
/*
 * Runtime-switchable preempt_schedule_notrace(): either a static call that
 * defaults to preempt_schedule_notrace(), or a static key (default true)
 * gating a wrapper.
 */
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#ifndef preempt_schedule_notrace_dynamic_enabled
#define preempt_schedule_notrace_dynamic_enabled	preempt_schedule_notrace
#define preempt_schedule_notrace_dynamic_disabled	NULL
#endif
DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
void __sched notrace dynamic_preempt_schedule_notrace(void)
{
	if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
		return;
	preempt_schedule_notrace();
}
NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
#endif
#endif
7087
7088	#endif /* CONFIG_PREEMPTION */
7089
7090	/*
7091	* This is the entry point to schedule() from kernel preemption
7092	* off of IRQ context.
7093	* Note, that this is called and return with IRQs disabled. This will
7094	* protect us against recursive calling from IRQ contexts.
7095	*/
7096	asmlinkage __visible void __sched preempt_schedule_irq(void)
7097	{
7098	enum ctx_state prev_state;
7099
7100	/ Catch callers which need to be fixed /
7101	BUG_ON(preempt_count() \|\| !irqs_disabled());
7102
7103	prev_state = exception_enter();
7104
7105	do {
7106	preempt_disable();
7107	local_irq_enable();
7108	__schedule(SM_PREEMPT);
7109	local_irq_disable();
7110	sched_preempt_enable_no_resched();
7111	} while (need_resched());
7112
7113	exception_exit(prev_ctx: prev_state);
7114	}
7115
7116	int default_wake_function(wait_queue_entry_t curr, unsigned* mode, int wake_flags,
7117	void *key)
7118	{
7119	WARN_ON_ONCE(wake_flags & ~(WF_SYNC\|WF_CURRENT_CPU));
7120	return try_to_wake_up(p: curr->private, state: mode, wake_flags);
7121	}
7122	EXPORT_SYMBOL(default_wake_function);
7123
7124	const struct sched_class __setscheduler_class(int* policy, int prio)
7125	{
7126	if (dl_prio(prio))
7127	return &dl_sched_class;
7128
7129	if (rt_prio(prio))
7130	return &rt_sched_class;
7131
7132	#ifdef CONFIG_SCHED_CLASS_EXT
7133	if (task_should_scx(policy))
7134	return &ext_sched_class;
7135	#endif
7136
7137	return &fair_sched_class;
7138	}
7139
7140	#ifdef CONFIG_RT_MUTEXES
7141
/*
 * Would be more useful with typeof()/auto_type but they don't mix with
 * bit-fields. Since it's a local thing, use int. Keep the generic sounding
 * name such that if someone were to implement this function we get to compare
 * notes.
 *
 * Stores @v into @x and evaluates to the previous value of @x (as an int).
 * Not atomic; @x is evaluated twice.
 */
#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
7149
7150	void rt_mutex_pre_schedule(void)
7151	{
7152	lockdep_assert(!fetch_and_set(current->sched_rt_mutex, `1`));
7153	sched_submit_work(current);
7154	}
7155
7156	void rt_mutex_schedule(void)
7157	{
7158	lockdep_assert(current->sched_rt_mutex);
7159	__schedule_loop(SM_NONE);
7160	}
7161
7162	void rt_mutex_post_schedule(void)
7163	{
7164	sched_update_worker(current);
7165	lockdep_assert(fetch_and_set(current->sched_rt_mutex, `0`));
7166	}
7167
7168	/*
7169	* rt_mutex_setprio - set the current priority of a task
7170	* @p: task to boost
7171	* @pi_task: donor task
7172	*
7173	* This function changes the 'effective' priority of a task. It does
7174	* not touch ->normal_prio like __setscheduler().
7175	*
7176	* Used by the rt_mutex code to implement priority inheritance
7177	* logic. Call site only calls if the priority of the task changed.
7178	*/
7179	void rt_mutex_setprio(struct task_struct p, struct* task_struct *pi_task)
7180	{
7181	int prio, oldprio, queued, running, queue_flag =
7182	DEQUEUE_SAVE \| DEQUEUE_MOVE \| DEQUEUE_NOCLOCK;
7183	const struct sched_class prev_class, next_class;
7184	struct rq_flags rf;
7185	struct rq *rq;
7186
7187	/ XXX used to be waiter->prio, not waiter->task->prio /
7188	prio = __rt_effective_prio(pi_task, prio: p->normal_prio);
7189
7190	/*
7191	* If nothing changed; bail early.
7192	*/
7193	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
7194	return;
7195
7196	rq = __task_rq_lock(p, rf: &rf);
7197	update_rq_clock(rq);
7198	/*
7199	* Set under pi_lock && rq->lock, such that the value can be used under
7200	* either lock.
7201	*
7202	* Note that there is loads of tricky to make this pointer cache work
7203	* right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
7204	* ensure a task is de-boosted (pi_task is set to NULL) before the
7205	* task is allowed to run again (and can exit). This ensures the pointer
7206	* points to a blocked task -- which guarantees the task is present.
7207	*/
7208	p->pi_top_task = pi_task;
7209
7210	/*
7211	* For FIFO/RR we only need to set prio, if that matches we're done.
7212	*/
7213	if (prio == p->prio && !dl_prio(prio))
7214	goto out_unlock;
7215
7216	/*
7217	* Idle task boosting is a no-no in general. There is one
7218	* exception, when PREEMPT_RT and NOHZ is active:
7219	*
7220	* The idle task calls get_next_timer_interrupt() and holds
7221	* the timer wheel base->lock on the CPU and another CPU wants
7222	* to access the timer (probably to cancel it). We can safely
7223	* ignore the boosting request, as the idle CPU runs this code
7224	* with interrupts disabled and will complete the lock
7225	* protected section without being interrupted. So there is no
7226	* real need to boost.
7227	*/
7228	if (unlikely(p == rq->idle)) {
7229	WARN_ON(p != rq->curr);
7230	WARN_ON(p->pi_blocked_on);
7231	goto out_unlock;
7232	}
7233
7234	trace_sched_pi_setprio(tsk: p, pi_task);
7235	oldprio = p->prio;
7236
7237	if (oldprio == prio)
7238	queue_flag &= ~DEQUEUE_MOVE;
7239
7240	prev_class = p->sched_class;
7241	next_class = __setscheduler_class(policy: p->policy, prio);
7242
7243	if (prev_class != next_class && p->se.sched_delayed)
7244	dequeue_task(rq, p, DEQUEUE_SLEEP \| DEQUEUE_DELAYED \| DEQUEUE_NOCLOCK);
7245
7246	queued = task_on_rq_queued(p);
7247	running = task_current_donor(rq, p);
7248	if (queued)
7249	dequeue_task(rq, p, flags: queue_flag);
7250	if (running)
7251	put_prev_task(rq, prev: p);
7252
7253	/*
7254	* Boosting condition are:
7255	* 1. -rt task is running and holds mutex A
7256	* --> -dl task blocks on mutex A
7257	*
7258	* 2. -dl task is running and holds mutex A
7259	* --> -dl task blocks on mutex A and could preempt the
7260	* running task
7261	*/
7262	if (dl_prio(prio)) {
7263	if (!dl_prio(prio: p->normal_prio) \|\|
7264	(pi_task && dl_prio(prio: pi_task->prio) &&
7265	dl_entity_preempt(a: &pi_task->dl, b: &p->dl))) {
7266	p->dl.pi_se = pi_task->dl.pi_se;
7267	queue_flag \|= ENQUEUE_REPLENISH;
7268	} else {
7269	p->dl.pi_se = &p->dl;
7270	}
7271	} else if (rt_prio(prio)) {
7272	if (dl_prio(prio: oldprio))
7273	p->dl.pi_se = &p->dl;
7274	if (oldprio < prio)
7275	queue_flag \|= ENQUEUE_HEAD;
7276	} else {
7277	if (dl_prio(prio: oldprio))
7278	p->dl.pi_se = &p->dl;
7279	if (rt_prio(prio: oldprio))
7280	p->rt.timeout = `0`;
7281	}
7282
7283	p->sched_class = next_class;
7284	p->prio = prio;
7285
7286	check_class_changing(rq, p, prev_class);
7287
7288	if (queued)
7289	enqueue_task(rq, p, flags: queue_flag);
7290	if (running)
7291	set_next_task(rq, next: p);
7292
7293	check_class_changed(rq, p, prev_class, oldprio);
7294	out_unlock:
7295	/ Avoid rq from going away on us: /
7296	preempt_disable();
7297
7298	rq_unpin_lock(rq, rf: &rf);
7299	__balance_callbacks(rq);
7300	raw_spin_rq_unlock(rq);
7301
7302	preempt_enable();
7303	}
7304	#endif
7305
7306	#if !defined(CONFIG_PREEMPTION) \|\| defined(CONFIG_PREEMPT_DYNAMIC)
7307	int __sched __cond_resched(void)
7308	{
7309	if (should_resched(preempt_offset: `0`) && !irqs_disabled()) {
7310	preempt_schedule_common();
7311	return `1`;
7312	}
7313	/*
7314	* In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick
7315	* whether the current CPU is in an RCU read-side critical section,
7316	* so the tick can report quiescent states even for CPUs looping
7317	* in kernel context. In contrast, in non-preemptible kernels,
7318	* RCU readers leave no in-memory hints, which means that CPU-bound
7319	* processes executing in kernel context might never report an
7320	* RCU quiescent state. Therefore, the following code causes
7321	* cond_resched() to report a quiescent state, but only when RCU
7322	* is in urgent need of one.
7323	* A third case, preemptible, but non-PREEMPT_RCU provides for
7324	* urgently needed quiescent states via rcu_flavor_sched_clock_irq().
7325	*/
7326	#ifndef CONFIG_PREEMPT_RCU
7327	rcu_all_qs();
7328	#endif
7329	return `0`;
7330	}
7331	EXPORT_SYMBOL(__cond_resched);
7332	#endif
7333
#ifdef CONFIG_PREEMPT_DYNAMIC
/*
 * Runtime-switchable cond_resched() and might_resched(): either static calls
 * (enabled target __cond_resched, disabled target "return 0"), or static
 * keys (default false) gating wrappers around __cond_resched().
 */
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define cond_resched_dynamic_enabled	__cond_resched
#define cond_resched_dynamic_disabled	((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);

#define might_resched_dynamic_enabled	__cond_resched
#define might_resched_dynamic_disabled	((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_cond_resched))
		return 0;
	return __cond_resched();
}
EXPORT_SYMBOL(dynamic_cond_resched);

static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
int __sched dynamic_might_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_might_resched))
		return 0;
	return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
#endif
#endif
7365
7366	/*
7367	* __cond_resched_lock() - if a reschedule is pending, drop the given lock,
7368	* call schedule, and on return reacquire the lock.
7369	*
7370	* This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
7371	* operations here to prevent schedule() from being called twice (once via
7372	* spin_unlock(), once by hand).
7373	*/
7374	int __cond_resched_lock(spinlock_t *lock)
7375	{
7376	int resched = should_resched(PREEMPT_LOCK_OFFSET);
7377	int ret = `0`;
7378
7379	lockdep_assert_held(lock);
7380
7381	if (spin_needbreak(lock) \|\| resched) {
7382	spin_unlock(lock);
7383	if (!_cond_resched())
7384	cpu_relax();
7385	ret = `1`;
7386	spin_lock(lock);
7387	}
7388	return ret;
7389	}
7390	EXPORT_SYMBOL(__cond_resched_lock);
7391
7392	int __cond_resched_rwlock_read(rwlock_t *lock)
7393	{
7394	int resched = should_resched(PREEMPT_LOCK_OFFSET);
7395	int ret = `0`;
7396
7397	lockdep_assert_held_read(lock);
7398
7399	if (rwlock_needbreak(lock) \|\| resched) {
7400	read_unlock(lock);
7401	if (!_cond_resched())
7402	cpu_relax();
7403	ret = `1`;
7404	read_lock(lock);
7405	}
7406	return ret;
7407	}
7408	EXPORT_SYMBOL(__cond_resched_rwlock_read);
7409
7410	int __cond_resched_rwlock_write(rwlock_t *lock)
7411	{
7412	int resched = should_resched(PREEMPT_LOCK_OFFSET);
7413	int ret = `0`;
7414
7415	lockdep_assert_held_write(lock);
7416
7417	if (rwlock_needbreak(lock) \|\| resched) {
7418	write_unlock(lock);
7419	if (!_cond_resched())
7420	cpu_relax();
7421	ret = `1`;
7422	write_lock(lock);
7423	}
7424	return ret;
7425	}
7426	EXPORT_SYMBOL(__cond_resched_rwlock_write);
7427
7428	#ifdef CONFIG_PREEMPT_DYNAMIC
7429
7430	#ifdef CONFIG_GENERIC_ENTRY
7431	#include <linux/entry-common.h>
7432	#endif
7433
7434	/*
7435	* SC:cond_resched
7436	* SC:might_resched
7437	* SC:preempt_schedule
7438	* SC:preempt_schedule_notrace
7439	* SC:irqentry_exit_cond_resched
7440	*
7441	*
7442	* NONE:
7443	* cond_resched <- __cond_resched
7444	* might_resched <- RET0
7445	* preempt_schedule <- NOP
7446	* preempt_schedule_notrace <- NOP
7447	* irqentry_exit_cond_resched <- NOP
7448	* dynamic_preempt_lazy <- false
7449	*
7450	* VOLUNTARY:
7451	* cond_resched <- __cond_resched
7452	* might_resched <- __cond_resched
7453	* preempt_schedule <- NOP
7454	* preempt_schedule_notrace <- NOP
7455	* irqentry_exit_cond_resched <- NOP
7456	* dynamic_preempt_lazy <- false
7457	*
7458	* FULL:
7459	* cond_resched <- RET0
7460	* might_resched <- RET0
7461	* preempt_schedule <- preempt_schedule
7462	* preempt_schedule_notrace <- preempt_schedule_notrace
7463	* irqentry_exit_cond_resched <- irqentry_exit_cond_resched
7464	* dynamic_preempt_lazy <- false
7465	*
7466	* LAZY:
7467	* cond_resched <- RET0
7468	* might_resched <- RET0
7469	* preempt_schedule <- preempt_schedule
7470	* preempt_schedule_notrace <- preempt_schedule_notrace
7471	* irqentry_exit_cond_resched <- irqentry_exit_cond_resched
7472	* dynamic_preempt_lazy <- true
7473	*/
7474
/*
 * Runtime-selectable preemption models. The non-negative values are used as
 * indices into preempt_modes[], so the order here must match that table.
 */
enum {
	preempt_dynamic_undefined = -1,
	preempt_dynamic_none,
	preempt_dynamic_voluntary,
	preempt_dynamic_full,
	preempt_dynamic_lazy,
};

/* Currently selected model; stays "undefined" until one has been applied. */
int preempt_dynamic_mode = preempt_dynamic_undefined;
7484
7485	int sched_dynamic_mode(const char *str)
7486	{
7487	#ifndef CONFIG_PREEMPT_RT
7488	if (!strcmp(str, "none"))
7489	return preempt_dynamic_none;
7490
7491	if (!strcmp(str, "voluntary"))
7492	return preempt_dynamic_voluntary;
7493	#endif
7494
7495	if (!strcmp(str, "full"))
7496	return preempt_dynamic_full;
7497
7498	#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
7499	if (!strcmp(str, "lazy"))
7500	return preempt_dynamic_lazy;
7501	#endif
7502
7503	return -EINVAL;
7504	}
7505
/* Flip the static key sk_dynamic_<f> on or off. */
#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)

/*
 * preempt_dynamic_enable()/preempt_dynamic_disable() switch entry point <f>
 * using whichever mechanism the build provides: a static call retargeted to
 * <f>_dynamic_enabled / <f>_dynamic_disabled, or the sk_dynamic_<f> static
 * key. Builds offering neither are rejected at compile time.
 */
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
#else
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
7518
/* Held by sched_dynamic_update() around __sched_dynamic_update(). */
static DEFINE_MUTEX(sched_dynamic_mutex);

/*
 * __sched_dynamic_update - switch the preemption entry points to @mode
 * @mode: one of the preempt_dynamic_* values
 *
 * Every entry point is enabled first (and the lazy key disabled), then the
 * per-mode pattern is applied and @mode is recorded in preempt_dynamic_mode.
 * A "Dynamic Preempt: ..." line is logged only when @mode differs from the
 * previously recorded mode. A @mode matching no case is still recorded and
 * leaves the all-enabled state in place.
 */
static void __sched_dynamic_update(int mode)
{
	/*
	 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
	 * the ZERO state, which is invalid.
	 */
	preempt_dynamic_enable(cond_resched);
	preempt_dynamic_enable(might_resched);
	preempt_dynamic_enable(preempt_schedule);
	preempt_dynamic_enable(preempt_schedule_notrace);
	preempt_dynamic_enable(irqentry_exit_cond_resched);
	preempt_dynamic_key_disable(preempt_lazy);

	switch (mode) {
	case preempt_dynamic_none:
		/* Only cond_resched() stays active. */
		preempt_dynamic_enable(cond_resched);
		preempt_dynamic_disable(might_resched);
		preempt_dynamic_disable(preempt_schedule);
		preempt_dynamic_disable(preempt_schedule_notrace);
		preempt_dynamic_disable(irqentry_exit_cond_resched);
		preempt_dynamic_key_disable(preempt_lazy);
		if (mode != preempt_dynamic_mode)
			pr_info("Dynamic Preempt: none\n");
		break;

	case preempt_dynamic_voluntary:
		/* cond_resched() and might_resched() stay active. */
		preempt_dynamic_enable(cond_resched);
		preempt_dynamic_enable(might_resched);
		preempt_dynamic_disable(preempt_schedule);
		preempt_dynamic_disable(preempt_schedule_notrace);
		preempt_dynamic_disable(irqentry_exit_cond_resched);
		preempt_dynamic_key_disable(preempt_lazy);
		if (mode != preempt_dynamic_mode)
			pr_info("Dynamic Preempt: voluntary\n");
		break;

	case preempt_dynamic_full:
		/* Preemption entry points active, voluntary points off. */
		preempt_dynamic_disable(cond_resched);
		preempt_dynamic_disable(might_resched);
		preempt_dynamic_enable(preempt_schedule);
		preempt_dynamic_enable(preempt_schedule_notrace);
		preempt_dynamic_enable(irqentry_exit_cond_resched);
		preempt_dynamic_key_disable(preempt_lazy);
		if (mode != preempt_dynamic_mode)
			pr_info("Dynamic Preempt: full\n");
		break;

	case preempt_dynamic_lazy:
		/* Same as full, plus the preempt_lazy key switched on. */
		preempt_dynamic_disable(cond_resched);
		preempt_dynamic_disable(might_resched);
		preempt_dynamic_enable(preempt_schedule);
		preempt_dynamic_enable(preempt_schedule_notrace);
		preempt_dynamic_enable(irqentry_exit_cond_resched);
		preempt_dynamic_key_enable(preempt_lazy);
		if (mode != preempt_dynamic_mode)
			pr_info("Dynamic Preempt: lazy\n");
		break;
	}

	preempt_dynamic_mode = mode;
}
7582
7583	void sched_dynamic_update(int mode)
7584	{
7585	mutex_lock(&sched_dynamic_mutex);
7586	__sched_dynamic_update(mode);
7587	mutex_unlock(lock: &sched_dynamic_mutex);
7588	}
7589
7590	static int __init setup_preempt_mode(char *str)
7591	{
7592	int mode = sched_dynamic_mode(str);
7593	if (mode < `0`) {
7594	pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
7595	return `0`;
7596	}
7597
7598	sched_dynamic_update(mode);
7599	return `1`;
7600	}
7601	__setup("preempt=", setup_preempt_mode);
7602
7603	static void __init preempt_dynamic_init(void)
7604	{
7605	if (preempt_dynamic_mode == preempt_dynamic_undefined) {
7606	if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
7607	sched_dynamic_update(mode: preempt_dynamic_none);
7608	} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
7609	sched_dynamic_update(mode: preempt_dynamic_voluntary);
7610	} else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
7611	sched_dynamic_update(mode: preempt_dynamic_lazy);
7612	} else {
7613	/ Default static call setting, nothing to do /
7614	WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
7615	preempt_dynamic_mode = preempt_dynamic_full;
7616	pr_info("Dynamic Preempt: full\n");
7617	}
7618	}
7619	}
7620
/*
 * PREEMPT_MODEL_ACCESSOR(mode) - generate and export preempt_model_<mode>()
 *
 * The generated function returns true when preempt_dynamic_mode equals
 * preempt_dynamic_<mode>. It warns once if it is called while the mode is
 * still undefined.
 */
#define PREEMPT_MODEL_ACCESSOR(mode) \
bool preempt_model_##mode(void) \
{ \
WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
return preempt_dynamic_mode == preempt_dynamic_##mode; \
} \
EXPORT_SYMBOL_GPL(preempt_model_##mode)

PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
PREEMPT_MODEL_ACCESSOR(lazy);
7633
7634	#else /* !CONFIG_PREEMPT_DYNAMIC: */
7635
/*
 * Without CONFIG_PREEMPT_DYNAMIC there is no mode variable: code that still
 * names preempt_dynamic_mode sees the constant -1.
 */
#define preempt_dynamic_mode -1

/* Nothing to initialise without CONFIG_PREEMPT_DYNAMIC. */
static inline void preempt_dynamic_init(void) { }
7639
7640	#endif /* CONFIG_PREEMPT_DYNAMIC */
7641
/*
 * Preemption mode names, NULL-terminated. preempt_model_str() indexes this
 * table with preempt_dynamic_mode.
 */
const char *preempt_modes[] = {
	"none",
	"voluntary",
	"full",
	"lazy",
	NULL,
};
7645
7646	const char preempt_model_str(void*)
7647	{
7648	bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) &&
7649	(IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) \|\|
7650	IS_ENABLED(CONFIG_PREEMPT_LAZY));
7651	static char buf[`128`];
7652
7653	if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) {
7654	struct seq_buf s;
7655
7656	seq_buf_init(s: &s, buf, size: sizeof(buf));
7657	seq_buf_puts(s: &s, str: "PREEMPT");
7658
7659	if (IS_ENABLED(CONFIG_PREEMPT_RT))
7660	seq_buf_printf(s: &s, fmt: "%sRT%s",
7661	brace ? "_{" : "_",
7662	brace ? "," : "");
7663
7664	if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
7665	seq_buf_printf(s: &s, fmt: "(%s)%s",
7666	preempt_dynamic_mode > `0` ?
7667	preempt_modes[preempt_dynamic_mode] : "undef",
7668	brace ? "}" : "");
7669	return seq_buf_str(s: &s);
7670	}
7671
7672	if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
7673	seq_buf_printf(s: &s, fmt: "LAZY%s",
7674	brace ? "}" : "");
7675	return seq_buf_str(s: &s);
7676	}
7677
7678	return seq_buf_str(s: &s);
7679	}
7680
7681	if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD))
7682	return "VOLUNTARY";
7683
7684	return "NONE";
7685	}
7686
7687	int io_schedule_prepare(void)
7688	{
7689	int old_iowait = current->in_iowait;
7690
7691	current->in_iowait = `1`;
7692	blk_flush_plug(current->plug, async: true);
7693	return old_iowait;
7694	}
7695
/*
 * io_schedule_finish - restore current's iowait flag
 * @token: value previously returned by io_schedule_prepare()
 */
void io_schedule_finish(int token)
{
current->in_iowait = token;
}
7700
7701	/*
7702	* This task is about to go to sleep on IO. Increment rq->nr_iowait so
7703	* that process accounting knows that this is a task in IO wait state.
7704	*/
7705	long __sched io_schedule_timeout(long timeout)
7706	{
7707	int token;
7708	long ret;
7709
7710	token = io_schedule_prepare();
7711	ret = schedule_timeout(timeout);
7712	io_schedule_finish(token);
7713
7714	return ret;
7715	}
7716	EXPORT_SYMBOL(io_schedule_timeout);
7717
7718	void __sched io_schedule(void)
7719	{
7720	int token;
7721
7722	token = io_schedule_prepare();
7723	schedule();
7724	io_schedule_finish(token);
7725	}
7726	EXPORT_SYMBOL(io_schedule);
7727
7728	void sched_show_task(struct task_struct *p)
7729	{
7730	unsigned long free;
7731	int ppid;
7732
7733	if (!try_get_task_stack(tsk: p))
7734	return;
7735
7736	pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
7737
7738	if (task_is_running(p))
7739	pr_cont(" running task ");
7740	free = stack_not_used(p);
7741	ppid = `0`;
7742	rcu_read_lock();
7743	if (pid_alive(p))
7744	ppid = task_pid_nr(rcu_dereference(p->real_parent));
7745	rcu_read_unlock();
7746	pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n",
7747	free, task_pid_nr(p), task_tgid_nr(p),
7748	ppid, p->flags, read_task_thread_flags(p));
7749
7750	print_worker_info(KERN_INFO, task: p);
7751	print_stop_info(KERN_INFO, task: p);
7752	print_scx_info(KERN_INFO, p);
7753	show_stack(task: p, NULL, KERN_INFO);
7754	put_task_stack(tsk: p);
7755	}
7756	EXPORT_SYMBOL_GPL(sched_show_task);
7757
7758	static inline bool
7759	state_filter_match(unsigned long state_filter, struct task_struct *p)
7760	{
7761	unsigned int state = READ_ONCE(p->__state);
7762
7763	/ no filter, everything matches /
7764	if (!state_filter)
7765	return true;
7766
7767	/ filter, but doesn't match /
7768	if (!(state & state_filter))
7769	return false;
7770
7771	/*
7772	* When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
7773	* TASK_KILLABLE).
7774	*/
7775	if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
7776	return false;
7777
7778	return true;
7779	}
7780
7781
7782	void show_state_filter(unsigned int state_filter)
7783	{
7784	struct task_struct g, p;
7785
7786	rcu_read_lock();
7787	for_each_process_thread(g, p) {
7788	/*
7789	* reset the NMI-timeout, listing all files on a slow
7790	* console might take a lot of time:
7791	* Also, reset softlockup watchdogs on all CPUs, because
7792	* another CPU might be blocked waiting for us to process
7793	* an IPI.
7794	*/
7795	touch_nmi_watchdog();
7796	touch_all_softlockup_watchdogs();
7797	if (state_filter_match(state_filter, p))
7798	sched_show_task(p);
7799	}
7800
7801	if (!state_filter)
7802	sysrq_sched_debug_show();
7803
7804	rcu_read_unlock();
7805	/*
7806	* Only show locks if all tasks are dumped:
7807	*/
7808	if (!state_filter)
7809	debug_show_all_locks();
7810	}
7811
7812	/**
7813	* init_idle - set up an idle thread for a given CPU
7814	* @idle: task in question
7815	* @cpu: CPU the idle task belongs to
7816	*
7817	* NOTE: this function does not set the idle thread's NEED_RESCHED
7818	* flag, to make booting more robust.
7819	*/
7820	void __init init_idle(struct task_struct idle, int* cpu)
7821	{
7822	#ifdef CONFIG_SMP
7823	struct affinity_context ac = (struct affinity_context) {
7824	.new_mask = cpumask_of(cpu),
7825	.flags = `0`,
7826	};
7827	#endif
7828	struct rq *rq = cpu_rq(cpu);
7829	unsigned long flags;
7830
7831	raw_spin_lock_irqsave(&idle->pi_lock, flags);
7832	raw_spin_rq_lock(rq);
7833
7834	idle->__state = TASK_RUNNING;
7835	idle->se.exec_start = sched_clock();
7836	/*
7837	* PF_KTHREAD should already be set at this point; regardless, make it
7838	* look like a proper per-CPU kthread.
7839	*/
7840	idle->flags \|= PF_KTHREAD \| PF_NO_SETAFFINITY;
7841	kthread_set_per_cpu(k: idle, cpu);
7842
7843	#ifdef CONFIG_SMP
7844	/*
7845	* No validation and serialization required at boot time and for
7846	* setting up the idle tasks of not yet online CPUs.
7847	*/
7848	set_cpus_allowed_common(p: idle, ctx: &ac);
7849	#endif
7850	/*
7851	* We're having a chicken and egg problem, even though we are
7852	* holding rq->lock, the CPU isn't yet set to this CPU so the
7853	* lockdep check in task_group() will fail.
7854	*
7855	* Similar case to sched_fork(). / Alternatively we could
7856	* use task_rq_lock() here and obtain the other rq->lock.
7857	*
7858	* Silence PROVE_RCU
7859	*/
7860	rcu_read_lock();
7861	__set_task_cpu(p: idle, cpu);
7862	rcu_read_unlock();
7863
7864	rq->idle = idle;
7865	rq_set_donor(rq, t: idle);
7866	rcu_assign_pointer(rq->curr, idle);
7867	idle->on_rq = TASK_ON_RQ_QUEUED;
7868	#ifdef CONFIG_SMP
7869	idle->on_cpu = `1`;
7870	#endif
7871	raw_spin_rq_unlock(rq);
7872	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
7873
7874	/ Set the preempt count _outside_ the spinlocks! /
7875	init_idle_preempt_count(idle, cpu);
7876
7877	/*
7878	* The idle tasks have their own, simple scheduling class:
7879	*/
7880	idle->sched_class = &idle_sched_class;
7881	ftrace_graph_init_idle_task(t: idle, cpu);
7882	vtime_init_idle(tsk: idle, cpu);
7883	#ifdef CONFIG_SMP
7884	sprintf(buf: idle->comm, fmt: "%s/%d", INIT_TASK_COMM, cpu);
7885	#endif
7886	}
7887
7888	#ifdef CONFIG_SMP
7889
/*
 * cpuset_cpumask_can_shrink - may a cpuset's CPU mask go from @cur to @trial?
 * @cur: current CPU mask
 * @trial: proposed CPU mask
 *
 * An empty @cur can always shrink (returns 1). Otherwise the decision is
 * delegated to dl_cpuset_cpumask_can_shrink() and its result is returned.
 */
int cpuset_cpumask_can_shrink(const struct cpumask *cur,
			      const struct cpumask *trial)
{
	if (cpumask_empty(cur))
		return 1;

	return dl_cpuset_cpumask_can_shrink(cur, trial);
}
7902
7903	int task_can_attach(struct task_struct *p)
7904	{
7905	int ret = `0`;
7906
7907	/*
7908	* Kthreads which disallow setaffinity shouldn't be moved
7909	* to a new cpuset; we don't want to change their CPU
7910	* affinity and isolating such threads by their set of
7911	* allowed nodes is unnecessary. Thus, cpusets are not
7912	* applicable for such threads. This prevents checking for
7913	* success of set_cpus_allowed_ptr() on all attached tasks
7914	* before cpus_mask may be changed.
7915	*/
7916	if (p->flags & PF_NO_SETAFFINITY)
7917	ret = -EINVAL;
7918
7919	return ret;
7920	}
7921
/* Set to true at the end of sched_init_smp(). */
bool sched_smp_initialized __read_mostly;
7923
7924	#ifdef CONFIG_NUMA_BALANCING
7925	/ Migrate current task p to target_cpu /
7926	int migrate_task_to(struct task_struct p, int* target_cpu)
7927	{
7928	struct migration_arg arg = { p, target_cpu };
7929	int curr_cpu = task_cpu(p);
7930
7931	if (curr_cpu == target_cpu)
7932	return `0`;
7933
7934	if (!cpumask_test_cpu(cpu: target_cpu, cpumask: p->cpus_ptr))
7935	return -EINVAL;
7936
7937	__schedstat_inc(p->stats.numa_task_migrated);
7938	count_vm_numa_event(NUMA_TASK_MIGRATE);
7939	count_memcg_event_mm(mm: p->mm, idx: NUMA_TASK_MIGRATE);
7940	trace_sched_move_numa(tsk: p, src_cpu: curr_cpu, dst_cpu: target_cpu);
7941	return stop_one_cpu(cpu: curr_cpu, fn: migration_cpu_stop, arg: &arg);
7942	}
7943
7944	/*
7945	* Requeue a task on a given node and accurately track the number of NUMA
7946	* tasks on the runqueues
7947	*/
7948	void sched_setnuma(struct task_struct p, int* nid)
7949	{
7950	bool queued, running;
7951	struct rq_flags rf;
7952	struct rq *rq;
7953
7954	rq = task_rq_lock(p, rf: &rf);
7955	queued = task_on_rq_queued(p);
7956	running = task_current_donor(rq, p);
7957
7958	if (queued)
7959	dequeue_task(rq, p, DEQUEUE_SAVE);
7960	if (running)
7961	put_prev_task(rq, prev: p);
7962
7963	p->numa_preferred_nid = nid;
7964
7965	if (queued)
7966	enqueue_task(rq, p, ENQUEUE_RESTORE \| ENQUEUE_NOCLOCK);
7967	if (running)
7968	set_next_task(rq, next: p);
7969	task_rq_unlock(rq, p, rf: &rf);
7970	}
7971	#endif /* CONFIG_NUMA_BALANCING */
7972
7973	#ifdef CONFIG_HOTPLUG_CPU
7974	/*
7975	* Invoked on the outgoing CPU in context of the CPU hotplug thread
7976	* after ensuring that there are no user space tasks left on the CPU.
7977	*
7978	* If there is a lazy mm in use on the hotplug thread, drop it and
7979	* switch to init_mm.
7980	*
7981	* The reference count on init_mm is dropped in finish_cpu().
7982	*/
7983	static void sched_force_init_mm(void)
7984	{
7985	struct mm_struct *mm = current->active_mm;
7986
7987	if (mm != &init_mm) {
7988	mmgrab_lazy_tlb(mm: &init_mm);
7989	local_irq_disable();
7990	current->active_mm = &init_mm;
7991	switch_mm_irqs_off(prev: mm, next: &init_mm, current);
7992	local_irq_enable();
7993	finish_arch_post_lock_switch();
7994	mmdrop_lazy_tlb(mm);
7995	}
7996
7997	/ finish_cpu(), as ran on the BP, will clean up the active_mm state /
7998	}
7999
8000	static int __balance_push_cpu_stop(void *arg)
8001	{
8002	struct task_struct *p = arg;
8003	struct rq *rq = this_rq();
8004	struct rq_flags rf;
8005	int cpu;
8006
8007	raw_spin_lock_irq(&p->pi_lock);
8008	rq_lock(rq, rf: &rf);
8009
8010	update_rq_clock(rq);
8011
8012	if (task_rq(p) == rq && task_on_rq_queued(p)) {
8013	cpu = select_fallback_rq(cpu: rq->cpu, p);
8014	rq = __migrate_task(rq, rf: &rf, p, dest_cpu: cpu);
8015	}
8016
8017	rq_unlock(rq, rf: &rf);
8018	raw_spin_unlock_irq(&p->pi_lock);
8019
8020	put_task_struct(t: p);
8021
8022	return `0`;
8023	}
8024
/* Per-CPU stopper work item handed to stop_one_cpu_nowait() by balance_push(). */
static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
8026
8027	/*
8028	* Ensure we only run per-cpu kthreads once the CPU goes !active.
8029	*
8030	* This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
8031	* effective when the hotplug motion is down.
8032	*/
8033	static void balance_push(struct rq *rq)
8034	{
8035	struct task_struct *push_task = rq->curr;
8036
8037	lockdep_assert_rq_held(rq);
8038
8039	/*
8040	* Ensure the thing is persistent until balance_push_set(.on = false);
8041	*/
8042	rq->balance_callback = &balance_push_callback;
8043
8044	/*
8045	* Only active while going offline and when invoked on the outgoing
8046	* CPU.
8047	*/
8048	if (!cpu_dying(cpu: rq->cpu) \|\| rq != this_rq())
8049	return;
8050
8051	/*
8052	* Both the cpu-hotplug and stop task are in this case and are
8053	* required to complete the hotplug process.
8054	*/
8055	if (kthread_is_per_cpu(k: push_task) \|\|
8056	is_migration_disabled(p: push_task)) {
8057
8058	/*
8059	* If this is the idle task on the outgoing CPU try to wake
8060	* up the hotplug control thread which might wait for the
8061	* last task to vanish. The rcuwait_active() check is
8062	* accurate here because the waiter is pinned on this CPU
8063	* and can't obviously be running in parallel.
8064	*
8065	* On RT kernels this also has to check whether there are
8066	* pinned and scheduled out tasks on the runqueue. They
8067	* need to leave the migrate disabled section first.
8068	*/
8069	if (!rq->nr_running && !rq_has_pinned_tasks(rq) &&
8070	rcuwait_active(w: &rq->hotplug_wait)) {
8071	raw_spin_rq_unlock(rq);
8072	rcuwait_wake_up(w: &rq->hotplug_wait);
8073	raw_spin_rq_lock(rq);
8074	}
8075	return;
8076	}
8077
8078	get_task_struct(t: push_task);
8079	/*
8080	* Temporarily drop rq->lock such that we can wake-up the stop task.
8081	* Both preemption and IRQs are still disabled.
8082	*/
8083	preempt_disable();
8084	raw_spin_rq_unlock(rq);
8085	stop_one_cpu_nowait(cpu: rq->cpu, fn: __balance_push_cpu_stop, arg: push_task,
8086	this_cpu_ptr(&push_work));
8087	preempt_enable();
8088	/*
8089	* At this point need_resched() is true and we'll take the loop in
8090	* schedule(). The next pick is obviously going to be the stop task
8091	* which kthread_is_per_cpu() and will push this task away.
8092	*/
8093	raw_spin_rq_lock(rq);
8094	}
8095
8096	static void balance_push_set(int cpu, bool on)
8097	{
8098	struct rq *rq = cpu_rq(cpu);
8099	struct rq_flags rf;
8100
8101	rq_lock_irqsave(rq, rf: &rf);
8102	if (on) {
8103	WARN_ON_ONCE(rq->balance_callback);
8104	rq->balance_callback = &balance_push_callback;
8105	} else if (rq->balance_callback == &balance_push_callback) {
8106	rq->balance_callback = NULL;
8107	}
8108	rq_unlock_irqrestore(rq, rf: &rf);
8109	}
8110
8111	/*
8112	* Invoked from a CPUs hotplug control thread after the CPU has been marked
8113	* inactive. All tasks which are not per CPU kernel threads are either
8114	* pushed off this CPU now via balance_push() or placed on a different CPU
8115	* during wakeup. Wait until the CPU is quiescent.
8116	*/
8117	static void balance_hotplug_wait(void)
8118	{
8119	struct rq *rq = this_rq();
8120
8121	rcuwait_wait_event(&rq->hotplug_wait,
8122	rq->nr_running == `1` && !rq_has_pinned_tasks(rq),
8123	TASK_UNINTERRUPTIBLE);
8124	}
8125
8126	#else
8127
/* !CONFIG_HOTPLUG_CPU: no tasks ever need pushing, so this is a no-op. */
static inline void balance_push(struct rq *rq)
{
}
8131
/* !CONFIG_HOTPLUG_CPU: there is no balance_push callback to toggle. */
static inline void balance_push_set(int cpu, bool on)
{
}
8135
/* !CONFIG_HOTPLUG_CPU: nothing to wait for. */
static inline void balance_hotplug_wait(void)
{
}
8139
8140	#endif /* CONFIG_HOTPLUG_CPU */
8141
8142	void set_rq_online(struct rq *rq)
8143	{
8144	if (!rq->online) {
8145	const struct sched_class *class;
8146
8147	cpumask_set_cpu(cpu: rq->cpu, dstp: rq->rd->online);
8148	rq->online = `1`;
8149
8150	for_each_class(class) {
8151	if (class->rq_online)
8152	class->rq_online(rq);
8153	}
8154	}
8155	}
8156
8157	void set_rq_offline(struct rq *rq)
8158	{
8159	if (rq->online) {
8160	const struct sched_class *class;
8161
8162	update_rq_clock(rq);
8163	for_each_class(class) {
8164	if (class->rq_offline)
8165	class->rq_offline(rq);
8166	}
8167
8168	cpumask_clear_cpu(cpu: rq->cpu, dstp: rq->rd->online);
8169	rq->online = `0`;
8170	}
8171	}
8172
8173	static inline void sched_set_rq_online(struct rq rq, int* cpu)
8174	{
8175	struct rq_flags rf;
8176
8177	rq_lock_irqsave(rq, rf: &rf);
8178	if (rq->rd) {
8179	BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
8180	set_rq_online(rq);
8181	}
8182	rq_unlock_irqrestore(rq, rf: &rf);
8183	}
8184
8185	static inline void sched_set_rq_offline(struct rq rq, int* cpu)
8186	{
8187	struct rq_flags rf;
8188
8189	rq_lock_irqsave(rq, rf: &rf);
8190	if (rq->rd) {
8191	BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
8192	set_rq_offline(rq);
8193	}
8194	rq_unlock_irqrestore(rq, rf: &rf);
8195	}
8196
/*
 * used to mark begin/end of suspend/resume:
 * incremented in cpuset_cpu_inactive() and decremented in
 * cpuset_cpu_active() while cpuhp_tasks_frozen is set.
 */
static int num_cpus_frozen;
8201
8202	/*
8203	* Update cpusets according to cpu_active mask. If cpusets are
8204	* disabled, cpuset_update_active_cpus() becomes a simple wrapper
8205	* around partition_sched_domains().
8206	*
8207	* If we come here as part of a suspend/resume, don't touch cpusets because we
8208	* want to restore it back to its original state upon resume anyway.
8209	*/
8210	static void cpuset_cpu_active(void)
8211	{
8212	if (cpuhp_tasks_frozen) {
8213	/*
8214	* num_cpus_frozen tracks how many CPUs are involved in suspend
8215	* resume sequence. As long as this is not the last online
8216	* operation in the resume sequence, just build a single sched
8217	* domain, ignoring cpusets.
8218	*/
8219	cpuset_reset_sched_domains();
8220	if (--num_cpus_frozen)
8221	return;
8222	/*
8223	* This is the last CPU online operation. So fall through and
8224	* restore the original sched domains by considering the
8225	* cpuset configurations.
8226	*/
8227	cpuset_force_rebuild();
8228	}
8229	cpuset_update_active_cpus();
8230	}
8231
8232	static void cpuset_cpu_inactive(unsigned int cpu)
8233	{
8234	if (!cpuhp_tasks_frozen) {
8235	cpuset_update_active_cpus();
8236	} else {
8237	num_cpus_frozen++;
8238	cpuset_reset_sched_domains();
8239	}
8240	}
8241
/*
 * sched_smt_present_inc - bump sched_smt_present for @cpu's core
 * @cpu: CPU being activated
 *
 * Only acts with CONFIG_SCHED_SMT, and only when @cpu's SMT mask holds
 * exactly two CPUs at the time of the call.
 */
static inline void sched_smt_present_inc(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
		static_branch_inc_cpuslocked(&sched_smt_present);
#endif
}
8249
/*
 * sched_smt_present_dec - drop sched_smt_present for @cpu's core
 * @cpu: CPU being deactivated
 *
 * Only acts with CONFIG_SCHED_SMT, and only when @cpu's SMT mask holds
 * exactly two CPUs at the time of the call.
 */
static inline void sched_smt_present_dec(int cpu)
{
#ifdef CONFIG_SCHED_SMT
	if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
		static_branch_dec_cpuslocked(&sched_smt_present);
#endif
}
8257
8258	int sched_cpu_activate(unsigned int cpu)
8259	{
8260	struct rq *rq = cpu_rq(cpu);
8261
8262	/*
8263	* Clear the balance_push callback and prepare to schedule
8264	* regular tasks.
8265	*/
8266	balance_push_set(cpu, on: false);
8267
8268	/*
8269	* When going up, increment the number of cores with SMT present.
8270	*/
8271	sched_smt_present_inc(cpu);
8272	set_cpu_active(cpu, true);
8273
8274	if (sched_smp_initialized) {
8275	sched_update_numa(cpu, online: true);
8276	sched_domains_numa_masks_set(cpu);
8277	cpuset_cpu_active();
8278	}
8279
8280	scx_rq_activate(rq);
8281
8282	/*
8283	* Put the rq online, if not already. This happens:
8284	*
8285	* 1) In the early boot process, because we build the real domains
8286	* after all CPUs have been brought up.
8287	*
8288	* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
8289	* domains.
8290	*/
8291	sched_set_rq_online(rq, cpu);
8292
8293	return `0`;
8294	}
8295
8296	int sched_cpu_deactivate(unsigned int cpu)
8297	{
8298	struct rq *rq = cpu_rq(cpu);
8299	int ret;
8300
8301	ret = dl_bw_deactivate(cpu);
8302
8303	if (ret)
8304	return ret;
8305
8306	/*
8307	* Remove CPU from nohz.idle_cpus_mask to prevent participating in
8308	* load balancing when not active
8309	*/
8310	nohz_balance_exit_idle(rq);
8311
8312	set_cpu_active(cpu, false);
8313
8314	/*
8315	* From this point forward, this CPU will refuse to run any task that
8316	* is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
8317	* push those tasks away until this gets cleared, see
8318	* sched_cpu_dying().
8319	*/
8320	balance_push_set(cpu, on: true);
8321
8322	/*
8323	* We've cleared cpu_active_mask / set balance_push, wait for all
8324	* preempt-disabled and RCU users of this state to go away such that
8325	* all new such users will observe it.
8326	*
8327	* Specifically, we rely on ttwu to no longer target this CPU, see
8328	* ttwu_queue_cond() and is_cpu_allowed().
8329	*
8330	* Do sync before park smpboot threads to take care the RCU boost case.
8331	*/
8332	synchronize_rcu();
8333
8334	sched_set_rq_offline(rq, cpu);
8335
8336	scx_rq_deactivate(rq);
8337
8338	/*
8339	* When going down, decrement the number of cores with SMT present.
8340	*/
8341	sched_smt_present_dec(cpu);
8342
8343	#ifdef CONFIG_SCHED_SMT
8344	sched_core_cpu_deactivate(cpu);
8345	#endif
8346
8347	if (!sched_smp_initialized)
8348	return `0`;
8349
8350	sched_update_numa(cpu, online: false);
8351	cpuset_cpu_inactive(cpu);
8352	sched_domains_numa_masks_clear(cpu);
8353	return `0`;
8354	}
8355
8356	static void sched_rq_cpu_starting(unsigned int cpu)
8357	{
8358	struct rq *rq = cpu_rq(cpu);
8359
8360	rq->calc_load_update = calc_load_update;
8361	update_max_interval();
8362	}
8363
/*
 * sched_cpu_starting - scheduler setup for a CPU that is starting
 * @cpu: the starting CPU
 *
 * Runs the core-scheduling and runqueue start hooks and starts the
 * scheduler tick for @cpu. Always returns 0.
 */
int sched_cpu_starting(unsigned int cpu)
{
	sched_core_cpu_starting(cpu);
	sched_rq_cpu_starting(cpu);
	sched_tick_start(cpu);
	return 0;
}
8371
8372	#ifdef CONFIG_HOTPLUG_CPU
8373
/*
 * Invoked immediately before the stopper thread is invoked to bring the
 * CPU down completely. At this point all per CPU kthreads except the
 * hotplug thread (current) and the stopper thread (inactive) have been
 * either parked or have been unbound from the outgoing CPU. Ensure that
 * any of those which might be on the way out are gone.
 *
 * If after this point a bound task is being woken on this CPU then the
 * responsible hotplug callback has failed to do its job.
 * sched_cpu_dying() will catch it with the appropriate fireworks.
 *
 * @cpu is not consulted; the wait operates on the current CPU's runqueue.
 * Always returns 0.
 */
int sched_cpu_wait_empty(unsigned int cpu)
{
	balance_hotplug_wait();
	sched_force_init_mm();
	return 0;
}
8391
8392	/*
8393	* Since this CPU is going 'away' for a while, fold any nr_active delta we
8394	* might have. Called from the CPU stopper task after ensuring that the
8395	* stopper is the last running task on the CPU, so nr_active count is
8396	* stable. We need to take the tear-down thread which is calling this into
8397	* account, so we hand in adjust = 1 to the load calculation.
8398	*
8399	* Also see the comment "Global load-average calculations".
8400	*/
8401	static void calc_load_migrate(struct rq *rq)
8402	{
8403	long delta = calc_load_fold_active(this_rq: rq, adjust: `1`);
8404
8405	if (delta)
8406	atomic_long_add(i: delta, v: &calc_load_tasks);
8407	}
8408
8409	static void dump_rq_tasks(struct rq rq, const* char *loglvl)
8410	{
8411	struct task_struct g, p;
8412	int cpu = cpu_of(rq);
8413
8414	lockdep_assert_rq_held(rq);
8415
8416	printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
8417	for_each_process_thread(g, p) {
8418	if (task_cpu(p) != cpu)
8419	continue;
8420
8421	if (!task_on_rq_queued(p))
8422	continue;
8423
8424	printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
8425	}
8426	}
8427
8428	int sched_cpu_dying(unsigned int cpu)
8429	{
8430	struct rq *rq = cpu_rq(cpu);
8431	struct rq_flags rf;
8432
8433	/ Handle pending wakeups and then migrate everything off /
8434	sched_tick_stop(cpu);
8435
8436	rq_lock_irqsave(rq, rf: &rf);
8437	if (rq->nr_running != `1` \|\| rq_has_pinned_tasks(rq)) {
8438	WARN(true, "Dying CPU not properly vacated!");
8439	dump_rq_tasks(rq, KERN_WARNING);
8440	}
8441	rq_unlock_irqrestore(rq, rf: &rf);
8442
8443	calc_load_migrate(rq);
8444	update_max_interval();
8445	hrtick_clear(rq);
8446	sched_core_cpu_dying(cpu);
8447	return `0`;
8448	}
8449	#endif
8450
8451	void __init sched_init_smp(void)
8452	{
8453	sched_init_numa(NUMA_NO_NODE);
8454
8455	/*
8456	* There's no userspace yet to cause hotplug operations; hence all the
8457	* CPU masks are stable and all blatant races in the below code cannot
8458	* happen.
8459	*/
8460	sched_domains_mutex_lock();
8461	sched_init_domains(cpu_active_mask);
8462	sched_domains_mutex_unlock();
8463
8464	/ Move init over to a non-isolated CPU /
8465	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(type: HK_TYPE_DOMAIN)) < `0`)
8466	BUG();
8467	current->flags &= ~PF_NO_SETAFFINITY;
8468	sched_init_granularity();
8469
8470	init_sched_rt_class();
8471	init_sched_dl_class();
8472
8473	sched_smp_initialized = true;
8474	}
8475
8476	static int __init migration_init(void)
8477	{
8478	sched_cpu_starting(smp_processor_id());
8479	return `0`;
8480	}
8481	early_initcall(migration_init);
8482
8483	#else
/* !CONFIG_SMP variant: only the scheduler granularity needs initialising. */
void __init sched_init_smp(void)
{
sched_init_granularity();
}
8488	#endif /* CONFIG_SMP */
8489
8490	int in_sched_functions(unsigned long addr)
8491	{
8492	return in_lock_functions(addr) \|\|
8493	(addr >= (unsigned long)__sched_text_start
8494	&& addr < (unsigned long)__sched_text_end);
8495	}
8496
8497	#ifdef CONFIG_CGROUP_SCHED
8498	/*
8499	* Default task group.
8500	* Every task in system belongs to this group at bootup.
8501	*/
8502	struct task_group root_task_group;
8503	LIST_HEAD(task_groups);
8504
8505	/ Cacheline aligned slab cache for task_group /
8506	static struct kmem_cache *task_group_cache __ro_after_init;
8507	#endif
8508
8509	void __init sched_init(void)
8510	{
8511	unsigned long ptr = `0`;
8512	int i;
8513
8514	/ Make sure the linker didn't screw up /
8515	#ifdef CONFIG_SMP
8516	BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
8517	#endif
8518	BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
8519	BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
8520	BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
8521	#ifdef CONFIG_SCHED_CLASS_EXT
8522	BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
8523	BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
8524	#endif
8525
8526	wait_bit_init();
8527
8528	#ifdef CONFIG_FAIR_GROUP_SCHED
8529	ptr += `2` * nr_cpu_ids * sizeof(void **);
8530	#endif
8531	#ifdef CONFIG_RT_GROUP_SCHED
8532	ptr += `2` * nr_cpu_ids * sizeof(void **);
8533	#endif
8534	if (ptr) {
8535	ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
8536
8537	#ifdef CONFIG_FAIR_GROUP_SCHED
8538	root_task_group.se = (struct sched_entity **)ptr;
8539	ptr += nr_cpu_ids * sizeof(void **);
8540
8541	root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8542	ptr += nr_cpu_ids * sizeof(void **);
8543
8544	root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8545	init_cfs_bandwidth(cfs_b: &root_task_group.cfs_bandwidth, NULL);
8546	#endif /* CONFIG_FAIR_GROUP_SCHED */
8547	#ifdef CONFIG_EXT_GROUP_SCHED
8548	root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
8549	#endif /* CONFIG_EXT_GROUP_SCHED */
8550	#ifdef CONFIG_RT_GROUP_SCHED
8551	root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8552	ptr += nr_cpu_ids * sizeof(void **);
8553
8554	root_task_group.rt_rq = (struct rt_rq **)ptr;
8555	ptr += nr_cpu_ids * sizeof(void **);
8556
8557	#endif /* CONFIG_RT_GROUP_SCHED */
8558	}
8559
8560	#ifdef CONFIG_SMP
8561	init_defrootdomain();
8562	#endif
8563
8564	#ifdef CONFIG_RT_GROUP_SCHED
8565	init_rt_bandwidth(rt_b: &root_task_group.rt_bandwidth,
8566	period: global_rt_period(), runtime: global_rt_runtime());
8567	#endif /* CONFIG_RT_GROUP_SCHED */
8568
8569	#ifdef CONFIG_CGROUP_SCHED
8570	task_group_cache = KMEM_CACHE(task_group, `0`);
8571
8572	list_add(new: &root_task_group.list, head: &task_groups);
8573	INIT_LIST_HEAD(list: &root_task_group.children);
8574	INIT_LIST_HEAD(list: &root_task_group.siblings);
8575	autogroup_init(init_task: &init_task);
8576	#endif /* CONFIG_CGROUP_SCHED */
8577
8578	for_each_possible_cpu(i) {
8579	struct rq *rq;
8580
8581	rq = cpu_rq(i);
8582	raw_spin_lock_init(&rq->__lock);
8583	rq->nr_running = `0`;
8584	rq->calc_load_active = `0`;
8585	rq->calc_load_update = jiffies + LOAD_FREQ;
8586	init_cfs_rq(cfs_rq: &rq->cfs);
8587	init_rt_rq(rt_rq: &rq->rt);
8588	init_dl_rq(dl_rq: &rq->dl);
8589	#ifdef CONFIG_FAIR_GROUP_SCHED
8590	INIT_LIST_HEAD(list: &rq->leaf_cfs_rq_list);
8591	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
8592	/*
8593	* How much CPU bandwidth does root_task_group get?
8594	*
8595	* In case of task-groups formed through the cgroup filesystem, it
8596	* gets 100% of the CPU resources in the system. This overall
8597	* system CPU resource is divided among the tasks of
8598	* root_task_group and its child task-groups in a fair manner,
8599	* based on each entity's (task or task-group's) weight
8600	* (se->load.weight).
8601	*
8602	* In other words, if root_task_group has 10 tasks of weight
8603	* 1024) and two child groups A0 and A1 (of weight 1024 each),
8604	* then A0's share of the CPU resource is:
8605	*
8606	* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8607	*
8608	* We achieve this by letting root_task_group's tasks sit
8609	* directly in rq->cfs (i.e root_task_group->se[] = NULL).
8610	*/
8611	init_tg_cfs_entry(tg: &root_task_group, cfs_rq: &rq->cfs, NULL, cpu: i, NULL);
8612	#endif /* CONFIG_FAIR_GROUP_SCHED */
8613
8614	#ifdef CONFIG_RT_GROUP_SCHED
8615	/*
8616	* This is required for init cpu because rt.c:__enable_runtime()
8617	* starts working after scheduler_running, which is not the case
8618	* yet.
8619	*/
8620	rq->rt.rt_runtime = global_rt_runtime();
8621	init_tg_rt_entry(tg: &root_task_group, rt_rq: &rq->rt, NULL, cpu: i, NULL);
8622	#endif
8623	#ifdef CONFIG_SMP
8624	rq->sd = NULL;
8625	rq->rd = NULL;
8626	rq->cpu_capacity = SCHED_CAPACITY_SCALE;
8627	rq->balance_callback = &balance_push_callback;
8628	rq->active_balance = `0`;
8629	rq->next_balance = jiffies;
8630	rq->push_cpu = `0`;
8631	rq->cpu = i;
8632	rq->online = `0`;
8633	rq->idle_stamp = `0`;
8634	rq->avg_idle = `2`*sysctl_sched_migration_cost;
8635	rq->max_idle_balance_cost = sysctl_sched_migration_cost;
8636
8637	INIT_LIST_HEAD(list: &rq->cfs_tasks);
8638
8639	rq_attach_root(rq, rd: &def_root_domain);
8640	#ifdef CONFIG_NO_HZ_COMMON
8641	rq->last_blocked_load_update_tick = jiffies;
8642	atomic_set(v: &rq->nohz_flags, i: `0`);
8643
8644	INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq);
8645	#endif
8646	#ifdef CONFIG_HOTPLUG_CPU
8647	rcuwait_init(w: &rq->hotplug_wait);
8648	#endif
8649	#endif /* CONFIG_SMP */
8650	hrtick_rq_init(rq);
8651	atomic_set(v: &rq->nr_iowait, i: `0`);
8652	fair_server_init(rq);
8653
8654	#ifdef CONFIG_SCHED_CORE
8655	rq->core = rq;
8656	rq->core_pick = NULL;
8657	rq->core_dl_server = NULL;
8658	rq->core_enabled = `0`;
8659	rq->core_tree = RB_ROOT;
8660	rq->core_forceidle_count = `0`;
8661	rq->core_forceidle_occupation = `0`;
8662	rq->core_forceidle_start = `0`;
8663
8664	rq->core_cookie = `0UL`;
8665	#endif
8666	zalloc_cpumask_var_node(mask: &rq->scratch_mask, GFP_KERNEL, cpu_to_node(cpu: i));
8667	}
8668
8669	set_load_weight(p: &init_task, update_load: false);
8670	init_task.se.slice = sysctl_sched_base_slice,
8671
8672	/*
8673	* The boot idle thread does lazy MMU switching as well:
8674	*/
8675	mmgrab_lazy_tlb(mm: &init_mm);
8676	enter_lazy_tlb(mm: &init_mm, current);
8677
8678	/*
8679	* The idle task doesn't need the kthread struct to function, but it
8680	* is dressed up as a per-CPU kthread and thus needs to play the part
8681	* if we want to avoid special-casing it in code that deals with per-CPU
8682	* kthreads.
8683	*/
8684	WARN_ON(!set_kthread_struct(current));
8685
8686	/*
8687	* Make us the idle thread. Technically, schedule() should not be
8688	* called from this thread, however somewhere below it might be,
8689	* but because we are the idle thread, we just pick up running again
8690	* when this runqueue becomes "idle".
8691	*/
8692	__sched_fork(clone_flags: `0`, current);
8693	init_idle(current, smp_processor_id());
8694
8695	calc_load_update = jiffies + LOAD_FREQ;
8696
8697	#ifdef CONFIG_SMP
8698	idle_thread_set_boot_cpu();
8699	balance_push_set(smp_processor_id(), on: false);
8700	#endif
8701	init_sched_fair_class();
8702	init_sched_ext_class();
8703
8704	psi_init();
8705
8706	init_uclamp();
8707
8708	preempt_dynamic_init();
8709
8710	scheduler_running = `1`;
8711	}
8712
8713	#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8714
8715	void __might_sleep(const char file, int* line)
8716	{
8717	unsigned int state = get_current_state();
8718	/*
8719	* Blocking primitives will set (and therefore destroy) current->state,
8720	* since we will exit with TASK_RUNNING make sure we enter with it,
8721	* otherwise we will destroy state.
8722	*/
8723	WARN_ONCE(state != TASK_RUNNING && current->task_state_change,
8724	"do not call blocking ops when !TASK_RUNNING; "
8725	"state=%x set at [<%p>] %pS\n", state,
8726	(void *)current->task_state_change,
8727	(void *)current->task_state_change);
8728
8729	__might_resched(file, line, offsets: `0`);
8730	}
8731	EXPORT_SYMBOL(__might_sleep);
8732
8733	static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
8734	{
8735	if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
8736	return;
8737
8738	if (preempt_count() == preempt_offset)
8739	return;
8740
8741	pr_err("Preemption disabled at:");
8742	print_ip_sym(KERN_ERR, ip);
8743	}
8744
8745	static inline bool resched_offsets_ok(unsigned int offsets)
8746	{
8747	unsigned int nested = preempt_count();
8748
8749	nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
8750
8751	return nested == offsets;
8752	}
8753
8754	void __might_resched(const char file, int* line, unsigned int offsets)
8755	{
8756	/ Ratelimiting timestamp: /
8757	static unsigned long prev_jiffy;
8758
8759	unsigned long preempt_disable_ip;
8760
8761	/ WARN_ON_ONCE() by default, no rate limit required: /
8762	rcu_sleep_check();
8763
8764	if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
8765	!is_idle_task(current) && !current->non_block_count) \|\|
8766	system_state == SYSTEM_BOOTING \|\| system_state > SYSTEM_RUNNING \|\|
8767	oops_in_progress)
8768	return;
8769
8770	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8771	return;
8772	prev_jiffy = jiffies;
8773
8774	/ Save this before calling printk(), since that will clobber it: /
8775	preempt_disable_ip = get_preempt_disable_ip(current);
8776
8777	pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
8778	file, line);
8779	pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
8780	in_atomic(), irqs_disabled(), current->non_block_count,
8781	current->pid, current->comm);
8782	pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
8783	offsets & MIGHT_RESCHED_PREEMPT_MASK);
8784
8785	if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
8786	pr_err("RCU nest depth: %d, expected: %u\n",
8787	rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
8788	}
8789
8790	if (task_stack_end_corrupted(current))
8791	pr_emerg("Thread overran stack, or stack corrupted\n");
8792
8793	debug_show_held_locks(current);
8794	if (irqs_disabled())
8795	print_irqtrace_events(current);
8796
8797	print_preempt_disable_ip(preempt_offset: offsets & MIGHT_RESCHED_PREEMPT_MASK,
8798	ip: preempt_disable_ip);
8799
8800	dump_stack();
8801	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8802	}
8803	EXPORT_SYMBOL(__might_resched);
8804
8805	void __cant_sleep(const char file, int* line, int preempt_offset)
8806	{
8807	static unsigned long prev_jiffy;
8808
8809	if (irqs_disabled())
8810	return;
8811
8812	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8813	return;
8814
8815	if (preempt_count() > preempt_offset)
8816	return;
8817
8818	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8819	return;
8820	prev_jiffy = jiffies;
8821
8822	printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8823	printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8824	in_atomic(), irqs_disabled(),
8825	current->pid, current->comm);
8826
8827	debug_show_held_locks(current);
8828	dump_stack();
8829	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8830	}
8831	EXPORT_SYMBOL_GPL(__cant_sleep);
8832
8833	#ifdef CONFIG_SMP
8834	void __cant_migrate(const char file, int* line)
8835	{
8836	static unsigned long prev_jiffy;
8837
8838	if (irqs_disabled())
8839	return;
8840
8841	if (is_migration_disabled(current))
8842	return;
8843
8844	if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
8845	return;
8846
8847	if (preempt_count() > `0`)
8848	return;
8849
8850	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8851	return;
8852	prev_jiffy = jiffies;
8853
8854	pr_err("BUG: assuming non migratable context at %s:%d\n", file, line);
8855	pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n",
8856	in_atomic(), irqs_disabled(), is_migration_disabled(current),
8857	current->pid, current->comm);
8858
8859	debug_show_held_locks(current);
8860	dump_stack();
8861	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8862	}
8863	EXPORT_SYMBOL_GPL(__cant_migrate);
8864	#endif
8865	#endif
8866
8867	#ifdef CONFIG_MAGIC_SYSRQ
8868	void normalize_rt_tasks(void)
8869	{
8870	struct task_struct g, p;
8871	struct sched_attr attr = {
8872	.sched_policy = SCHED_NORMAL,
8873	};
8874
8875	read_lock(&tasklist_lock);
8876	for_each_process_thread(g, p) {
8877	/*
8878	* Only normalize user tasks:
8879	*/
8880	if (p->flags & PF_KTHREAD)
8881	continue;
8882
8883	p->se.exec_start = `0`;
8884	schedstat_set(p->stats.wait_start, `0`);
8885	schedstat_set(p->stats.sleep_start, `0`);
8886	schedstat_set(p->stats.block_start, `0`);
8887
8888	if (!rt_or_dl_task(p)) {
8889	/*
8890	* Renice negative nice level userspace
8891	* tasks back to 0:
8892	*/
8893	if (task_nice(p) < `0`)
8894	set_user_nice(p, nice: `0`);
8895	continue;
8896	}
8897
8898	__sched_setscheduler(p, attr: &attr, user: false, pi: false);
8899	}
8900	read_unlock(&tasklist_lock);
8901	}
8902
8903	#endif /* CONFIG_MAGIC_SYSRQ */
8904
8905	#if defined(CONFIG_KGDB_KDB)
8906	/*
8907	* These functions are only useful for KDB.
8908	*
8909	* They can only be called when the whole system has been
8910	* stopped - every CPU needs to be quiescent, and no scheduling
8911	* activity can take place. Using them for anything else would
8912	* be a serious bug, and as a result, they aren't even visible
8913	* under any other configuration.
8914	*/
8915
/**
 * curr_task - look up the task currently running on a CPU.
 * @cpu: the processor to query.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 *
 * Return: The task that cpu_curr() reports for @cpu.
 */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *running = cpu_curr(cpu);

	return running;
}
8928
8929	#endif /* defined(CONFIG_KGDB_KDB) */
8930
8931	#ifdef CONFIG_CGROUP_SCHED
/* Serializes adding task groups to, and removing them from, the lists */
static DEFINE_SPINLOCK(task_group_lock);
8934
/*
 * Initialize the utilization clamps of a new task group @tg: every clamp's
 * requested value starts at uclamp_none() (not user defined) and its
 * effective value is copied from @parent. No-op without
 * CONFIG_UCLAMP_TASK_GROUP.
 */
static inline void alloc_uclamp_sched_group(struct task_group *tg,
					    struct task_group *parent)
{
#ifdef CONFIG_UCLAMP_TASK_GROUP
	enum uclamp_id id;

	for_each_clamp_id(id) {
		tg->uclamp[id] = parent->uclamp[id];
		uclamp_se_set(&tg->uclamp_req[id], uclamp_none(id), false);
	}
#endif
}
8948
8949	static void sched_free_group(struct task_group *tg)
8950	{
8951	free_fair_sched_group(tg);
8952	free_rt_sched_group(tg);
8953	autogroup_free(tg);
8954	kmem_cache_free(s: task_group_cache, objp: tg);
8955	}
8956
8957	static void sched_free_group_rcu(struct rcu_head *rcu)
8958	{
8959	sched_free_group(container_of(rcu, struct task_group, rcu));
8960	}
8961
8962	static void sched_unregister_group(struct task_group *tg)
8963	{
8964	unregister_fair_sched_group(tg);
8965	unregister_rt_sched_group(tg);
8966	/*
8967	* We have to wait for yet another RCU grace period to expire, as
8968	* print_cfs_stats() might run concurrently.
8969	*/
8970	call_rcu(head: &tg->rcu, func: sched_free_group_rcu);
8971	}
8972
8973	/ allocate runqueue etc for a new task group /
8974	struct task_group sched_create_group(struct* task_group *parent)
8975	{
8976	struct task_group *tg;
8977
8978	tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL \| __GFP_ZERO);
8979	if (!tg)
8980	return ERR_PTR(error: -ENOMEM);
8981
8982	if (!alloc_fair_sched_group(tg, parent))
8983	goto err;
8984
8985	if (!alloc_rt_sched_group(tg, parent))
8986	goto err;
8987
8988	scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
8989	alloc_uclamp_sched_group(tg, parent);
8990
8991	return tg;
8992
8993	err:
8994	sched_free_group(tg);
8995	return ERR_PTR(error: -ENOMEM);
8996	}
8997
8998	void sched_online_group(struct task_group tg, struct* task_group *parent)
8999	{
9000	unsigned long flags;
9001
9002	spin_lock_irqsave(&task_group_lock, flags);
9003	list_add_tail_rcu(new: &tg->list, head: &task_groups);
9004
9005	/ Root should already exist: /
9006	WARN_ON(!parent);
9007
9008	tg->parent = parent;
9009	INIT_LIST_HEAD(list: &tg->children);
9010	list_add_rcu(new: &tg->siblings, head: &parent->children);
9011	spin_unlock_irqrestore(lock: &task_group_lock, flags);
9012
9013	online_fair_sched_group(tg);
9014	}
9015
9016	/ RCU callback to free various structures associated with a task group /
9017	static void sched_unregister_group_rcu(struct rcu_head *rhp)
9018	{
9019	/ Now it should be safe to free those cfs_rqs: /
9020	sched_unregister_group(container_of(rhp, struct task_group, rcu));
9021	}
9022
9023	void sched_destroy_group(struct task_group *tg)
9024	{
9025	/ Wait for possible concurrent references to cfs_rqs complete: /
9026	call_rcu(head: &tg->rcu, func: sched_unregister_group_rcu);
9027	}
9028
9029	void sched_release_group(struct task_group *tg)
9030	{
9031	unsigned long flags;
9032
9033	/*
9034	* Unlink first, to avoid walk_tg_tree_from() from finding us (via
9035	* sched_cfs_period_timer()).
9036	*
9037	* For this to be effective, we have to wait for all pending users of
9038	* this task group to leave their RCU critical section to ensure no new
9039	* user will see our dying task group any more. Specifically ensure
9040	* that tg_unthrottle_up() won't add decayed cfs_rq's to it.
9041	*
9042	* We therefore defer calling unregister_fair_sched_group() to
9043	* sched_unregister_group() which is guarantied to get called only after the
9044	* current RCU grace period has expired.
9045	*/
9046	spin_lock_irqsave(&task_group_lock, flags);
9047	list_del_rcu(entry: &tg->list);
9048	list_del_rcu(entry: &tg->siblings);
9049	spin_unlock_irqrestore(lock: &task_group_lock, flags);
9050	}
9051
9052	static void sched_change_group(struct task_struct *tsk)
9053	{
9054	struct task_group *tg;
9055
9056	/*
9057	* All callers are synchronized by task_rq_lock(); we do not use RCU
9058	* which is pointless here. Thus, we pass "true" to task_css_check()
9059	* to prevent lockdep warnings.
9060	*/
9061	tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
9062	struct task_group, css);
9063	tg = autogroup_task_group(p: tsk, tg);
9064	tsk->sched_task_group = tg;
9065
9066	#ifdef CONFIG_FAIR_GROUP_SCHED
9067	if (tsk->sched_class->task_change_group)
9068	tsk->sched_class->task_change_group(tsk);
9069	else
9070	#endif
9071	set_task_rq(p: tsk, cpu: task_cpu(p: tsk));
9072	}
9073
9074	/*
9075	* Change task's runqueue when it moves between groups.
9076	*
9077	* The caller of this function should have put the task in its new group by
9078	* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
9079	* its new group.
9080	*/
9081	void sched_move_task(struct task_struct *tsk, bool for_autogroup)
9082	{
9083	int queued, running, queue_flags =
9084	DEQUEUE_SAVE \| DEQUEUE_MOVE \| DEQUEUE_NOCLOCK;
9085	struct rq *rq;
9086
9087	CLASS(task_rq_lock, rq_guard)(l: tsk);
9088	rq = rq_guard.rq;
9089
9090	update_rq_clock(rq);
9091
9092	running = task_current_donor(rq, p: tsk);
9093	queued = task_on_rq_queued(p: tsk);
9094
9095	if (queued)
9096	dequeue_task(rq, p: tsk, flags: queue_flags);
9097	if (running)
9098	put_prev_task(rq, prev: tsk);
9099
9100	sched_change_group(tsk);
9101	if (!for_autogroup)
9102	scx_cgroup_move_task(p: tsk);
9103
9104	if (queued)
9105	enqueue_task(rq, p: tsk, flags: queue_flags);
9106	if (running) {
9107	set_next_task(rq, next: tsk);
9108	/*
9109	* After changing group, the running task may have joined a
9110	* throttled one but it's still the running task. Trigger a
9111	* resched to make sure that task can still run.
9112	*/
9113	resched_curr(rq);
9114	}
9115	}
9116
9117	static struct cgroup_subsys_state *
9118	cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
9119	{
9120	struct task_group *parent = css_tg(css: parent_css);
9121	struct task_group *tg;
9122
9123	if (!parent) {
9124	/ This is early initialization for the top cgroup /
9125	return &root_task_group.css;
9126	}
9127
9128	tg = sched_create_group(parent);
9129	if (IS_ERR(ptr: tg))
9130	return ERR_PTR(error: -ENOMEM);
9131
9132	return &tg->css;
9133	}
9134
9135	/ Expose task group only after completing cgroup initialization /
9136	static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
9137	{
9138	struct task_group *tg = css_tg(css);
9139	struct task_group *parent = css_tg(css: css->parent);
9140	int ret;
9141
9142	ret = scx_tg_online(tg);
9143	if (ret)
9144	return ret;
9145
9146	if (parent)
9147	sched_online_group(tg, parent);
9148
9149	#ifdef CONFIG_UCLAMP_TASK_GROUP
9150	/ Propagate the effective uclamp value for the new group /
9151	guard(mutex)(T: &uclamp_mutex);
9152	guard(rcu)();
9153	cpu_util_update_eff(css);
9154	#endif
9155
9156	return `0`;
9157	}
9158
/* css offline callback: forward the task group to scx_tg_offline(). */
static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	scx_tg_offline(css_tg(css));
}
9165
/* css released callback: unlink the task group via sched_release_group(). */
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
	sched_release_group(css_tg(css));
}
9172
/* css free callback: unregister the task group (freed later via RCU). */
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
{
	/*
	 * Relies on the RCU grace period between css_released() and this.
	 */
	sched_unregister_group(css_tg(css));
}
9182
/*
 * Decide whether the tasks in @tset may be attached.
 *
 * With RT group scheduling configured and enabled, every task must pass
 * sched_rt_can_attach() for its destination group, else -EINVAL is
 * returned. The final verdict comes from scx_cgroup_can_attach().
 */
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	if (rt_group_sched_enabled()) {
		cgroup_taskset_for_each(task, css, tset) {
			if (!sched_rt_can_attach(css_tg(css), task))
				return -EINVAL;
		}
	}
#endif /* CONFIG_RT_GROUP_SCHED */
	return scx_cgroup_can_attach(tset);
}
9200
9201	static void cpu_cgroup_attach(struct cgroup_taskset *tset)
9202	{
9203	struct task_struct *task;
9204	struct cgroup_subsys_state *css;
9205
9206	cgroup_taskset_for_each(task, css, tset)
9207	sched_move_task(tsk: task, for_autogroup: false);
9208
9209	scx_cgroup_finish_attach();
9210	}
9211
/* Cancel-attach callback: simply delegates to scx_cgroup_cancel_attach(). */
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
	scx_cgroup_cancel_attach(tset);
}
9216
9217	#ifdef CONFIG_UCLAMP_TASK_GROUP
9218	static void cpu_util_update_eff(struct cgroup_subsys_state *css)
9219	{
9220	struct cgroup_subsys_state *top_css = css;
9221	struct uclamp_se *uc_parent = NULL;
9222	struct uclamp_se *uc_se = NULL;
9223	unsigned int eff[UCLAMP_CNT];
9224	enum uclamp_id clamp_id;
9225	unsigned int clamps;
9226
9227	lockdep_assert_held(&uclamp_mutex);
9228	WARN_ON_ONCE(!rcu_read_lock_held());
9229
9230	css_for_each_descendant_pre(css, top_css) {
9231	uc_parent = css_tg(css)->parent
9232	? css_tg(css)->parent->uclamp : NULL;
9233
9234	for_each_clamp_id(clamp_id) {
9235	/ Assume effective clamps matches requested clamps /
9236	eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
9237	/ Cap effective clamps with parent's effective clamps /
9238	if (uc_parent &&
9239	eff[clamp_id] > uc_parent[clamp_id].value) {
9240	eff[clamp_id] = uc_parent[clamp_id].value;
9241	}
9242	}
9243	/ Ensure protection is always capped by limit /
9244	eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
9245
9246	/ Propagate most restrictive effective clamps /
9247	clamps = `0x0`;
9248	uc_se = css_tg(css)->uclamp;
9249	for_each_clamp_id(clamp_id) {
9250	if (eff[clamp_id] == uc_se[clamp_id].value)
9251	continue;
9252	uc_se[clamp_id].value = eff[clamp_id];
9253	uc_se[clamp_id].bucket_id = uclamp_bucket_id(clamp_value: eff[clamp_id]);
9254	clamps \|= (`0x1` << clamp_id);
9255	}
9256	if (!clamps) {
9257	css = css_rightmost_descendant(pos: css);
9258	continue;
9259	}
9260
9261	/ Immediately update descendants RUNNABLE tasks /
9262	uclamp_update_active_tasks(css);
9263	}
9264	}
9265
/*
 * POW10(N) yields the integer 10^N by casting the floating literal "1eN" to
 * unsigned int. The literal is built by token pasting, so a second macro
 * level is needed: it lets an argument that is itself a macro get expanded
 * to its value before the paste happens.
 */
#define _POW10(n) ((unsigned int)1e##n)
#define POW10(n) _POW10(n)
9273
9274	struct uclamp_request {
9275	#define UCLAMP_PERCENT_SHIFT 2
9276	#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
9277	s64 percent;
9278	u64 util;
9279	int ret;
9280	};
9281
9282	static inline struct uclamp_request
9283	capacity_from_percent(char *buf)
9284	{
9285	struct uclamp_request req = {
9286	.percent = UCLAMP_PERCENT_SCALE,
9287	.util = SCHED_CAPACITY_SCALE,
9288	.ret = `0`,
9289	};
9290
9291	buf = strim(buf);
9292	if (strcmp(buf, "max")) {
9293	req.ret = cgroup_parse_float(input: buf, UCLAMP_PERCENT_SHIFT,
9294	v: &req.percent);
9295	if (req.ret)
9296	return req;
9297	if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
9298	req.ret = -ERANGE;
9299	return req;
9300	}
9301
9302	req.util = req.percent << SCHED_CAPACITY_SHIFT;
9303	req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
9304	}
9305
9306	return req;
9307	}
9308
9309	static ssize_t cpu_uclamp_write(struct kernfs_open_file of, char* *buf,
9310	size_t nbytes, loff_t off,
9311	enum uclamp_id clamp_id)
9312	{
9313	struct uclamp_request req;
9314	struct task_group *tg;
9315
9316	req = capacity_from_percent(buf);
9317	if (req.ret)
9318	return req.ret;
9319
9320	sched_uclamp_enable();
9321
9322	guard(mutex)(T: &uclamp_mutex);
9323	guard(rcu)();
9324
9325	tg = css_tg(css: of_css(of));
9326	if (tg->uclamp_req[clamp_id].value != req.util)
9327	uclamp_se_set(uc_se: &tg->uclamp_req[clamp_id], value: req.util, user_defined: false);
9328
9329	/*
9330	* Because of not recoverable conversion rounding we keep track of the
9331	* exact requested value
9332	*/
9333	tg->uclamp_pct[clamp_id] = req.percent;
9334
9335	/ Update effective clamps to track the most restrictive value /
9336	cpu_util_update_eff(css: of_css(of));
9337
9338	return nbytes;
9339	}
9340
9341	static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
9342	char *buf, size_t nbytes,
9343	loff_t off)
9344	{
9345	return cpu_uclamp_write(of, buf, nbytes, off, clamp_id: UCLAMP_MIN);
9346	}
9347
9348	static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
9349	char *buf, size_t nbytes,
9350	loff_t off)
9351	{
9352	return cpu_uclamp_write(of, buf, nbytes, off, clamp_id: UCLAMP_MAX);
9353	}
9354
9355	static inline void cpu_uclamp_print(struct seq_file *sf,
9356	enum uclamp_id clamp_id)
9357	{
9358	struct task_group *tg;
9359	u64 util_clamp;
9360	u64 percent;
9361	u32 rem;
9362
9363	scoped_guard (rcu) {
9364	tg = css_tg(css: seq_css(seq: sf));
9365	util_clamp = tg->uclamp_req[clamp_id].value;
9366	}
9367
9368	if (util_clamp == SCHED_CAPACITY_SCALE) {
9369	seq_puts(m: sf, s: "max\n");
9370	return;
9371	}
9372
9373	percent = tg->uclamp_pct[clamp_id];
9374	percent = div_u64_rem(dividend: percent, POW10(UCLAMP_PERCENT_SHIFT), remainder: &rem);
9375	seq_printf(m: sf, fmt: "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
9376	}
9377
9378	static int cpu_uclamp_min_show(struct seq_file sf, void* *v)
9379	{
9380	cpu_uclamp_print(sf, clamp_id: UCLAMP_MIN);
9381	return `0`;
9382	}
9383
9384	static int cpu_uclamp_max_show(struct seq_file sf, void* *v)
9385	{
9386	cpu_uclamp_print(sf, clamp_id: UCLAMP_MAX);
9387	return `0`;
9388	}
9389	#endif /* CONFIG_UCLAMP_TASK_GROUP */
9390
9391	#ifdef CONFIG_GROUP_SCHED_WEIGHT
9392	static unsigned long tg_weight(struct task_group *tg)
9393	{
9394	#ifdef CONFIG_FAIR_GROUP_SCHED
9395	return scale_load_down(tg->shares);
9396	#else
9397	return sched_weight_from_cgroup(tg->scx_weight);
9398	#endif
9399	}
9400
9401	static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
9402	struct cftype *cftype, u64 shareval)
9403	{
9404	int ret;
9405
9406	if (shareval > scale_load_down(ULONG_MAX))
9407	shareval = MAX_SHARES;
9408	ret = sched_group_set_shares(tg: css_tg(css), scale_load(shareval));
9409	if (!ret)
9410	scx_group_set_weight(tg: css_tg(css),
9411	cgrp_weight: sched_weight_to_cgroup(weight: shareval));
9412	return ret;
9413	}
9414
9415	static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
9416	struct cftype *cft)
9417	{
9418	return tg_weight(tg: css_tg(css));
9419	}
9420	#endif /* CONFIG_GROUP_SCHED_WEIGHT */
9421
9422	#ifdef CONFIG_CFS_BANDWIDTH
9423	static DEFINE_MUTEX(cfs_constraints_mutex);
9424
9425	const u64 max_cfs_quota_period = `1` * NSEC_PER_SEC; / 1s /
9426	static const u64 min_cfs_quota_period = `1` * NSEC_PER_MSEC; / 1ms /
9427	/ More than 203 days if BW_SHIFT equals 20. /
9428	static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
9429
9430	static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9431
9432	static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
9433	u64 burst)
9434	{
9435	int i, ret = `0`, runtime_enabled, runtime_was_enabled;
9436	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9437
9438	if (tg == &root_task_group)
9439	return -EINVAL;
9440
9441	/*
9442	* Ensure we have at some amount of bandwidth every period. This is
9443	* to prevent reaching a state of large arrears when throttled via
9444	* entity_tick() resulting in prolonged exit starvation.
9445	*/
9446	if (quota < min_cfs_quota_period \|\| period < min_cfs_quota_period)
9447	return -EINVAL;
9448
9449	/*
9450	* Likewise, bound things on the other side by preventing insane quota
9451	* periods. This also allows us to normalize in computing quota
9452	* feasibility.
9453	*/
9454	if (period > max_cfs_quota_period)
9455	return -EINVAL;
9456
9457	/*
9458	* Bound quota to defend quota against overflow during bandwidth shift.
9459	*/
9460	if (quota != RUNTIME_INF && quota > max_cfs_runtime)
9461	return -EINVAL;
9462
9463	if (quota != RUNTIME_INF && (burst > quota \|\|
9464	burst + quota > max_cfs_runtime))
9465	return -EINVAL;
9466
9467	/*
9468	* Prevent race between setting of cfs_rq->runtime_enabled and
9469	* unthrottle_offline_cfs_rqs().
9470	*/
9471	guard(cpus_read_lock)();
9472	guard(mutex)(T: &cfs_constraints_mutex);
9473
9474	ret = __cfs_schedulable(tg, period, runtime: quota);
9475	if (ret)
9476	return ret;
9477
9478	runtime_enabled = quota != RUNTIME_INF;
9479	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
9480	/*
9481	* If we need to toggle cfs_bandwidth_used, off->on must occur
9482	* before making related changes, and on->off must occur afterwards
9483	*/
9484	if (runtime_enabled && !runtime_was_enabled)
9485	cfs_bandwidth_usage_inc();
9486
9487	scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
9488	cfs_b->period = ns_to_ktime(ns: period);
9489	cfs_b->quota = quota;
9490	cfs_b->burst = burst;
9491
9492	__refill_cfs_bandwidth_runtime(cfs_b);
9493
9494	/*
9495	* Restart the period timer (if active) to handle new
9496	* period expiry:
9497	*/
9498	if (runtime_enabled)
9499	start_cfs_bandwidth(cfs_b);
9500	}
9501
9502	for_each_online_cpu(i) {
9503	struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9504	struct rq *rq = cfs_rq->rq;
9505
9506	guard(rq_lock_irq)(l: rq);
9507	cfs_rq->runtime_enabled = runtime_enabled;
9508	cfs_rq->runtime_remaining = `0`;
9509
9510	if (cfs_rq->throttled)
9511	unthrottle_cfs_rq(cfs_rq);
9512	}
9513
9514	if (runtime_was_enabled && !runtime_enabled)
9515	cfs_bandwidth_usage_dec();
9516
9517	return `0`;
9518	}
9519
9520	static int tg_set_cfs_quota(struct task_group tg, long* cfs_quota_us)
9521	{
9522	u64 quota, period, burst;
9523
9524	period = ktime_to_ns(kt: tg->cfs_bandwidth.period);
9525	burst = tg->cfs_bandwidth.burst;
9526	if (cfs_quota_us < `0`)
9527	quota = RUNTIME_INF;
9528	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
9529	quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9530	else
9531	return -EINVAL;
9532
9533	return tg_set_cfs_bandwidth(tg, period, quota, burst);
9534	}
9535
9536	static long tg_get_cfs_quota(struct task_group *tg)
9537	{
9538	u64 quota_us;
9539
9540	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9541	return -`1`;
9542
9543	quota_us = tg->cfs_bandwidth.quota;
9544	do_div(quota_us, NSEC_PER_USEC);
9545
9546	return quota_us;
9547	}
9548
9549	static int tg_set_cfs_period(struct task_group tg, long* cfs_period_us)
9550	{
9551	u64 quota, period, burst;
9552
9553	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
9554	return -EINVAL;
9555
9556	period = (u64)cfs_period_us * NSEC_PER_USEC;
9557	quota = tg->cfs_bandwidth.quota;
9558	burst = tg->cfs_bandwidth.burst;
9559
9560	return tg_set_cfs_bandwidth(tg, period, quota, burst);
9561	}
9562
9563	static long tg_get_cfs_period(struct task_group *tg)
9564	{
9565	u64 cfs_period_us;
9566
9567	cfs_period_us = ktime_to_ns(kt: tg->cfs_bandwidth.period);
9568	do_div(cfs_period_us, NSEC_PER_USEC);
9569
9570	return cfs_period_us;
9571	}
9572
9573	static int tg_set_cfs_burst(struct task_group tg, long* cfs_burst_us)
9574	{
9575	u64 quota, period, burst;
9576
9577	if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC)
9578	return -EINVAL;
9579
9580	burst = (u64)cfs_burst_us * NSEC_PER_USEC;
9581	period = ktime_to_ns(kt: tg->cfs_bandwidth.period);
9582	quota = tg->cfs_bandwidth.quota;
9583
9584	return tg_set_cfs_bandwidth(tg, period, quota, burst);
9585	}
9586
9587	static long tg_get_cfs_burst(struct task_group *tg)
9588	{
9589	u64 burst_us;
9590
9591	burst_us = tg->cfs_bandwidth.burst;
9592	do_div(burst_us, NSEC_PER_USEC);
9593
9594	return burst_us;
9595	}
9596
9597	static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
9598	struct cftype *cft)
9599	{
9600	return tg_get_cfs_quota(tg: css_tg(css));
9601	}
9602
9603	static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
9604	struct cftype *cftype, s64 cfs_quota_us)
9605	{
9606	return tg_set_cfs_quota(tg: css_tg(css), cfs_quota_us);
9607	}
9608
9609	static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
9610	struct cftype *cft)
9611	{
9612	return tg_get_cfs_period(tg: css_tg(css));
9613	}
9614
9615	static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
9616	struct cftype *cftype, u64 cfs_period_us)
9617	{
9618	return tg_set_cfs_period(tg: css_tg(css), cfs_period_us);
9619	}
9620
9621	static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css,
9622	struct cftype *cft)
9623	{
9624	return tg_get_cfs_burst(tg: css_tg(css));
9625	}
9626
9627	static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css,
9628	struct cftype *cftype, u64 cfs_burst_us)
9629	{
9630	return tg_set_cfs_burst(tg: css_tg(css), cfs_burst_us);
9631	}
9632
9633	struct cfs_schedulable_data {
9634	struct task_group *tg;
9635	u64 period, quota;
9636	};
9637
9638	/*
9639	* normalize group quota/period to be quota/max_period
9640	* note: units are usecs
9641	*/
9642	static u64 normalize_cfs_quota(struct task_group *tg,
9643	struct cfs_schedulable_data *d)
9644	{
9645	u64 quota, period;
9646
9647	if (tg == d->tg) {
9648	period = d->period;
9649	quota = d->quota;
9650	} else {
9651	period = tg_get_cfs_period(tg);
9652	quota = tg_get_cfs_quota(tg);
9653	}
9654
9655	/ note: these should typically be equivalent /
9656	if (quota == RUNTIME_INF \|\| quota == -`1`)
9657	return RUNTIME_INF;
9658
9659	return to_ratio(period, runtime: quota);
9660	}
9661
9662	static int tg_cfs_schedulable_down(struct task_group tg, void* *data)
9663	{
9664	struct cfs_schedulable_data *d = data;
9665	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9666	s64 quota = `0`, parent_quota = -`1`;
9667
9668	if (!tg->parent) {
9669	quota = RUNTIME_INF;
9670	} else {
9671	struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9672
9673	quota = normalize_cfs_quota(tg, d);
9674	parent_quota = parent_b->hierarchical_quota;
9675
9676	/*
9677	* Ensure max(child_quota) <= parent_quota. On cgroup2,
9678	* always take the non-RUNTIME_INF min. On cgroup1, only
9679	* inherit when no limit is set. In both cases this is used
9680	* by the scheduler to determine if a given CFS task has a
9681	* bandwidth constraint at some higher level.
9682	*/
9683	if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
9684	if (quota == RUNTIME_INF)
9685	quota = parent_quota;
9686	else if (parent_quota != RUNTIME_INF)
9687	quota = min(quota, parent_quota);
9688	} else {
9689	if (quota == RUNTIME_INF)
9690	quota = parent_quota;
9691	else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9692	return -EINVAL;
9693	}
9694	}
9695	cfs_b->hierarchical_quota = quota;
9696
9697	return `0`;
9698	}
9699
9700	static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9701	{
9702	struct cfs_schedulable_data data = {
9703	.tg = tg,
9704	.period = period,
9705	.quota = quota,
9706	};
9707
9708	if (quota != RUNTIME_INF) {
9709	do_div(data.period, NSEC_PER_USEC);
9710	do_div(data.quota, NSEC_PER_USEC);
9711	}
9712
9713	guard(rcu)();
9714	return walk_tg_tree(down: tg_cfs_schedulable_down, up: tg_nop, data: &data);
9715	}
9716
9717	static int cpu_cfs_stat_show(struct seq_file sf, void* *v)
9718	{
9719	struct task_group *tg = css_tg(css: seq_css(seq: sf));
9720	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9721
9722	seq_printf(m: sf, fmt: "nr_periods %d\n", cfs_b->nr_periods);
9723	seq_printf(m: sf, fmt: "nr_throttled %d\n", cfs_b->nr_throttled);
9724	seq_printf(m: sf, fmt: "throttled_time %llu\n", cfs_b->throttled_time);
9725
9726	if (schedstat_enabled() && tg != &root_task_group) {
9727	struct sched_statistics *stats;
9728	u64 ws = `0`;
9729	int i;
9730
9731	for_each_possible_cpu(i) {
9732	stats = __schedstats_from_se(se: tg->se[i]);
9733	ws += schedstat_val(stats->wait_sum);
9734	}
9735
9736	seq_printf(m: sf, fmt: "wait_sum %llu\n", ws);
9737	}
9738
9739	seq_printf(m: sf, fmt: "nr_bursts %d\n", cfs_b->nr_burst);
9740	seq_printf(m: sf, fmt: "burst_time %llu\n", cfs_b->burst_time);
9741
9742	return `0`;
9743	}
9744
9745	static u64 throttled_time_self(struct task_group *tg)
9746	{
9747	int i;
9748	u64 total = `0`;
9749
9750	for_each_possible_cpu(i) {
9751	total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
9752	}
9753
9754	return total;
9755	}
9756
/*
 * seq_show handler for the legacy "stat.local" file: print the group's own
 * throttled time as returned by throttled_time_self().
 */
static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));

	seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));

	return 0;
}
9765	#endif /* CONFIG_CFS_BANDWIDTH */
9766
9767	#ifdef CONFIG_RT_GROUP_SCHED
9768	static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
9769	struct cftype *cft, s64 val)
9770	{
9771	return sched_group_set_rt_runtime(tg: css_tg(css), rt_runtime_us: val);
9772	}
9773
9774	static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
9775	struct cftype *cft)
9776	{
9777	return sched_group_rt_runtime(tg: css_tg(css));
9778	}
9779
9780	static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
9781	struct cftype *cftype, u64 rt_period_us)
9782	{
9783	return sched_group_set_rt_period(tg: css_tg(css), rt_period_us);
9784	}
9785
9786	static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
9787	struct cftype *cft)
9788	{
9789	return sched_group_rt_period(tg: css_tg(css));
9790	}
9791	#endif /* CONFIG_RT_GROUP_SCHED */
9792
9793	#ifdef CONFIG_GROUP_SCHED_WEIGHT
9794	static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
9795	struct cftype *cft)
9796	{
9797	return css_tg(css)->idle;
9798	}
9799
9800	static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
9801	struct cftype *cft, s64 idle)
9802	{
9803	int ret;
9804
9805	ret = sched_group_set_idle(tg: css_tg(css), idle);
9806	if (!ret)
9807	scx_group_set_idle(tg: css_tg(css), idle);
9808	return ret;
9809	}
9810	#endif
9811
9812	static struct cftype cpu_legacy_files[] = {
9813	#ifdef CONFIG_GROUP_SCHED_WEIGHT
9814	{
9815	.name = "shares",
9816	.read_u64 = cpu_shares_read_u64,
9817	.write_u64 = cpu_shares_write_u64,
9818	},
9819	{
9820	.name = "idle",
9821	.read_s64 = cpu_idle_read_s64,
9822	.write_s64 = cpu_idle_write_s64,
9823	},
9824	#endif
9825	#ifdef CONFIG_CFS_BANDWIDTH
9826	{
9827	.name = "cfs_quota_us",
9828	.read_s64 = cpu_cfs_quota_read_s64,
9829	.write_s64 = cpu_cfs_quota_write_s64,
9830	},
9831	{
9832	.name = "cfs_period_us",
9833	.read_u64 = cpu_cfs_period_read_u64,
9834	.write_u64 = cpu_cfs_period_write_u64,
9835	},
9836	{
9837	.name = "cfs_burst_us",
9838	.read_u64 = cpu_cfs_burst_read_u64,
9839	.write_u64 = cpu_cfs_burst_write_u64,
9840	},
9841	{
9842	.name = "stat",
9843	.seq_show = cpu_cfs_stat_show,
9844	},
9845	{
9846	.name = "stat.local",
9847	.seq_show = cpu_cfs_local_stat_show,
9848	},
9849	#endif
9850	#ifdef CONFIG_UCLAMP_TASK_GROUP
9851	{
9852	.name = "uclamp.min",
9853	.flags = CFTYPE_NOT_ON_ROOT,
9854	.seq_show = cpu_uclamp_min_show,
9855	.write = cpu_uclamp_min_write,
9856	},
9857	{
9858	.name = "uclamp.max",
9859	.flags = CFTYPE_NOT_ON_ROOT,
9860	.seq_show = cpu_uclamp_max_show,
9861	.write = cpu_uclamp_max_write,
9862	},
9863	#endif
9864	{ } / Terminate /
9865	};
9866
9867	#ifdef CONFIG_RT_GROUP_SCHED
9868	static struct cftype rt_group_files[] = {
9869	{
9870	.name = "rt_runtime_us",
9871	.read_s64 = cpu_rt_runtime_read,
9872	.write_s64 = cpu_rt_runtime_write,
9873	},
9874	{
9875	.name = "rt_period_us",
9876	.read_u64 = cpu_rt_period_read_uint,
9877	.write_u64 = cpu_rt_period_write_uint,
9878	},
9879	{ } / Terminate /
9880	};
9881
/*
 * rt_group_sched: static key flipped by the "rt_group_sched=" boot parameter
 * (see setup_rt_group_sched()).  Its compile-time default is false when
 * CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED is set and true otherwise.
 */
# ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED
DEFINE_STATIC_KEY_FALSE(rt_group_sched);
# else
DEFINE_STATIC_KEY_TRUE(rt_group_sched);
# endif
9887
9888	static int __init setup_rt_group_sched(char *str)
9889	{
9890	long val;
9891
9892	if (kstrtol(s: str, base: `0`, res: &val) \|\| val < `0` \|\| val > `1`) {
9893	pr_warn("Unable to set rt_group_sched\n");
9894	return `1`;
9895	}
9896	if (val)
9897	static_branch_enable(&rt_group_sched);
9898	else
9899	static_branch_disable(&rt_group_sched);
9900
9901	return `1`;
9902	}
9903	__setup("rt_group_sched=", setup_rt_group_sched);
9904
9905	static int __init cpu_rt_group_init(void)
9906	{
9907	if (!rt_group_sched_enabled())
9908	return `0`;
9909
9910	WARN_ON(cgroup_add_legacy_cftypes(&cpu_cgrp_subsys, rt_group_files));
9911	return `0`;
9912	}
9913	subsys_initcall(cpu_rt_group_init);
9914	#endif /* CONFIG_RT_GROUP_SCHED */
9915
/*
 * css_extra_stat_show callback: with CONFIG_CFS_BANDWIDTH, print the group's
 * bandwidth counters, with throttled and burst time converted to usecs.
 * Without it nothing is printed.  Always returns 0.
 */
static int cpu_extra_stat_show(struct seq_file *sf,
			       struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
	{
		struct task_group *tg = css_tg(css);
		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
		u64 throttled_usec, burst_usec;

		throttled_usec = cfs_b->throttled_time;
		do_div(throttled_usec, NSEC_PER_USEC);
		burst_usec = cfs_b->burst_time;
		do_div(burst_usec, NSEC_PER_USEC);

		seq_printf(sf, "nr_periods %d\n"
			   "nr_throttled %d\n"
			   "throttled_usec %llu\n"
			   "nr_bursts %d\n"
			   "burst_usec %llu\n",
			   cfs_b->nr_periods, cfs_b->nr_throttled,
			   throttled_usec, cfs_b->nr_burst, burst_usec);
	}
#endif
	return 0;
}
9941
/*
 * css_local_stat_show callback: with CONFIG_CFS_BANDWIDTH, print the group's
 * own throttled time (throttled_time_self()) in usecs.  Always returns 0.
 */
static int cpu_local_stat_show(struct seq_file *sf,
			       struct cgroup_subsys_state *css)
{
#ifdef CONFIG_CFS_BANDWIDTH
	{
		struct task_group *tg = css_tg(css);
		u64 throttled_self_usec;

		throttled_self_usec = throttled_time_self(tg);
		do_div(throttled_self_usec, NSEC_PER_USEC);

		seq_printf(sf, "throttled_usec %llu\n",
			   throttled_self_usec);
	}
#endif
	return 0;
}
9959
9960	#ifdef CONFIG_GROUP_SCHED_WEIGHT
9961
9962	static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
9963	struct cftype *cft)
9964	{
9965	return sched_weight_to_cgroup(weight: tg_weight(tg: css_tg(css)));
9966	}
9967
9968	static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
9969	struct cftype *cft, u64 cgrp_weight)
9970	{
9971	unsigned long weight;
9972	int ret;
9973
9974	if (cgrp_weight < CGROUP_WEIGHT_MIN \|\| cgrp_weight > CGROUP_WEIGHT_MAX)
9975	return -ERANGE;
9976
9977	weight = sched_weight_from_cgroup(cgrp_weight);
9978
9979	ret = sched_group_set_shares(tg: css_tg(css), scale_load(weight));
9980	if (!ret)
9981	scx_group_set_weight(tg: css_tg(css), cgrp_weight);
9982	return ret;
9983	}
9984
9985	static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
9986	struct cftype *cft)
9987	{
9988	unsigned long weight = tg_weight(tg: css_tg(css));
9989	int last_delta = INT_MAX;
9990	int prio, delta;
9991
9992	/ find the closest nice value to the current weight /
9993	for (prio = `0`; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
9994	delta = abs(sched_prio_to_weight[prio] - weight);
9995	if (delta >= last_delta)
9996	break;
9997	last_delta = delta;
9998	}
9999
10000	return PRIO_TO_NICE(prio - `1` + MAX_RT_PRIO);
10001	}
10002
10003	static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
10004	struct cftype *cft, s64 nice)
10005	{
10006	unsigned long weight;
10007	int idx, ret;
10008
10009	if (nice < MIN_NICE \|\| nice > MAX_NICE)
10010	return -ERANGE;
10011
10012	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
10013	idx = array_index_nospec(idx, `40`);
10014	weight = sched_prio_to_weight[idx];
10015
10016	ret = sched_group_set_shares(tg: css_tg(css), scale_load(weight));
10017	if (!ret)
10018	scx_group_set_weight(tg: css_tg(css),
10019	cgrp_weight: sched_weight_to_cgroup(weight));
10020	return ret;
10021	}
10022	#endif /* CONFIG_GROUP_SCHED_WEIGHT */
10023
10024	static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
10025	long period, long quota)
10026	{
10027	if (quota < `0`)
10028	seq_puts(m: sf, s: "max");
10029	else
10030	seq_printf(m: sf, fmt: "%ld", quota);
10031
10032	seq_printf(m: sf, fmt: " %ld\n", period);
10033	}
10034
10035	/ caller should put the current value in @periodp before calling /*
10036	static int __maybe_unused cpu_period_quota_parse(char *buf,
10037	u64 periodp, u64 quotap)
10038	{
10039	char tok[`21`]; / U64_MAX /
10040
10041	if (sscanf(buf, "%20s %llu", tok, periodp) < `1`)
10042	return -EINVAL;
10043
10044	periodp = NSEC_PER_USEC;
10045
10046	if (sscanf(tok, "%llu", quotap))
10047	quotap = NSEC_PER_USEC;
10048	else if (!strcmp(tok, "max"))
10049	*quotap = RUNTIME_INF;
10050	else
10051	return -EINVAL;
10052
10053	return `0`;
10054	}
10055
10056	#ifdef CONFIG_CFS_BANDWIDTH
/* seq_show handler for "max": print the group's quota and period in usecs. */
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));

	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
	return 0;
}
10064
10065	static ssize_t cpu_max_write(struct kernfs_open_file *of,
10066	char *buf, size_t nbytes, loff_t off)
10067	{
10068	struct task_group *tg = css_tg(css: of_css(of));
10069	u64 period = tg_get_cfs_period(tg);
10070	u64 burst = tg->cfs_bandwidth.burst;
10071	u64 quota;
10072	int ret;
10073
10074	ret = cpu_period_quota_parse(buf, periodp: &period, quotap: &quota);
10075	if (!ret)
10076	ret = tg_set_cfs_bandwidth(tg, period, quota, burst);
10077	return ret ?: nbytes;
10078	}
10079	#endif
10080
10081	static struct cftype cpu_files[] = {
10082	#ifdef CONFIG_GROUP_SCHED_WEIGHT
10083	{
10084	.name = "weight",
10085	.flags = CFTYPE_NOT_ON_ROOT,
10086	.read_u64 = cpu_weight_read_u64,
10087	.write_u64 = cpu_weight_write_u64,
10088	},
10089	{
10090	.name = "weight.nice",
10091	.flags = CFTYPE_NOT_ON_ROOT,
10092	.read_s64 = cpu_weight_nice_read_s64,
10093	.write_s64 = cpu_weight_nice_write_s64,
10094	},
10095	{
10096	.name = "idle",
10097	.flags = CFTYPE_NOT_ON_ROOT,
10098	.read_s64 = cpu_idle_read_s64,
10099	.write_s64 = cpu_idle_write_s64,
10100	},
10101	#endif
10102	#ifdef CONFIG_CFS_BANDWIDTH
10103	{
10104	.name = "max",
10105	.flags = CFTYPE_NOT_ON_ROOT,
10106	.seq_show = cpu_max_show,
10107	.write = cpu_max_write,
10108	},
10109	{
10110	.name = "max.burst",
10111	.flags = CFTYPE_NOT_ON_ROOT,
10112	.read_u64 = cpu_cfs_burst_read_u64,
10113	.write_u64 = cpu_cfs_burst_write_u64,
10114	},
10115	#endif
10116	#ifdef CONFIG_UCLAMP_TASK_GROUP
10117	{
10118	.name = "uclamp.min",
10119	.flags = CFTYPE_NOT_ON_ROOT,
10120	.seq_show = cpu_uclamp_min_show,
10121	.write = cpu_uclamp_min_write,
10122	},
10123	{
10124	.name = "uclamp.max",
10125	.flags = CFTYPE_NOT_ON_ROOT,
10126	.seq_show = cpu_uclamp_max_show,
10127	.write = cpu_uclamp_max_write,
10128	},
10129	#endif
10130	{ } / terminate /
10131	};
10132
10133	struct cgroup_subsys cpu_cgrp_subsys = {
10134	.css_alloc = cpu_cgroup_css_alloc,
10135	.css_online = cpu_cgroup_css_online,
10136	.css_offline = cpu_cgroup_css_offline,
10137	.css_released = cpu_cgroup_css_released,
10138	.css_free = cpu_cgroup_css_free,
10139	.css_extra_stat_show = cpu_extra_stat_show,
10140	.css_local_stat_show = cpu_local_stat_show,
10141	.can_attach = cpu_cgroup_can_attach,
10142	.attach = cpu_cgroup_attach,
10143	.cancel_attach = cpu_cgroup_cancel_attach,
10144	.legacy_cftypes = cpu_legacy_files,
10145	.dfl_cftypes = cpu_files,
10146	.early_init = true,
10147	.threaded = true,
10148	};
10149
10150	#endif /* CONFIG_CGROUP_SCHED */
10151
10152	void dump_cpu_task(int cpu)
10153	{
10154	if (in_hardirq() && cpu == smp_processor_id()) {
10155	struct pt_regs *regs;
10156
10157	regs = get_irq_regs();
10158	if (regs) {
10159	show_regs(regs);
10160	return;
10161	}
10162	}
10163
10164	if (trigger_single_cpu_backtrace(cpu))
10165	return;
10166
10167	pr_info("Task dump for CPU %d:\n", cpu);
10168	sched_show_task(cpu_curr(cpu));
10169	}
10170
/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 *
 * Index 0 corresponds to nice -20 and index 39 to nice 19.
 */
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};
10193
10194	/*
10195	* Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated.
10196	*
10197	* In cases where the weight does not change often, we can use the
10198	* pre-calculated inverse to speed up arithmetics by turning divisions
10199	* into multiplications:
10200	*/
10201	const u32 sched_prio_to_wmult[`40`] = {
10202	/ -20 / `48388`, `59856`, `76040`, `92818`, `118348`,
10203	/ -15 / `147320`, `184698`, `229616`, `287308`, `360437`,
10204	/ -10 / `449829`, `563644`, `704093`, `875809`, `1099582`,
10205	/ -5 / `1376151`, `1717300`, `2157191`, `2708050`, `3363326`,
10206	/ 0 / `4194304`, `5237765`, `6557202`, `8165337`, `10153587`,
10207	/ 5 / `12820798`, `15790321`, `19976592`, `24970740`, `31350126`,
10208	/ 10 / `39045157`, `49367440`, `61356676`, `76695844`, `95443717`,
10209	/ 15 / `119304647`, `148102320`, `186737708`, `238609294`, `286331153`,
10210	};
10211
/* Out-of-line wrapper that fires the sched_update_nr_running tracepoint. */
void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
	trace_sched_update_nr_running_tp(rq, count);
}
10216
10217	#ifdef CONFIG_SCHED_MM_CID
10218
/*
 * @cid_lock: Guarantee forward-progress of cid allocation.
 *
 * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
 * is only used when contention is detected by the lock-free allocation so
 * forward progress can be guaranteed.
 */
DEFINE_RAW_SPINLOCK(cid_lock);
10227
/*
 * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
 *
 * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
 * detected, it is set to 1 to ensure that all newly coming allocations are
 * serialized by @cid_lock until the allocation which detected contention
 * completes and sets @use_cid_lock back to 0. This guarantees forward progress
 * of a cid allocation.
 */
int use_cid_lock;
10238
10239	/*
10240	* mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
10241	* concurrently with respect to the execution of the source runqueue context
10242	* switch.
10243	*
 * There is one basic property we want to guarantee here:
10245	*
10246	* (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
10247	* used by a task. That would lead to concurrent allocation of the cid and
10248	* userspace corruption.
10249	*
10250	* Provide this guarantee by introducing a Dekker memory ordering to guarantee
10251	* that a pair of loads observe at least one of a pair of stores, which can be
10252	* shown as:
10253	*
 *	X = Y = 0
 *
 *	w[X]=1		w[Y]=1
 *	MB		MB
 *	r[Y]=y		r[X]=x
10259	*
10260	* Which guarantees that x==0 && y==0 is impossible. But rather than using
10261	* values 0 and 1, this algorithm cares about specific state transitions of the
10262	* runqueue current task (as updated by the scheduler context switch), and the
10263	* per-mm/cpu cid value.
10264	*
10265	* Let's introduce task (Y) which has task->mm == mm and task (N) which has
10266	* task->mm != mm for the rest of the discussion. There are two scheduler state
10267	* transitions on context switch we care about:
10268	*
10269	* (TSA) Store to rq->curr with transition from (N) to (Y)
10270	*
10271	* (TSB) Store to rq->curr with transition from (Y) to (N)
10272	*
10273	* On the remote-clear side, there is one transition we care about:
10274	*
10275	* (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
10276	*
10277	* There is also a transition to UNSET state which can be performed from all
10278	* sides (scheduler, remote-clear). It is always performed with a cmpxchg which
10279	* guarantees that only a single thread will succeed:
10280	*
10281	* (TMB) cmpxchg to *pcpu_cid to mark UNSET
10282	*
10283	* Just to be clear, what we do _not_ want to happen is a transition to UNSET
10284	* when a thread is actively using the cid (property (1)).
10285	*
 * Let's look at the relevant combinations of TSA/TSB, and TMA transitions.
10287	*
10288	* Scenario A) (TSA)+(TMA) (from next task perspective)
10289	*
 *       CPU0                                      CPU1
 *
 *       Context switch CS-1                       Remote-clear
 *         - store to rq->curr: (N)->(Y) (TSA)     - cmpxchg to *pcpu_cid to LAZY (TMA)
 *                                                   (implied barrier after cmpxchg)
 *         - switch_mm_cid()
 *           - memory barrier (see switch_mm_cid()
 *             comment explaining how this barrier
 *             is combined with other scheduler
 *             barriers)
 *           - mm_cid_get (next)
 *             - READ_ONCE(*pcpu_cid)              - rcu_dereference(src_rq->curr)
10302	*
10303	* This Dekker ensures that either task (Y) is observed by the
10304	* rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
10305	* observed.
10306	*
10307	* If task (Y) store is observed by rcu_dereference(), it means that there is
10308	* still an active task on the cpu. Remote-clear will therefore not transition
10309	* to UNSET, which fulfills property (1).
10310	*
10311	* If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
10312	* it will move its state to UNSET, which clears the percpu cid perhaps
10313	* uselessly (which is not an issue for correctness). Because task (Y) is not
10314	* observed, CPU1 can move ahead to set the state to UNSET. Because moving
10315	* state to UNSET is done with a cmpxchg expecting that the old state has the
10316	* LAZY flag set, only one thread will successfully UNSET.
10317	*
10318	* If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
10319	* will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
10320	* CPU1 will observe task (Y) and do nothing more, which is fine.
10321	*
10322	* What we are effectively preventing with this Dekker is a scenario where
10323	* neither LAZY flag nor store (Y) are observed, which would fail property (1)
10324	* because this would UNSET a cid which is actively used.
10325	*/
10326
10327	void sched_mm_cid_migrate_from(struct task_struct *t)
10328	{
10329	t->migrate_from_cpu = task_cpu(p: t);
10330	}
10331
10332	static
10333	int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
10334	struct task_struct *t,
10335	struct mm_cid *src_pcpu_cid)
10336	{
10337	struct mm_struct *mm = t->mm;
10338	struct task_struct *src_task;
10339	int src_cid, last_mm_cid;
10340
10341	if (!mm)
10342	return -`1`;
10343
10344	last_mm_cid = t->last_mm_cid;
10345	/*
10346	* If the migrated task has no last cid, or if the current
10347	* task on src rq uses the cid, it means the source cid does not need
10348	* to be moved to the destination cpu.
10349	*/
10350	if (last_mm_cid == -`1`)
10351	return -`1`;
10352	src_cid = READ_ONCE(src_pcpu_cid->cid);
10353	if (!mm_cid_is_valid(cid: src_cid) \|\| last_mm_cid != src_cid)
10354	return -`1`;
10355
10356	/*
10357	* If we observe an active task using the mm on this rq, it means we
10358	* are not the last task to be migrated from this cpu for this mm, so
10359	* there is no need to move src_cid to the destination cpu.
10360	*/
10361	guard(rcu)();
10362	src_task = rcu_dereference(src_rq->curr);
10363	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
10364	t->last_mm_cid = -`1`;
10365	return -`1`;
10366	}
10367
10368	return src_cid;
10369	}
10370
10371	static
10372	int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
10373	struct task_struct *t,
10374	struct mm_cid *src_pcpu_cid,
10375	int src_cid)
10376	{
10377	struct task_struct *src_task;
10378	struct mm_struct *mm = t->mm;
10379	int lazy_cid;
10380
10381	if (src_cid == -`1`)
10382	return -`1`;
10383
10384	/*
10385	* Attempt to clear the source cpu cid to move it to the destination
10386	* cpu.
10387	*/
10388	lazy_cid = mm_cid_set_lazy_put(cid: src_cid);
10389	if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
10390	return -`1`;
10391
10392	/*
10393	* The implicit barrier after cmpxchg per-mm/cpu cid before loading
10394	* rq->curr->mm matches the scheduler barrier in context_switch()
10395	* between store to rq->curr and load of prev and next task's
10396	* per-mm/cpu cid.
10397	*
10398	* The implicit barrier after cmpxchg per-mm/cpu cid before loading
10399	* rq->curr->mm_cid_active matches the barrier in
10400	* sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
10401	* sched_mm_cid_after_execve() between store to t->mm_cid_active and
10402	* load of per-mm/cpu cid.
10403	*/
10404
10405	/*
10406	* If we observe an active task using the mm on this rq after setting
10407	* the lazy-put flag, this task will be responsible for transitioning
10408	* from lazy-put flag set to MM_CID_UNSET.
10409	*/
10410	scoped_guard (rcu) {
10411	src_task = rcu_dereference(src_rq->curr);
10412	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
10413	/*
10414	* We observed an active task for this mm, there is therefore
10415	* no point in moving this cid to the destination cpu.
10416	*/
10417	t->last_mm_cid = -`1`;
10418	return -`1`;
10419	}
10420	}
10421
10422	/*
10423	* The src_cid is unused, so it can be unset.
10424	*/
10425	if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
10426	return -`1`;
10427	WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
10428	return src_cid;
10429	}
10430
10431	/*
10432	* Migration to dst cpu. Called with dst_rq lock held.
10433	* Interrupts are disabled, which keeps the window of cid ownership without the
10434	* source rq lock held small.
10435	*/
10436	void sched_mm_cid_migrate_to(struct rq dst_rq, struct* task_struct *t)
10437	{
10438	struct mm_cid src_pcpu_cid, dst_pcpu_cid;
10439	struct mm_struct *mm = t->mm;
10440	int src_cid, src_cpu;
10441	bool dst_cid_is_set;
10442	struct rq *src_rq;
10443
10444	lockdep_assert_rq_held(rq: dst_rq);
10445
10446	if (!mm)
10447	return;
10448	src_cpu = t->migrate_from_cpu;
10449	if (src_cpu == -`1`) {
10450	t->last_mm_cid = -`1`;
10451	return;
10452	}
10453	/*
10454	* Move the src cid if the dst cid is unset. This keeps id
10455	* allocation closest to 0 in cases where few threads migrate around
10456	* many CPUs.
10457	*
10458	* If destination cid or recent cid is already set, we may have
10459	* to just clear the src cid to ensure compactness in frequent
10460	* migrations scenarios.
10461	*
10462	* It is not useful to clear the src cid when the number of threads is
10463	* greater or equal to the number of allowed CPUs, because user-space
10464	* can expect that the number of allowed cids can reach the number of
10465	* allowed CPUs.
10466	*/
10467	dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
10468	dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) \|\|
10469	!mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
10470	if (dst_cid_is_set && atomic_read(v: &mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
10471	return;
10472	src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
10473	src_rq = cpu_rq(src_cpu);
10474	src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
10475	if (src_cid == -`1`)
10476	return;
10477	src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
10478	src_cid);
10479	if (src_cid == -`1`)
10480	return;
10481	if (dst_cid_is_set) {
10482	__mm_cid_put(mm, cid: src_cid);
10483	return;
10484	}
10485	/ Move src_cid to dst cpu. /
10486	mm_cid_snapshot_time(rq: dst_rq, mm);
10487	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
10488	WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
10489	}
10490
10491	static void sched_mm_cid_remote_clear(struct mm_struct mm, struct* mm_cid *pcpu_cid,
10492	int cpu)
10493	{
10494	struct rq *rq = cpu_rq(cpu);
10495	struct task_struct *t;
10496	int cid, lazy_cid;
10497
10498	cid = READ_ONCE(pcpu_cid->cid);
10499	if (!mm_cid_is_valid(cid))
10500	return;
10501
10502	/*
10503	* Clear the cpu cid if it is set to keep cid allocation compact. If
10504	* there happens to be other tasks left on the source cpu using this
10505	* mm, the next task using this mm will reallocate its cid on context
10506	* switch.
10507	*/
10508	lazy_cid = mm_cid_set_lazy_put(cid);
10509	if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
10510	return;
10511
10512	/*
10513	* The implicit barrier after cmpxchg per-mm/cpu cid before loading
10514	* rq->curr->mm matches the scheduler barrier in context_switch()
10515	* between store to rq->curr and load of prev and next task's
10516	* per-mm/cpu cid.
10517	*
10518	* The implicit barrier after cmpxchg per-mm/cpu cid before loading
10519	* rq->curr->mm_cid_active matches the barrier in
10520	* sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
10521	* sched_mm_cid_after_execve() between store to t->mm_cid_active and
10522	* load of per-mm/cpu cid.
10523	*/
10524
10525	/*
10526	* If we observe an active task using the mm on this rq after setting
10527	* the lazy-put flag, that task will be responsible for transitioning
10528	* from lazy-put flag set to MM_CID_UNSET.
10529	*/
10530	scoped_guard (rcu) {
10531	t = rcu_dereference(rq->curr);
10532	if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
10533	return;
10534	}
10535
10536	/*
10537	* The cid is unused, so it can be unset.
10538	* Disable interrupts to keep the window of cid ownership without rq
10539	* lock small.
10540	*/
10541	scoped_guard (irqsave) {
10542	if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
10543	__mm_cid_put(mm, cid);
10544	}
10545	}
10546
10547	static void sched_mm_cid_remote_clear_old(struct mm_struct mm, int* cpu)
10548	{
10549	struct rq *rq = cpu_rq(cpu);
10550	struct mm_cid *pcpu_cid;
10551	struct task_struct *curr;
10552	u64 rq_clock;
10553
10554	/*
10555	* rq->clock load is racy on 32-bit but one spurious clear once in a
10556	* while is irrelevant.
10557	*/
10558	rq_clock = READ_ONCE(rq->clock);
10559	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
10560
10561	/*
10562	* In order to take care of infrequently scheduled tasks, bump the time
10563	* snapshot associated with this cid if an active task using the mm is
10564	* observed on this rq.
10565	*/
10566	scoped_guard (rcu) {
10567	curr = rcu_dereference(rq->curr);
10568	if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
10569	WRITE_ONCE(pcpu_cid->time, rq_clock);
10570	return;
10571	}
10572	}
10573
10574	if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
10575	return;
10576	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
10577	}
10578
10579	static void sched_mm_cid_remote_clear_weight(struct mm_struct mm, int* cpu,
10580	int weight)
10581	{
10582	struct mm_cid *pcpu_cid;
10583	int cid;
10584
10585	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
10586	cid = READ_ONCE(pcpu_cid->cid);
10587	if (!mm_cid_is_valid(cid) \|\| cid < weight)
10588	return;
10589	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
10590	}
10591
10592	static void task_mm_cid_work(struct callback_head *work)
10593	{
10594	unsigned long now = jiffies, old_scan, next_scan;
10595	struct task_struct *t = current;
10596	struct cpumask *cidmask;
10597	struct mm_struct *mm;
10598	int weight, cpu;
10599
10600	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
10601
10602	work->next = work; / Prevent double-add /
10603	if (t->flags & PF_EXITING)
10604	return;
10605	mm = t->mm;
10606	if (!mm)
10607	return;
10608	old_scan = READ_ONCE(mm->mm_cid_next_scan);
10609	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
10610	if (!old_scan) {
10611	unsigned long res;
10612
10613	res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
10614	if (res != old_scan)
10615	old_scan = res;
10616	else
10617	old_scan = next_scan;
10618	}
10619	if (time_before(now, old_scan))
10620	return;
10621	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
10622	return;
10623	cidmask = mm_cidmask(mm);
10624	/ Clear cids that were not recently used. /
10625	for_each_possible_cpu(cpu)
10626	sched_mm_cid_remote_clear_old(mm, cpu);
10627	weight = cpumask_weight(srcp: cidmask);
10628	/*
10629	* Clear cids that are greater or equal to the cidmask weight to
10630	* recompact it.
10631	*/
10632	for_each_possible_cpu(cpu)
10633	sched_mm_cid_remote_clear_weight(mm, cpu, weight);
10634	}
10635
10636	void init_sched_mm_cid(struct task_struct *t)
10637	{
10638	struct mm_struct *mm = t->mm;
10639	int mm_users = `0`;
10640
10641	if (mm) {
10642	mm_users = atomic_read(v: &mm->mm_users);
10643	if (mm_users == `1`)
10644	mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
10645	}
10646	t->cid_work.next = &t->cid_work; / Protect against double add /
10647	init_task_work(twork: &t->cid_work, func: task_mm_cid_work);
10648	}
10649
10650	void task_tick_mm_cid(struct rq rq, struct* task_struct *curr)
10651	{
10652	struct callback_head *work = &curr->cid_work;
10653	unsigned long now = jiffies;
10654
10655	if (!curr->mm \|\| (curr->flags & (PF_EXITING \| PF_KTHREAD)) \|\|
10656	work->next != work)
10657	return;
10658	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
10659	return;
10660
10661	/ No page allocation under rq lock /
10662	task_work_add(task: curr, twork: work, mode: TWA_RESUME);
10663	}
10664
10665	void sched_mm_cid_exit_signals(struct task_struct *t)
10666	{
10667	struct mm_struct *mm = t->mm;
10668	struct rq *rq;
10669
10670	if (!mm)
10671	return;
10672
10673	preempt_disable();
10674	rq = this_rq();
10675	guard(rq_lock_irqsave)(l: rq);
10676	preempt_enable_no_resched(); / holding spinlock /
10677	WRITE_ONCE(t->mm_cid_active, `0`);
10678	/*
10679	* Store t->mm_cid_active before loading per-mm/cpu cid.
10680	* Matches barrier in sched_mm_cid_remote_clear_old().
10681	*/
10682	smp_mb();
10683	mm_cid_put(mm);
10684	t->last_mm_cid = t->mm_cid = -`1`;
10685	}
10686
10687	void sched_mm_cid_before_execve(struct task_struct *t)
10688	{
10689	struct mm_struct *mm = t->mm;
10690	struct rq *rq;
10691
10692	if (!mm)
10693	return;
10694
10695	preempt_disable();
10696	rq = this_rq();
10697	guard(rq_lock_irqsave)(l: rq);
10698	preempt_enable_no_resched(); / holding spinlock /
10699	WRITE_ONCE(t->mm_cid_active, `0`);
10700	/*
10701	* Store t->mm_cid_active before loading per-mm/cpu cid.
10702	* Matches barrier in sched_mm_cid_remote_clear_old().
10703	*/
10704	smp_mb();
10705	mm_cid_put(mm);
10706	t->last_mm_cid = t->mm_cid = -`1`;
10707	}
10708
10709	void sched_mm_cid_after_execve(struct task_struct *t)
10710	{
10711	struct mm_struct *mm = t->mm;
10712	struct rq *rq;
10713
10714	if (!mm)
10715	return;
10716
10717	preempt_disable();
10718	rq = this_rq();
10719	scoped_guard (rq_lock_irqsave, rq) {
10720	preempt_enable_no_resched(); / holding spinlock /
10721	WRITE_ONCE(t->mm_cid_active, `1`);
10722	/*
10723	* Store t->mm_cid_active before loading per-mm/cpu cid.
10724	* Matches barrier in sched_mm_cid_remote_clear_old().
10725	*/
10726	smp_mb();
10727	t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
10728	}
10729	}
10730
10731	void sched_mm_cid_fork(struct task_struct *t)
10732	{
10733	WARN_ON_ONCE(!t->mm \|\| t->mm_cid != -`1`);
10734	t->mm_cid_active = `1`;
10735	}
10736	#endif
10737
10738	#ifdef CONFIG_SCHED_CLASS_EXT
10739	void sched_deq_and_put_task(struct task_struct p, int* queue_flags,
10740	struct sched_enq_and_set_ctx *ctx)
10741	{
10742	struct rq *rq = task_rq(p);
10743
10744	lockdep_assert_rq_held(rq);
10745
10746	ctx = (struct* sched_enq_and_set_ctx){
10747	.p = p,
10748	.queue_flags = queue_flags,
10749	.queued = task_on_rq_queued(p),
10750	.running = task_current(rq, p),
10751	};
10752
10753	update_rq_clock(rq);
10754	if (ctx->queued)
10755	dequeue_task(rq, p, queue_flags \| DEQUEUE_NOCLOCK);
10756	if (ctx->running)
10757	put_prev_task(rq, p);
10758	}
10759
10760	void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
10761	{
10762	struct rq *rq = task_rq(ctx->p);
10763
10764	lockdep_assert_rq_held(rq);
10765
10766	if (ctx->queued)
10767	enqueue_task(rq, ctx->p, ctx->queue_flags \| ENQUEUE_NOCLOCK);
10768	if (ctx->running)
10769	set_next_task(rq, ctx->p);
10770	}
10771	#endif /* CONFIG_SCHED_CLASS_EXT */
10772

source code of linux/kernel/sched/core.c